/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_gt.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

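/*
 * Self-referential "hang" fixture: a batch that writes its request's seqno
 * into a per-context slot of a fake HWS page (so we can observe it start
 * executing) and then branches back to its own beginning, spinning until
 * the batch is rewritten with MI_BATCH_BUFFER_END or the engine is reset.
 */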
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

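/*
 * Each fence context gets its own u32 seqno slot within the single HWS
 * page, wrapping modulo the page size (see hws_seqno() for the read side).
 */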
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

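/*
 * Build the hanging request: store the request's seqno to the HWS so that
 * wait_until_running() can detect execution, leave MI_ARB_CHECK as an
 * arbitration point, then loop back to the start of the batch. The closing
 * MI_BATCH_BUFFER_END is only reached once the loop is broken by rewriting
 * the batch (see hang_fini()).
 */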
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = h->ctx->vm ?: &engine->gt->ggtt->vm;
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

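/* Sample the seqno last written to the HWS by this context's spinner. */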
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915, I915_WAIT_LOCKED);
}

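/*
 * Busywait briefly (10us) for the spinner to report its seqno to the HWS,
 * then fall back to a sleeping wait of up to 1s; returns true once the
 * request has been observed running on the GPU.
 */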
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

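/* Wait up to IGT_IDLE_TIMEOUT (in ms) for the engine to become idle. */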
static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	reset_count = i915_reset_count(global);
	count = 0;
	do {
		mutex_lock(&gt->i915->drm.struct_mutex);

		for_each_engine(engine, gt->i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			mutex_lock(&gt->i915->drm.struct_mutex);
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			mutex_unlock(&gt->i915->drm.struct_mutex);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/*
	 * Check that we can issue an engine reset while idle (a no-op) or
	 * while the engine is busy running our hanging batch.
	 */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

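/*
 * Per-engine background load: each active_engine kthread keeps a ring of
 * requests in flight on its engine so that we can verify a reset of the
 * target engine leaves its (innocent) neighbours undisturbed.
 */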
struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

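/*
 * Retire a background request: wait up to 5s for it to complete, wedging
 * the GT on timeout, then drop our reference.
 */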
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

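/*
 * kthread body: cycle through a small pool of contexts, keeping up to
 * ARRAY_SIZE(rq) requests in flight on the engine until asked to stop,
 * optionally randomising context priority when TEST_PRIORITY is set.
 */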
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (count--)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt->i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt->i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt->i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		mutex_lock(&gt->i915->drm.struct_mutex);
		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

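/*
 * Simulate hangcheck firing: perform the reset directly and return the
 * global reset count sampled beforehand, so callers can verify the count
 * ticked over.
 */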
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

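/*
 * Shared argument for the evict kthreads below: they signal @completion
 * once running and then try to evict @vma (or dirty its fence register),
 * expecting to block behind the hanging request until a reset rescues them.
 */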
struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	struct drm_i915_private *i915 = arg->vma->vm->i915;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin_fence(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	i915_vma_unpin_fence(arg->vma);

out_unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->vm) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(gt, ctx->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(gt->i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

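/* Check that every engine except @exclude has settled back to idle. */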
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt->i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&gt->i915->drm.struct_mutex);

	err = hang_init(&h, gt);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&gt->i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	mutex_lock(&gt->i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

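/*
 * Run an engine reset with the submission tasklet disabled and inside one
 * of the igt_atomic_section critical sections (e.g. irqs or preemption
 * disabled), verifying that the reset path never sleeps.
 */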
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);
	mutex_lock(&gt->i915->drm.struct_mutex);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt->i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);

unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(gt->i915))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&gt->hangcheck.work); /* flush param */

	err = intel_gt_live_subtests(tests, gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);

	return err;
}