1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include <linux/kthread.h>
26 
27 #include "gem/i915_gem_context.h"
28 #include "gt/intel_gt.h"
29 #include "intel_engine_pm.h"
30 
31 #include "i915_selftest.h"
32 #include "selftests/i915_random.h"
33 #include "selftests/igt_flush_test.h"
34 #include "selftests/igt_reset.h"
35 #include "selftests/igt_atomic.h"
36 
37 #include "selftests/mock_drm.h"
38 
39 #include "gem/selftests/mock_context.h"
40 #include "gem/selftests/igt_gem_utils.h"
41 
/*
 * Upper bound, in milliseconds, that wait_for_idle() polls for an engine to
 * report idle.
 */
#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
43 
/*
 * Bookkeeping shared by the tests that submit a deliberately looping batch.
 * Set up by hang_init() and released by hang_fini().
 */
struct hang {
	/* GT the hanging requests are submitted to. */
	struct intel_gt *gt;
	/* Page the batch stores its request seqno into (see hws_address()). */
	struct drm_i915_gem_object *hws;
	/* Current batch object; replaced on every hang_create_request(). */
	struct drm_i915_gem_object *obj;
	/* Kernel context the hanging requests are created on. */
	struct i915_gem_context *ctx;
	/* CPU mapping of @hws, read back by hws_seqno(). */
	u32 *seqno;
	/* CPU mapping of @obj; its first dword is rewritten to end the batch. */
	u32 *batch;
};
52 
53 static int hang_init(struct hang *h, struct intel_gt *gt)
54 {
55 	void *vaddr;
56 	int err;
57 
58 	memset(h, 0, sizeof(*h));
59 	h->gt = gt;
60 
61 	mutex_lock(&gt->i915->drm.struct_mutex);
62 	h->ctx = kernel_context(gt->i915);
63 	mutex_unlock(&gt->i915->drm.struct_mutex);
64 	if (IS_ERR(h->ctx))
65 		return PTR_ERR(h->ctx);
66 
67 	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
68 
69 	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
70 	if (IS_ERR(h->hws)) {
71 		err = PTR_ERR(h->hws);
72 		goto err_ctx;
73 	}
74 
75 	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
76 	if (IS_ERR(h->obj)) {
77 		err = PTR_ERR(h->obj);
78 		goto err_hws;
79 	}
80 
81 	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
82 	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
83 	if (IS_ERR(vaddr)) {
84 		err = PTR_ERR(vaddr);
85 		goto err_obj;
86 	}
87 	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
88 
89 	vaddr = i915_gem_object_pin_map(h->obj,
90 					i915_coherent_map_type(gt->i915));
91 	if (IS_ERR(vaddr)) {
92 		err = PTR_ERR(vaddr);
93 		goto err_unpin_hws;
94 	}
95 	h->batch = vaddr;
96 
97 	return 0;
98 
99 err_unpin_hws:
100 	i915_gem_object_unpin_map(h->hws);
101 err_obj:
102 	i915_gem_object_put(h->obj);
103 err_hws:
104 	i915_gem_object_put(h->hws);
105 err_ctx:
106 	kernel_context_close(h->ctx);
107 	return err;
108 }
109 
110 static u64 hws_address(const struct i915_vma *hws,
111 		       const struct i915_request *rq)
112 {
113 	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
114 }
115 
116 static int move_to_active(struct i915_vma *vma,
117 			  struct i915_request *rq,
118 			  unsigned int flags)
119 {
120 	int err;
121 
122 	i915_vma_lock(vma);
123 	err = i915_request_await_object(rq, vma->obj,
124 					flags & EXEC_OBJECT_WRITE);
125 	if (err == 0)
126 		err = i915_vma_move_to_active(vma, rq, flags);
127 	i915_vma_unlock(vma);
128 
129 	return err;
130 }
131 
/*
 * Build a request on @engine whose batch stores the request's seqno into the
 * hws page and then jumps back to the start of the batch.
 *
 * A fresh batch object is allocated and mapped on every call; it replaces
 * (and releases) the previous h->obj / h->batch.
 *
 * On success the request is returned without having been added: the caller
 * is expected to i915_request_add() it. If an error occurs after the request
 * was allocated, the request is skipped and added here before returning.
 *
 * Returns the request or an ERR_PTR().
 */
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = h->ctx->vm ?: &engine->gt->ggtt->vm;
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		return ERR_CAST(vaddr);
	}

	/* Swap the old batch for the new one; @h owns it from here on. */
	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	/*
	 * Every variant below has the same shape: store the seqno into the
	 * hws slot, MI_ARB_CHECK, 1024 bytes of zeroes, MI_ARB_CHECK, then a
	 * MI_BATCH_BUFFER_START pointing back at the start of this batch.
	 * Only the command encodings and address widths differ per gen.
	 */
	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	/* Gen5 and earlier dispatch the batch with I915_DISPATCH_SECURE. */
	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

	/* Also reached on success; the request is only cancelled when err != 0. */
cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}
271 
272 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
273 {
274 	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
275 }
276 
/*
 * Release everything hang_init() (and any later hang_create_request())
 * attached to @h, then flush the device with igt_flush_test().
 */
static void hang_fini(struct hang *h)
{
	/* Overwrite the first dword so the current batch ends instead of looping. */
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}
292 
293 static bool wait_until_running(struct hang *h, struct i915_request *rq)
294 {
295 	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
296 					       rq->fence.seqno),
297 			     10) &&
298 		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
299 					    rq->fence.seqno),
300 			  1000));
301 }
302 
303 static int igt_hang_sanitycheck(void *arg)
304 {
305 	struct intel_gt *gt = arg;
306 	struct i915_request *rq;
307 	struct intel_engine_cs *engine;
308 	enum intel_engine_id id;
309 	struct hang h;
310 	int err;
311 
312 	/* Basic check that we can execute our hanging batch */
313 
314 	err = hang_init(&h, gt);
315 	if (err)
316 		return err;
317 
318 	for_each_engine(engine, gt->i915, id) {
319 		struct intel_wedge_me w;
320 		long timeout;
321 
322 		if (!intel_engine_can_store_dword(engine))
323 			continue;
324 
325 		rq = hang_create_request(&h, engine);
326 		if (IS_ERR(rq)) {
327 			err = PTR_ERR(rq);
328 			pr_err("Failed to create request for %s, err=%d\n",
329 			       engine->name, err);
330 			goto fini;
331 		}
332 
333 		i915_request_get(rq);
334 
335 		*h.batch = MI_BATCH_BUFFER_END;
336 		intel_gt_chipset_flush(engine->gt);
337 
338 		i915_request_add(rq);
339 
340 		timeout = 0;
341 		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
342 			timeout = i915_request_wait(rq, 0,
343 						    MAX_SCHEDULE_TIMEOUT);
344 		if (intel_gt_is_wedged(gt))
345 			timeout = -EIO;
346 
347 		i915_request_put(rq);
348 
349 		if (timeout < 0) {
350 			err = timeout;
351 			pr_err("Wait for request failed on %s, err=%d\n",
352 			       engine->name, err);
353 			goto fini;
354 		}
355 	}
356 
357 fini:
358 	hang_fini(&h);
359 	return err;
360 }
361 
362 static bool wait_for_idle(struct intel_engine_cs *engine)
363 {
364 	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
365 }
366 
367 static int igt_reset_nop(void *arg)
368 {
369 	struct intel_gt *gt = arg;
370 	struct i915_gpu_error *global = &gt->i915->gpu_error;
371 	struct intel_engine_cs *engine;
372 	struct i915_gem_context *ctx;
373 	unsigned int reset_count, count;
374 	enum intel_engine_id id;
375 	struct drm_file *file;
376 	IGT_TIMEOUT(end_time);
377 	int err = 0;
378 
379 	/* Check that we can reset during non-user portions of requests */
380 
381 	file = mock_file(gt->i915);
382 	if (IS_ERR(file))
383 		return PTR_ERR(file);
384 
385 	mutex_lock(&gt->i915->drm.struct_mutex);
386 	ctx = live_context(gt->i915, file);
387 	mutex_unlock(&gt->i915->drm.struct_mutex);
388 	if (IS_ERR(ctx)) {
389 		err = PTR_ERR(ctx);
390 		goto out;
391 	}
392 
393 	i915_gem_context_clear_bannable(ctx);
394 	reset_count = i915_reset_count(global);
395 	count = 0;
396 	do {
397 		for_each_engine(engine, gt->i915, id) {
398 			int i;
399 
400 			for (i = 0; i < 16; i++) {
401 				struct i915_request *rq;
402 
403 				rq = igt_request_alloc(ctx, engine);
404 				if (IS_ERR(rq)) {
405 					err = PTR_ERR(rq);
406 					break;
407 				}
408 
409 				i915_request_add(rq);
410 			}
411 		}
412 
413 		igt_global_reset_lock(gt);
414 		intel_gt_reset(gt, ALL_ENGINES, NULL);
415 		igt_global_reset_unlock(gt);
416 
417 		if (intel_gt_is_wedged(gt)) {
418 			err = -EIO;
419 			break;
420 		}
421 
422 		if (i915_reset_count(global) != reset_count + ++count) {
423 			pr_err("Full GPU reset not recorded!\n");
424 			err = -EINVAL;
425 			break;
426 		}
427 
428 		err = igt_flush_test(gt->i915);
429 		if (err)
430 			break;
431 	} while (time_before(jiffies, end_time));
432 	pr_info("%s: %d resets\n", __func__, count);
433 
434 	err = igt_flush_test(gt->i915);
435 out:
436 	mock_file_free(gt->i915, file);
437 	if (intel_gt_is_wedged(gt))
438 		err = -EIO;
439 	return err;
440 }
441 
442 static int igt_reset_nop_engine(void *arg)
443 {
444 	struct intel_gt *gt = arg;
445 	struct i915_gpu_error *global = &gt->i915->gpu_error;
446 	struct intel_engine_cs *engine;
447 	struct i915_gem_context *ctx;
448 	enum intel_engine_id id;
449 	struct drm_file *file;
450 	int err = 0;
451 
452 	/* Check that we can engine-reset during non-user portions */
453 
454 	if (!intel_has_reset_engine(gt))
455 		return 0;
456 
457 	file = mock_file(gt->i915);
458 	if (IS_ERR(file))
459 		return PTR_ERR(file);
460 
461 	mutex_lock(&gt->i915->drm.struct_mutex);
462 	ctx = live_context(gt->i915, file);
463 	mutex_unlock(&gt->i915->drm.struct_mutex);
464 	if (IS_ERR(ctx)) {
465 		err = PTR_ERR(ctx);
466 		goto out;
467 	}
468 
469 	i915_gem_context_clear_bannable(ctx);
470 	for_each_engine(engine, gt->i915, id) {
471 		unsigned int reset_count, reset_engine_count;
472 		unsigned int count;
473 		IGT_TIMEOUT(end_time);
474 
475 		reset_count = i915_reset_count(global);
476 		reset_engine_count = i915_reset_engine_count(global, engine);
477 		count = 0;
478 
479 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
480 		do {
481 			int i;
482 
483 			if (!wait_for_idle(engine)) {
484 				pr_err("%s failed to idle before reset\n",
485 				       engine->name);
486 				err = -EIO;
487 				break;
488 			}
489 
490 			for (i = 0; i < 16; i++) {
491 				struct i915_request *rq;
492 
493 				rq = igt_request_alloc(ctx, engine);
494 				if (IS_ERR(rq)) {
495 					err = PTR_ERR(rq);
496 					break;
497 				}
498 
499 				i915_request_add(rq);
500 			}
501 			err = intel_engine_reset(engine, NULL);
502 			if (err) {
503 				pr_err("i915_reset_engine failed\n");
504 				break;
505 			}
506 
507 			if (i915_reset_count(global) != reset_count) {
508 				pr_err("Full GPU reset recorded! (engine reset expected)\n");
509 				err = -EINVAL;
510 				break;
511 			}
512 
513 			if (i915_reset_engine_count(global, engine) !=
514 			    reset_engine_count + ++count) {
515 				pr_err("%s engine reset not recorded!\n",
516 				       engine->name);
517 				err = -EINVAL;
518 				break;
519 			}
520 		} while (time_before(jiffies, end_time));
521 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
522 		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
523 
524 		if (err)
525 			break;
526 
527 		err = igt_flush_test(gt->i915);
528 		if (err)
529 			break;
530 	}
531 
532 	err = igt_flush_test(gt->i915);
533 out:
534 	mock_file_free(gt->i915, file);
535 	if (intel_gt_is_wedged(gt))
536 		err = -EIO;
537 	return err;
538 }
539 
/*
 * Repeatedly reset each engine on its own and check the reset accounting.
 *
 * With @active false the engine is reset while idle; with @active true a
 * hanging request is submitted and confirmed to be running before each
 * reset (engines that cannot store a dword are skipped in that mode).
 *
 * After every reset the global reset count must be unchanged and the
 * per-engine reset count must have advanced by exactly one.
 *
 * Returns 0 on success (or when engine resets are unsupported), a negative
 * errno on failure, and -EIO whenever the GT ends up wedged.
 */
static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle or active engine */

	if (!intel_has_reset_engine(gt))
		return 0;

	/* The hang state is only needed, and only initialised, when @active. */
	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		/* Baselines the post-reset counters are compared against. */
		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				/* Only needed to confirm the batch started. */
				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			/* The expected count is bumped once per loop iteration. */
			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}
646 
647 static int igt_reset_idle_engine(void *arg)
648 {
649 	return __igt_reset_engine(arg, false);
650 }
651 
652 static int igt_reset_active_engine(void *arg)
653 {
654 	return __igt_reset_engine(arg, true);
655 }
656 
/*
 * Per-engine slot used by __igt_reset_engines(): describes the background
 * kthread (if any) keeping that engine busy, and the reset count sampled
 * before the test started.
 */
struct active_engine {
	/* Background kthread running active_engine(), or NULL if none. */
	struct task_struct *task;
	/* Engine the kthread submits requests to. */
	struct intel_engine_cs *engine;
	/* i915_reset_engine_count() for this engine, sampled before the test. */
	unsigned long resets;
	/* TEST_* flags of the current phase, as seen by the kthread. */
	unsigned int flags;
};

/* Submit a hanging request on the engine under test before each reset. */
#define TEST_ACTIVE	BIT(0)
/* Run background submission threads on the other engines. */
#define TEST_OTHERS	BIT(1)
/* Also run a background thread on the engine being reset. */
#define TEST_SELF	BIT(2)
/* Raise the hanging context's priority and randomise the background ones. */
#define TEST_PRIORITY	BIT(3)
668 
669 static int active_request_put(struct i915_request *rq)
670 {
671 	int err = 0;
672 
673 	if (!rq)
674 		return 0;
675 
676 	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
677 		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
678 			  rq->engine->name,
679 			  rq->fence.context,
680 			  rq->fence.seqno);
681 		GEM_TRACE_DUMP();
682 
683 		intel_gt_set_wedged(rq->engine->gt);
684 		err = -EIO;
685 	}
686 
687 	i915_request_put(rq);
688 
689 	return err;
690 }
691 
692 static int active_engine(void *data)
693 {
694 	I915_RND_STATE(prng);
695 	struct active_engine *arg = data;
696 	struct intel_engine_cs *engine = arg->engine;
697 	struct i915_request *rq[8] = {};
698 	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
699 	struct drm_file *file;
700 	unsigned long count = 0;
701 	int err = 0;
702 
703 	file = mock_file(engine->i915);
704 	if (IS_ERR(file))
705 		return PTR_ERR(file);
706 
707 	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
708 		mutex_lock(&engine->i915->drm.struct_mutex);
709 		ctx[count] = live_context(engine->i915, file);
710 		mutex_unlock(&engine->i915->drm.struct_mutex);
711 		if (IS_ERR(ctx[count])) {
712 			err = PTR_ERR(ctx[count]);
713 			while (--count)
714 				i915_gem_context_put(ctx[count]);
715 			goto err_file;
716 		}
717 	}
718 
719 	while (!kthread_should_stop()) {
720 		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
721 		struct i915_request *old = rq[idx];
722 		struct i915_request *new;
723 
724 		new = igt_request_alloc(ctx[idx], engine);
725 		if (IS_ERR(new)) {
726 			err = PTR_ERR(new);
727 			break;
728 		}
729 
730 		if (arg->flags & TEST_PRIORITY)
731 			ctx[idx]->sched.priority =
732 				i915_prandom_u32_max_state(512, &prng);
733 
734 		rq[idx] = i915_request_get(new);
735 		i915_request_add(new);
736 
737 		err = active_request_put(old);
738 		if (err)
739 			break;
740 
741 		cond_resched();
742 	}
743 
744 	for (count = 0; count < ARRAY_SIZE(rq); count++) {
745 		int err__ = active_request_put(rq[count]);
746 
747 		/* Keep the first error */
748 		if (!err)
749 			err = err__;
750 	}
751 
752 err_file:
753 	mock_file_free(engine->i915, file);
754 	return err;
755 }
756 
757 static int __igt_reset_engines(struct intel_gt *gt,
758 			       const char *test_name,
759 			       unsigned int flags)
760 {
761 	struct i915_gpu_error *global = &gt->i915->gpu_error;
762 	struct intel_engine_cs *engine, *other;
763 	enum intel_engine_id id, tmp;
764 	struct hang h;
765 	int err = 0;
766 
767 	/* Check that issuing a reset on one engine does not interfere
768 	 * with any other engine.
769 	 */
770 
771 	if (!intel_has_reset_engine(gt))
772 		return 0;
773 
774 	if (flags & TEST_ACTIVE) {
775 		err = hang_init(&h, gt);
776 		if (err)
777 			return err;
778 
779 		if (flags & TEST_PRIORITY)
780 			h.ctx->sched.priority = 1024;
781 	}
782 
783 	for_each_engine(engine, gt->i915, id) {
784 		struct active_engine threads[I915_NUM_ENGINES] = {};
785 		unsigned long device = i915_reset_count(global);
786 		unsigned long count = 0, reported;
787 		IGT_TIMEOUT(end_time);
788 
789 		if (flags & TEST_ACTIVE &&
790 		    !intel_engine_can_store_dword(engine))
791 			continue;
792 
793 		if (!wait_for_idle(engine)) {
794 			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
795 			       engine->name, test_name);
796 			err = -EIO;
797 			break;
798 		}
799 
800 		memset(threads, 0, sizeof(threads));
801 		for_each_engine(other, gt->i915, tmp) {
802 			struct task_struct *tsk;
803 
804 			threads[tmp].resets =
805 				i915_reset_engine_count(global, other);
806 
807 			if (!(flags & TEST_OTHERS))
808 				continue;
809 
810 			if (other == engine && !(flags & TEST_SELF))
811 				continue;
812 
813 			threads[tmp].engine = other;
814 			threads[tmp].flags = flags;
815 
816 			tsk = kthread_run(active_engine, &threads[tmp],
817 					  "igt/%s", other->name);
818 			if (IS_ERR(tsk)) {
819 				err = PTR_ERR(tsk);
820 				goto unwind;
821 			}
822 
823 			threads[tmp].task = tsk;
824 			get_task_struct(tsk);
825 		}
826 
827 		intel_engine_pm_get(engine);
828 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
829 		do {
830 			struct i915_request *rq = NULL;
831 
832 			if (flags & TEST_ACTIVE) {
833 				rq = hang_create_request(&h, engine);
834 				if (IS_ERR(rq)) {
835 					err = PTR_ERR(rq);
836 					break;
837 				}
838 
839 				i915_request_get(rq);
840 				i915_request_add(rq);
841 
842 				if (!wait_until_running(&h, rq)) {
843 					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
844 
845 					pr_err("%s: Failed to start request %llx, at %x\n",
846 					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
847 					intel_engine_dump(engine, &p,
848 							  "%s\n", engine->name);
849 
850 					i915_request_put(rq);
851 					err = -EIO;
852 					break;
853 				}
854 			}
855 
856 			err = intel_engine_reset(engine, NULL);
857 			if (err) {
858 				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
859 				       engine->name, test_name, err);
860 				break;
861 			}
862 
863 			count++;
864 
865 			if (rq) {
866 				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
867 					struct drm_printer p =
868 						drm_info_printer(gt->i915->drm.dev);
869 
870 					pr_err("i915_reset_engine(%s:%s):"
871 					       " failed to complete request after reset\n",
872 					       engine->name, test_name);
873 					intel_engine_dump(engine, &p,
874 							  "%s\n", engine->name);
875 					i915_request_put(rq);
876 
877 					GEM_TRACE_DUMP();
878 					intel_gt_set_wedged(gt);
879 					err = -EIO;
880 					break;
881 				}
882 
883 				i915_request_put(rq);
884 			}
885 
886 			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
887 				struct drm_printer p =
888 					drm_info_printer(gt->i915->drm.dev);
889 
890 				pr_err("i915_reset_engine(%s:%s):"
891 				       " failed to idle after reset\n",
892 				       engine->name, test_name);
893 				intel_engine_dump(engine, &p,
894 						  "%s\n", engine->name);
895 
896 				err = -EIO;
897 				break;
898 			}
899 		} while (time_before(jiffies, end_time));
900 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
901 		intel_engine_pm_put(engine);
902 		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
903 			engine->name, test_name, count);
904 
905 		reported = i915_reset_engine_count(global, engine);
906 		reported -= threads[engine->id].resets;
907 		if (reported != count) {
908 			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
909 			       engine->name, test_name, count, reported);
910 			if (!err)
911 				err = -EINVAL;
912 		}
913 
914 unwind:
915 		for_each_engine(other, gt->i915, tmp) {
916 			int ret;
917 
918 			if (!threads[tmp].task)
919 				continue;
920 
921 			ret = kthread_stop(threads[tmp].task);
922 			if (ret) {
923 				pr_err("kthread for other engine %s failed, err=%d\n",
924 				       other->name, ret);
925 				if (!err)
926 					err = ret;
927 			}
928 			put_task_struct(threads[tmp].task);
929 
930 			if (other->uabi_class != engine->uabi_class &&
931 			    threads[tmp].resets !=
932 			    i915_reset_engine_count(global, other)) {
933 				pr_err("Innocent engine %s was reset (count=%ld)\n",
934 				       other->name,
935 				       i915_reset_engine_count(global, other) -
936 				       threads[tmp].resets);
937 				if (!err)
938 					err = -EINVAL;
939 			}
940 		}
941 
942 		if (device != i915_reset_count(global)) {
943 			pr_err("Global reset (count=%ld)!\n",
944 			       i915_reset_count(global) - device);
945 			if (!err)
946 				err = -EINVAL;
947 		}
948 
949 		if (err)
950 			break;
951 
952 		err = igt_flush_test(gt->i915);
953 		if (err)
954 			break;
955 	}
956 
957 	if (intel_gt_is_wedged(gt))
958 		err = -EIO;
959 
960 	if (flags & TEST_ACTIVE)
961 		hang_fini(&h);
962 
963 	return err;
964 }
965 
966 static int igt_reset_engines(void *arg)
967 {
968 	static const struct {
969 		const char *name;
970 		unsigned int flags;
971 	} phases[] = {
972 		{ "idle", 0 },
973 		{ "active", TEST_ACTIVE },
974 		{ "others-idle", TEST_OTHERS },
975 		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
976 		{
977 			"others-priority",
978 			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
979 		},
980 		{
981 			"self-priority",
982 			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
983 		},
984 		{ }
985 	};
986 	struct intel_gt *gt = arg;
987 	typeof(*phases) *p;
988 	int err;
989 
990 	for (p = phases; p->name; p++) {
991 		if (p->flags & TEST_PRIORITY) {
992 			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
993 				continue;
994 		}
995 
996 		err = __igt_reset_engines(arg, p->name, p->flags);
997 		if (err)
998 			return err;
999 	}
1000 
1001 	return 0;
1002 }
1003 
1004 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1005 {
1006 	u32 count = i915_reset_count(&gt->i915->gpu_error);
1007 
1008 	intel_gt_reset(gt, mask, NULL);
1009 
1010 	return count;
1011 }
1012 
1013 static int igt_reset_wait(void *arg)
1014 {
1015 	struct intel_gt *gt = arg;
1016 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1017 	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
1018 	struct i915_request *rq;
1019 	unsigned int reset_count;
1020 	struct hang h;
1021 	long timeout;
1022 	int err;
1023 
1024 	if (!engine || !intel_engine_can_store_dword(engine))
1025 		return 0;
1026 
1027 	/* Check that we detect a stuck waiter and issue a reset */
1028 
1029 	igt_global_reset_lock(gt);
1030 
1031 	err = hang_init(&h, gt);
1032 	if (err)
1033 		goto unlock;
1034 
1035 	rq = hang_create_request(&h, engine);
1036 	if (IS_ERR(rq)) {
1037 		err = PTR_ERR(rq);
1038 		goto fini;
1039 	}
1040 
1041 	i915_request_get(rq);
1042 	i915_request_add(rq);
1043 
1044 	if (!wait_until_running(&h, rq)) {
1045 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1046 
1047 		pr_err("%s: Failed to start request %llx, at %x\n",
1048 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1049 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1050 
1051 		intel_gt_set_wedged(gt);
1052 
1053 		err = -EIO;
1054 		goto out_rq;
1055 	}
1056 
1057 	reset_count = fake_hangcheck(gt, ALL_ENGINES);
1058 
1059 	timeout = i915_request_wait(rq, 0, 10);
1060 	if (timeout < 0) {
1061 		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1062 		       timeout);
1063 		err = timeout;
1064 		goto out_rq;
1065 	}
1066 
1067 	if (i915_reset_count(global) == reset_count) {
1068 		pr_err("No GPU reset recorded!\n");
1069 		err = -EINVAL;
1070 		goto out_rq;
1071 	}
1072 
1073 out_rq:
1074 	i915_request_put(rq);
1075 fini:
1076 	hang_fini(&h);
1077 unlock:
1078 	igt_global_reset_unlock(gt);
1079 
1080 	if (intel_gt_is_wedged(gt))
1081 		return -EIO;
1082 
1083 	return err;
1084 }
1085 
/*
 * Argument block handed to the evict_vma() / evict_fence() kthreads by
 * __igt_reset_evict_vma().
 */
struct evict_vma {
	/* Completed by the kthread as soon as it starts running. */
	struct completion completion;
	/* The vma the kthread operates on. */
	struct i915_vma *vma;
};
1090 
1091 static int evict_vma(void *data)
1092 {
1093 	struct evict_vma *arg = data;
1094 	struct i915_address_space *vm = arg->vma->vm;
1095 	struct drm_mm_node evict = arg->vma->node;
1096 	int err;
1097 
1098 	complete(&arg->completion);
1099 
1100 	mutex_lock(&vm->mutex);
1101 	err = i915_gem_evict_for_node(vm, &evict, 0);
1102 	mutex_unlock(&vm->mutex);
1103 
1104 	return err;
1105 }
1106 
1107 static int evict_fence(void *data)
1108 {
1109 	struct evict_vma *arg = data;
1110 	int err;
1111 
1112 	complete(&arg->completion);
1113 
1114 	/* Mark the fence register as dirty to force the mmio update. */
1115 	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1116 	if (err) {
1117 		pr_err("Invalid Y-tiling settings; err:%d\n", err);
1118 		return err;
1119 	}
1120 
1121 	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1122 	if (err) {
1123 		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1124 		return err;
1125 	}
1126 
1127 	err = i915_vma_pin_fence(arg->vma);
1128 	i915_vma_unpin(arg->vma);
1129 	if (err) {
1130 		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1131 		return err;
1132 	}
1133 
1134 	i915_vma_unpin_fence(arg->vma);
1135 
1136 	return 0;
1137 }
1138 
/*
 * Check that we can recover an unbind stuck on a hanging request.
 *
 * A 1MiB object is bound into @vm and made active on a hanging request
 * submitted to RCS0. The kthread @fn (evict_vma() or evict_fence()) is then
 * started against that vma; once it has been seen waiting on the request's
 * fence, the engine is reset and the kthread is expected to finish within
 * the 100ms watchdog, otherwise the GT is wedged.
 *
 * @flags are EXEC_OBJECT_* flags: EXEC_OBJECT_NEEDS_FENCE makes the object
 * X-tiled and holds a fence while it is made active; EXEC_OBJECT_WRITE
 * selects a write await. The flags are also passed on to
 * i915_vma_move_to_active().
 *
 * Skipped (returns 0) if RCS0 is missing or cannot store a dword. Returns
 * the kthread's result or a negative errno, and -EIO whenever the GT ends up
 * wedged.
 */
static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	/*
	 * From here on the request exists but has not been added yet, so each
	 * early exit adds it first; no extra reference is held until the
	 * i915_request_get() below.
	 */
	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	/* Tie the vma to the hanging request (same steps as move_to_active()). */
	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		/* err stays 0 here; the wedged check at the end reports -EIO. */
		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	/* The kthread should attach a callback to the request's fence. */
	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}
1281 
1282 static int igt_reset_evict_ggtt(void *arg)
1283 {
1284 	struct intel_gt *gt = arg;
1285 
1286 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1287 				     evict_vma, EXEC_OBJECT_WRITE);
1288 }
1289 
1290 static int igt_reset_evict_ppgtt(void *arg)
1291 {
1292 	struct intel_gt *gt = arg;
1293 	struct i915_gem_context *ctx;
1294 	struct drm_file *file;
1295 	int err;
1296 
1297 	file = mock_file(gt->i915);
1298 	if (IS_ERR(file))
1299 		return PTR_ERR(file);
1300 
1301 	mutex_lock(&gt->i915->drm.struct_mutex);
1302 	ctx = live_context(gt->i915, file);
1303 	mutex_unlock(&gt->i915->drm.struct_mutex);
1304 	if (IS_ERR(ctx)) {
1305 		err = PTR_ERR(ctx);
1306 		goto out;
1307 	}
1308 
1309 	err = 0;
1310 	if (ctx->vm) /* aliasing == global gtt locking, covered above */
1311 		err = __igt_reset_evict_vma(gt, ctx->vm,
1312 					    evict_vma, EXEC_OBJECT_WRITE);
1313 
1314 out:
1315 	mock_file_free(gt->i915, file);
1316 	return err;
1317 }
1318 
1319 static int igt_reset_evict_fence(void *arg)
1320 {
1321 	struct intel_gt *gt = arg;
1322 
1323 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1324 				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1325 }
1326 
1327 static int wait_for_others(struct intel_gt *gt,
1328 			   struct intel_engine_cs *exclude)
1329 {
1330 	struct intel_engine_cs *engine;
1331 	enum intel_engine_id id;
1332 
1333 	for_each_engine(engine, gt->i915, id) {
1334 		if (engine == exclude)
1335 			continue;
1336 
1337 		if (!wait_for_idle(engine))
1338 			return -EIO;
1339 	}
1340 
1341 	return 0;
1342 }
1343 
1344 static int igt_reset_queue(void *arg)
1345 {
1346 	struct intel_gt *gt = arg;
1347 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1348 	struct intel_engine_cs *engine;
1349 	enum intel_engine_id id;
1350 	struct hang h;
1351 	int err;
1352 
1353 	/* Check that we replay pending requests following a hang */
1354 
1355 	igt_global_reset_lock(gt);
1356 
1357 	err = hang_init(&h, gt);
1358 	if (err)
1359 		goto unlock;
1360 
1361 	for_each_engine(engine, gt->i915, id) {
1362 		struct i915_request *prev;
1363 		IGT_TIMEOUT(end_time);
1364 		unsigned int count;
1365 
1366 		if (!intel_engine_can_store_dword(engine))
1367 			continue;
1368 
1369 		prev = hang_create_request(&h, engine);
1370 		if (IS_ERR(prev)) {
1371 			err = PTR_ERR(prev);
1372 			goto fini;
1373 		}
1374 
1375 		i915_request_get(prev);
1376 		i915_request_add(prev);
1377 
1378 		count = 0;
1379 		do {
1380 			struct i915_request *rq;
1381 			unsigned int reset_count;
1382 
1383 			rq = hang_create_request(&h, engine);
1384 			if (IS_ERR(rq)) {
1385 				err = PTR_ERR(rq);
1386 				goto fini;
1387 			}
1388 
1389 			i915_request_get(rq);
1390 			i915_request_add(rq);
1391 
1392 			/*
1393 			 * XXX We don't handle resetting the kernel context
1394 			 * very well. If we trigger a device reset twice in
1395 			 * quick succession while the kernel context is
1396 			 * executing, we may end up skipping the breadcrumb.
1397 			 * This is really only a problem for the selftest as
1398 			 * normally there is a large interlude between resets
1399 			 * (hangcheck), or we focus on resetting just one
1400 			 * engine and so avoid repeatedly resetting innocents.
1401 			 */
1402 			err = wait_for_others(gt, engine);
1403 			if (err) {
1404 				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1405 				       __func__, engine->name);
1406 				i915_request_put(rq);
1407 				i915_request_put(prev);
1408 
1409 				GEM_TRACE_DUMP();
1410 				intel_gt_set_wedged(gt);
1411 				goto fini;
1412 			}
1413 
1414 			if (!wait_until_running(&h, prev)) {
1415 				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1416 
1417 				pr_err("%s(%s): Failed to start request %llx, at %x\n",
1418 				       __func__, engine->name,
1419 				       prev->fence.seqno, hws_seqno(&h, prev));
1420 				intel_engine_dump(engine, &p,
1421 						  "%s\n", engine->name);
1422 
1423 				i915_request_put(rq);
1424 				i915_request_put(prev);
1425 
1426 				intel_gt_set_wedged(gt);
1427 
1428 				err = -EIO;
1429 				goto fini;
1430 			}
1431 
1432 			reset_count = fake_hangcheck(gt, BIT(id));
1433 
1434 			if (prev->fence.error != -EIO) {
1435 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1436 				       prev->fence.error);
1437 				i915_request_put(rq);
1438 				i915_request_put(prev);
1439 				err = -EINVAL;
1440 				goto fini;
1441 			}
1442 
1443 			if (rq->fence.error) {
1444 				pr_err("Fence error status not zero [%d] after unrelated reset\n",
1445 				       rq->fence.error);
1446 				i915_request_put(rq);
1447 				i915_request_put(prev);
1448 				err = -EINVAL;
1449 				goto fini;
1450 			}
1451 
1452 			if (i915_reset_count(global) == reset_count) {
1453 				pr_err("No GPU reset recorded!\n");
1454 				i915_request_put(rq);
1455 				i915_request_put(prev);
1456 				err = -EINVAL;
1457 				goto fini;
1458 			}
1459 
1460 			i915_request_put(prev);
1461 			prev = rq;
1462 			count++;
1463 		} while (time_before(jiffies, end_time));
1464 		pr_info("%s: Completed %d resets\n", engine->name, count);
1465 
1466 		*h.batch = MI_BATCH_BUFFER_END;
1467 		intel_gt_chipset_flush(engine->gt);
1468 
1469 		i915_request_put(prev);
1470 
1471 		err = igt_flush_test(gt->i915);
1472 		if (err)
1473 			break;
1474 	}
1475 
1476 fini:
1477 	hang_fini(&h);
1478 unlock:
1479 	igt_global_reset_unlock(gt);
1480 
1481 	if (intel_gt_is_wedged(gt))
1482 		return -EIO;
1483 
1484 	return err;
1485 }
1486 
1487 static int igt_handle_error(void *arg)
1488 {
1489 	struct intel_gt *gt = arg;
1490 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1491 	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
1492 	struct hang h;
1493 	struct i915_request *rq;
1494 	struct i915_gpu_state *error;
1495 	int err;
1496 
1497 	/* Check that we can issue a global GPU and engine reset */
1498 
1499 	if (!intel_has_reset_engine(gt))
1500 		return 0;
1501 
1502 	if (!engine || !intel_engine_can_store_dword(engine))
1503 		return 0;
1504 
1505 	err = hang_init(&h, gt);
1506 	if (err)
1507 		return err;
1508 
1509 	rq = hang_create_request(&h, engine);
1510 	if (IS_ERR(rq)) {
1511 		err = PTR_ERR(rq);
1512 		goto err_fini;
1513 	}
1514 
1515 	i915_request_get(rq);
1516 	i915_request_add(rq);
1517 
1518 	if (!wait_until_running(&h, rq)) {
1519 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1520 
1521 		pr_err("%s: Failed to start request %llx, at %x\n",
1522 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1523 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1524 
1525 		intel_gt_set_wedged(gt);
1526 
1527 		err = -EIO;
1528 		goto err_request;
1529 	}
1530 
1531 	/* Temporarily disable error capture */
1532 	error = xchg(&global->first_error, (void *)-1);
1533 
1534 	intel_gt_handle_error(gt, engine->mask, 0, NULL);
1535 
1536 	xchg(&global->first_error, error);
1537 
1538 	if (rq->fence.error != -EIO) {
1539 		pr_err("Guilty request not identified!\n");
1540 		err = -EINVAL;
1541 		goto err_request;
1542 	}
1543 
1544 err_request:
1545 	i915_request_put(rq);
1546 err_fini:
1547 	hang_fini(&h);
1548 	return err;
1549 }
1550 
1551 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1552 				     const struct igt_atomic_section *p,
1553 				     const char *mode)
1554 {
1555 	struct tasklet_struct * const t = &engine->execlists.tasklet;
1556 	int err;
1557 
1558 	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1559 		  engine->name, mode, p->name);
1560 
1561 	tasklet_disable_nosync(t);
1562 	p->critical_section_begin();
1563 
1564 	err = intel_engine_reset(engine, NULL);
1565 
1566 	p->critical_section_end();
1567 	tasklet_enable(t);
1568 
1569 	if (err)
1570 		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1571 		       engine->name, mode, p->name);
1572 
1573 	return err;
1574 }
1575 
1576 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1577 				   const struct igt_atomic_section *p)
1578 {
1579 	struct i915_request *rq;
1580 	struct hang h;
1581 	int err;
1582 
1583 	err = __igt_atomic_reset_engine(engine, p, "idle");
1584 	if (err)
1585 		return err;
1586 
1587 	err = hang_init(&h, engine->gt);
1588 	if (err)
1589 		return err;
1590 
1591 	rq = hang_create_request(&h, engine);
1592 	if (IS_ERR(rq)) {
1593 		err = PTR_ERR(rq);
1594 		goto out;
1595 	}
1596 
1597 	i915_request_get(rq);
1598 	i915_request_add(rq);
1599 
1600 	if (wait_until_running(&h, rq)) {
1601 		err = __igt_atomic_reset_engine(engine, p, "active");
1602 	} else {
1603 		pr_err("%s(%s): Failed to start request %llx, at %x\n",
1604 		       __func__, engine->name,
1605 		       rq->fence.seqno, hws_seqno(&h, rq));
1606 		intel_gt_set_wedged(engine->gt);
1607 		err = -EIO;
1608 	}
1609 
1610 	if (err == 0) {
1611 		struct intel_wedge_me w;
1612 
1613 		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1614 			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1615 		if (intel_gt_is_wedged(engine->gt))
1616 			err = -EIO;
1617 	}
1618 
1619 	i915_request_put(rq);
1620 out:
1621 	hang_fini(&h);
1622 	return err;
1623 }
1624 
1625 static int igt_reset_engines_atomic(void *arg)
1626 {
1627 	struct intel_gt *gt = arg;
1628 	const typeof(*igt_atomic_phases) *p;
1629 	int err = 0;
1630 
1631 	/* Check that the engines resets are usable from atomic context */
1632 
1633 	if (!intel_has_reset_engine(gt))
1634 		return 0;
1635 
1636 	if (USES_GUC_SUBMISSION(gt->i915))
1637 		return 0;
1638 
1639 	igt_global_reset_lock(gt);
1640 
1641 	/* Flush any requests before we get started and check basics */
1642 	if (!igt_force_reset(gt))
1643 		goto unlock;
1644 
1645 	for (p = igt_atomic_phases; p->name; p++) {
1646 		struct intel_engine_cs *engine;
1647 		enum intel_engine_id id;
1648 
1649 		for_each_engine(engine, gt->i915, id) {
1650 			err = igt_atomic_reset_engine(engine, p);
1651 			if (err)
1652 				goto out;
1653 		}
1654 	}
1655 
1656 out:
1657 	/* As we poke around the guts, do a full reset before continuing. */
1658 	igt_force_reset(gt);
1659 unlock:
1660 	igt_global_reset_unlock(gt);
1661 
1662 	return err;
1663 }
1664 
1665 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1666 {
1667 	static const struct i915_subtest tests[] = {
1668 		SUBTEST(igt_hang_sanitycheck),
1669 		SUBTEST(igt_reset_nop),
1670 		SUBTEST(igt_reset_nop_engine),
1671 		SUBTEST(igt_reset_idle_engine),
1672 		SUBTEST(igt_reset_active_engine),
1673 		SUBTEST(igt_reset_engines),
1674 		SUBTEST(igt_reset_engines_atomic),
1675 		SUBTEST(igt_reset_queue),
1676 		SUBTEST(igt_reset_wait),
1677 		SUBTEST(igt_reset_evict_ggtt),
1678 		SUBTEST(igt_reset_evict_ppgtt),
1679 		SUBTEST(igt_reset_evict_fence),
1680 		SUBTEST(igt_handle_error),
1681 	};
1682 	struct intel_gt *gt = &i915->gt;
1683 	intel_wakeref_t wakeref;
1684 	bool saved_hangcheck;
1685 	int err;
1686 
1687 	if (!intel_has_gpu_reset(gt))
1688 		return 0;
1689 
1690 	if (intel_gt_is_wedged(gt))
1691 		return -EIO; /* we're long past hope of a successful reset */
1692 
1693 	wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
1694 	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
1695 	drain_delayed_work(&gt->hangcheck.work); /* flush param */
1696 
1697 	err = intel_gt_live_subtests(tests, gt);
1698 
1699 	i915_modparams.enable_hangcheck = saved_hangcheck;
1700 	intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);
1701 
1702 	return err;
1703 }
1704