/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

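/*
 * A "hang" fixture: a kernel context plus two small objects. h->obj holds a
 * batch that spins forever (it jumps back on itself instead of reaching its
 * terminating MI_BATCH_BUFFER_END), and h->hws is a scratch page into which
 * the spinning batch writes its request's seqno so the test can tell when it
 * has actually started executing on the GPU.
 */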
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

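/*
 * Each request's fence.context indexes its own dword slot within the HWS
 * page; return the GPU address of that slot for the bound HWS vma.
 */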
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

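/*
 * Build a request whose batch writes rq->fence.seqno into this context's HWS
 * slot and then branches back to its own start, spinning indefinitely. The
 * closing MI_BATCH_BUFFER_END is only reached once someone (hang_fini() or a
 * test) rewrites the batch, so the request "hangs" until deliberately
 * released or reset.
 */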
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

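/* Read back the seqno that the spinning batch wrote to its HWS slot, if any. */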
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

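/*
 * Release the fixture: first rewrite the start of the batch to a plain
 * MI_BATCH_BUFFER_END so any request still spinning on it terminates, then
 * drop the mappings, objects and context.
 */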
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

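/*
 * Wait for the spinner to report its seqno in the HWS: a short 10us busy-wait
 * for the common case, then a sleeping wait of up to 1s before declaring that
 * the request never started running.
 */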
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

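/*
 * The heartbeat would notice our deliberately hung requests and reset the
 * engine behind the test's back, so park it (while holding an engine-pm
 * reference) for the duration of each reset loop and restore the default
 * interval afterwards.
 */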
static void engine_heartbeat_disable(struct intel_engine_cs *engine)
{
	engine->props.heartbeat_interval_ms = 0;

	intel_engine_pm_get(engine);
	intel_engine_park_heartbeat(engine);
}

static void engine_heartbeat_enable(struct intel_engine_cs *engine)
{
	intel_engine_pm_put(engine);

	engine->props.heartbeat_interval_ms =
		engine->defaults.heartbeat_interval_ms;
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

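/*
 * Per-engine background load used by __igt_reset_engines(): a kthread that
 * keeps @engine busy with a rolling window of requests while some engine
 * (possibly this one) is being reset. The TEST_* flags select which engines
 * run the load and whether random priorities are applied.
 */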
struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

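/*
 * kthread body for the background load: keep @engine busy with a rolling
 * window of up to 8 outstanding requests across 8 contexts, optionally
 * shuffling their priorities, until asked to stop.
 */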
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			while (count--) /* also release ce[0]; don't underflow */
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s):"
					       " failed to reset request %llx:%lld\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

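/*
 * Exercise per-engine resets under each combination of background load:
 * idle, the target engine active, the other engines active, and with
 * randomised priorities (only when the scheduler supports priorities).
 */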
static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

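/*
 * Stand-in for hangcheck: directly reset the engines in @mask and return the
 * global reset count sampled just beforehand, so callers can check whether a
 * new device reset was recorded.
 */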
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

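/*
 * Thread body: try to evict the target vma's node while the GPU is hung; the
 * eviction must wait upon the hanging batch and so should only complete once
 * the reset has run. Signals @completion just before taking the vm mutex.
 */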
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

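/*
 * Thread body: steal the fence register backing the target vma while the GPU
 * is hung, by switching the object to Y-tiling and re-pinning a fence; the
 * fence update is expected to stall behind the hanging batch until the reset
 * completes.
 */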
static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

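/* Flush every engine other than @exclude to idle, or report -EIO. */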
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

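/*
 * Run an engine reset from within the given atomic section (irqs/preemption
 * disabled as described by @p), with the submission tasklet suspended, to
 * check that the reset path itself never sleeps.
 */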
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}