/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

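/*
 * Reusable fixture for the hang tests: a batch that reports its seqno into a
 * per-context slot of a fake HWS page and then spins in an infinite loop
 * until the loop is overwritten with MI_BATCH_BUFFER_END to let the request
 * complete.
 */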
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

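/* Each context is assigned its own dword within the shared HWS page. */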
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

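	/*
	 * Emit the spinning batch: write the request's seqno into its HWS
	 * slot, then branch back to the start of the batch with
	 * MI_BATCH_BUFFER_START so that it loops until the loop is replaced
	 * by MI_BATCH_BUFFER_END (see hang_fini() and the callers).
	 */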
	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

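/* Read back the seqno that the spinning batch wrote for this context. */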
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

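/*
 * Park the heartbeat so that it neither sends background requests to the
 * engine nor resets it behind our back while the test tampers with it.
 */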
static void engine_heartbeat_disable(struct intel_engine_cs *engine,
				     unsigned long *saved)
{
	*saved = engine->props.heartbeat_interval_ms;
	engine->props.heartbeat_interval_ms = 0;

	intel_engine_pm_get(engine);
	intel_engine_park_heartbeat(engine);
}

static void engine_heartbeat_enable(struct intel_engine_cs *engine,
				    unsigned long saved)
{
	intel_engine_pm_put(engine);

	engine->props.heartbeat_interval_ms = saved;
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

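		/* Terminate the spinning loop so the batch completes at once. */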
		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

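/*
 * Flags for __igt_reset_engines(): keep the target engine (TEST_ACTIVE,
 * TEST_SELF) and/or the other engines (TEST_OTHERS) busy while the target
 * is being repeatedly reset, optionally with randomised request priorities
 * (TEST_PRIORITY).
 */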
#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

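/* Wait for a queued request to complete, wedging the GT if it is stuck. */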
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			while (count--)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		unsigned long heartbeat;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		engine_heartbeat_disable(engine, &heartbeat);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		engine_heartbeat_enable(engine, heartbeat);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

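/*
 * Trigger a device reset as if hangcheck had fired, returning the global
 * reset count sampled beforehand.
 */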
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

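/*
 * Worker threads for __igt_reset_evict_vma(): each blocks trying to evict,
 * or change the fencing of, the vma kept busy by the hanging request and
 * should only make progress once the reset has completed.
 */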
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

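/* Wait for every engine other than @exclude to settle into idle. */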
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

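		/* Let the last spinner complete by terminating its loop. */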
		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

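/*
 * Reset the engine while inside the given atomic section (and with the
 * submission tasklet disabled) to check that the reset path does not sleep.
 */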
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}