/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_gt.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

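/*
 * State for constructing "hanging" batches: a recycled batch object that
 * spins in place until we explicitly rewrite it with a terminator, plus a
 * page of hardware status words used to detect when each request has
 * actually started executing on the GPU.
 */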
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

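/*
 * Allocate the kernel context, the HWS page and the batch object used to
 * build hanging requests, keeping both buffers mapped so that we can poll
 * the breadcrumbs and terminate the spinner by hand from the CPU.
 */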
static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

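/* Location in the HWS page where this request's breadcrumb is written */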
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

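/*
 * Build a request whose batch writes its seqno to the HWS and then spins
 * in place (MI_BATCH_BUFFER_START looping back on itself) until the batch
 * is overwritten with MI_BATCH_BUFFER_END -- a GPU hang on demand.
 */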
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = h->ctx->vm ?: &engine->gt->ggtt->vm;
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

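/* Read back the seqno last written to the HWS by this request's batch */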
static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915, I915_WAIT_LOCKED);
}

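/*
 * Poll the HWS until the spinner reports that it has started executing:
 * a short busy-wait first, then a sleeping wait of up to a second.
 */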
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	reset_count = i915_reset_count(global);
	count = 0;
	do {
		mutex_lock(&gt->i915->drm.struct_mutex);

		for_each_engine(engine, gt->i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			mutex_lock(&gt->i915->drm.struct_mutex);
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			mutex_unlock(&gt->i915->drm.struct_mutex);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

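/*
 * Background kthread that keeps its engine busy by continuously submitting
 * nop requests across a handful of contexts, optionally with randomised
 * priorities, until asked to stop.
 */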
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt->i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt->i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt->i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		mutex_lock(&gt->i915->drm.struct_mutex);
		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

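/*
 * Stand-in for the hangcheck worker: reset the engines in @mask directly
 * and return the global reset count sampled beforehand for comparison.
 */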
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

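/* Thread body: try to evict the target vma's node from its address space */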
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

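/*
 * Thread body: force a fence register update on the target vma by marking
 * it Y-tiled and then pinning a fence for it.
 */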
static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->vm) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(gt, ctx->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(gt->i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

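/* Wait for every engine except @exclude to settle into idle */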
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt->i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&gt->i915->drm.struct_mutex);

	err = hang_init(&h, gt);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&gt->i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	mutex_lock(&gt->i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

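/*
 * Perform an engine reset from within the atomic (non-sleeping) context
 * provided by @p, with the engine's execlists tasklet disabled for the
 * duration of the critical section.
 */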
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);
	mutex_lock(&gt->i915->drm.struct_mutex);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt->i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);

unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&gt->hangcheck.work); /* flush param */

	err = intel_gt_live_subtests(tests, gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);

	return err;
}