/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "gt/intel_gt.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

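/*
 * Fixture shared by the hang tests: a non-bannable kernel context plus two
 * internal objects, a HWS page used to report progress (h->seqno) and a
 * batch buffer (h->batch) into which hang_create_request() writes a
 * self-referencing, spinning batch.
 */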
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

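/*
 * Build a request whose batch writes the request's seqno into the HWS page
 * (so wait_until_running() can observe it) and then spins forever by
 * jumping back to its own start with MI_BATCH_BUFFER_START. The hang is
 * only resolved by a reset, or by overwriting the batch with
 * MI_BATCH_BUFFER_END (see hang_fini() and igt_hang_sanitycheck()).
 */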
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = h->ctx->vm ?: &engine->gt->ggtt->vm;
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj))
		return ERR_CAST(obj);

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915, I915_WAIT_LOCKED);
}

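/*
 * Wait for the spinner to report its seqno in the HWS: first a short 10us
 * busy-wait, then a sleeping wait of up to a second. Returns true once the
 * request has actually started executing on the GPU.
 */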
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	reset_count = i915_reset_count(global);
	count = 0;
	do {
		mutex_lock(&gt->i915->drm.struct_mutex);

		for_each_engine(engine, gt->i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			mutex_lock(&gt->i915->drm.struct_mutex);
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			mutex_unlock(&gt->i915->drm.struct_mutex);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

out:
	mock_file_free(gt->i915, file);
	if (intel_gt_is_wedged(gt))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, gt->i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915, 0);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

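/*
 * kthread body used by __igt_reset_engines() to keep a background engine
 * busy: it cycles through a small ring of contexts and requests (optionally
 * randomising priority) until kthread_stop(), wedging the GT and reporting
 * -EIO if any of its requests fails to complete within 5s.
 */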
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		err = hang_init(&h, gt);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt->i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt->i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&gt->i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&gt->i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&gt->i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		intel_engine_pm_put(engine);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt->i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		mutex_lock(&gt->i915->drm.struct_mutex);
		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		mutex_unlock(&gt->i915->drm.struct_mutex);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&gt->i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&gt->i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

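/*
 * Trigger a GPU reset directly, as if hangcheck had declared the engines in
 * @mask hung, and return the global reset count sampled beforehand so the
 * caller can verify that the reset was recorded.
 */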
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

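/*
 * Thread bodies for __igt_reset_evict_vma(): each signals arg->completion
 * and then attempts an operation (evicting the vma's node, or rebinding a
 * fence) that must wait upon the hanging batch, and so can only make
 * progress once the engine has been reset.
 */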
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	struct drm_i915_private *i915 = arg->vma->vm->i915;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	i915_vma_unpin_fence(arg->vma);

out_unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

	mutex_lock(&gt->i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(gt->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&gt->i915->drm.struct_mutex);
	ctx = live_context(gt->i915, file);
	mutex_unlock(&gt->i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->vm) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(gt, ctx->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(gt->i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt->i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt->i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&gt->i915->drm.struct_mutex);

	err = hang_init(&h, gt);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&gt->i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	mutex_lock(&gt->i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	return err;
}

static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = intel_engine_reset(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt->i915))
		return 0;

	if (USES_GUC_SUBMISSION(gt->i915))
		return 0;

	igt_global_reset_lock(gt);
	mutex_lock(&gt->i915->drm.struct_mutex);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt->i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);

unlock:
	mutex_unlock(&gt->i915->drm.struct_mutex);
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(gt->i915))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(&gt->i915->runtime_pm);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&gt->hangcheck.work); /* flush param */

	err = intel_gt_live_subtests(tests, gt);

	mutex_lock(&gt->i915->drm.struct_mutex);
	igt_flush_test(gt->i915, I915_WAIT_LOCKED);
	mutex_unlock(&gt->i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(&gt->i915->runtime_pm, wakeref);

	return err;
}