/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"
#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_wedge_me.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

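/*
 * struct hang bundles the objects used to build a "hanging" batch: a
 * command buffer that reports its own start by writing the request seqno
 * into a per-context slot of the HWS page and then spins in an infinite
 * batch-buffer loop until its first instruction is rewritten to
 * MI_BATCH_BUFFER_END (or the engine is reset).
 */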
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

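/*
 * Each fence context gets its own dword within the HWS page (indexed by
 * the fence context id, modulo the page), so concurrent spinners do not
 * overwrite each other's breadcrumbs.
 */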
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

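/*
 * Build a request whose batch writes the request seqno into the HWS page
 * (so we can observe it running) and then spins forever by jumping back
 * to the start of the batch. If the previous batch object is still active
 * on the GPU, a fresh one is allocated so we never rewrite instructions
 * that may still be executing.
 */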
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm = h->ctx->vm ?: &i915->ggtt.vm;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						i915_coherent_map_type(h->i915));
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

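/*
 * Terminate any still-spinning batch by overwriting its first instruction
 * with MI_BATCH_BUFFER_END, then release the harness objects and flush.
 */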
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->i915, I915_WAIT_LOCKED);
}

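/*
 * Poll the HWS until the spinner reports (via its seqno write) that it has
 * started executing on the GPU: a short busy-wait followed by a longer
 * sleeping wait.
 */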
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct igt_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_add(rq);

		timeout = 0;
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (i915_reset_failed(i915))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
	reset_count = i915_reset_count(&i915->gpu_error);
	count = 0;
	do {
		mutex_lock(&i915->drm.struct_mutex);
		for_each_engine(engine, i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}
		mutex_unlock(&i915->drm.struct_mutex);

		igt_global_reset_lock(i915);
		i915_reset(i915, ALL_ENGINES, NULL);
		igt_global_reset_unlock(i915);
		if (i915_reset_failed(i915)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(&i915->gpu_error) !=
		    reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	mutex_lock(&i915->drm.struct_mutex);
	err = igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	intel_runtime_pm_put(&i915->runtime_pm, wakeref);

out:
	mock_file_free(i915, file);
	if (i915_reset_failed(i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(i915))
		return 0;

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			mutex_lock(&i915->drm.struct_mutex);
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			mutex_unlock(&i915->drm.struct_mutex);

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	mutex_lock(&i915->drm.struct_mutex);
	err = igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
out:
	mock_file_free(i915, file);
	if (i915_reset_failed(i915))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_reset_failed(i915))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

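/*
 * Wait up to 5s for a background request to complete; if it does not, the
 * GPU is declared wedged so the selftest fails cleanly instead of hanging.
 */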
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		i915_gem_set_wedged(rq->i915);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

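/*
 * Background kthread used by __igt_reset_engines: keeps its engine busy
 * with a rolling window of requests spread across several contexts
 * (optionally at random priorities) until told to stop.
 */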
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					i915_gem_set_wedged(i915);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		intel_engine_pm_put(engine);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(&i915->gpu_error, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other != engine &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(&i915->gpu_error, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		mutex_lock(&i915->drm.struct_mutex);
		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			break;
	}

	if (i915_reset_failed(i915))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct drm_i915_private *i915 = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

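/*
 * Simulate hangcheck firing: perform the reset directly and return the
 * global reset count sampled beforehand so callers can verify that a
 * reset was indeed recorded.
 */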
static u32 fake_hangcheck(struct drm_i915_private *i915,
			  intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS0]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS0]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(i915, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

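/*
 * Kthread body: signal that we have started, then try to evict the node
 * used by the target vma. As the vma is busy on a hanging request, the
 * eviction can only make progress once the GPU has been reset.
 */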
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

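/*
 * Kthread body for the fence variant: switch the object to Y-tiling to
 * dirty its fence register, then try to pin a fence for the vma, which
 * is expected to stall behind the hanging request still using it.
 */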
static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	struct drm_i915_private *i915 = arg->vma->vm->i915;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin_fence(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	i915_vma_unpin_fence(arg->vma);

out_unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct drm_i915_private *i915,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS0]))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, i915->engine[RCS0]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&i915->drm.struct_mutex);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(i915);
	fake_hangcheck(rq->i915, rq->engine->mask);
	igt_global_reset_unlock(i915);

	if (tsk) {
		struct igt_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout*/)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

	mutex_lock(&i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->vm) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(i915, ctx->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

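/* Wait for every engine other than @exclude to settle back to idle. */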
static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(i915, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, engine->mask, 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

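/*
 * Reset the engine from within the atomic context described by @p (its
 * critical_section_begin/end hooks), with the execlists tasklet disabled
 * for the duration, and report whether the reset succeeded.
 */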
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = i915_reset_engine(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct drm_i915_private *i915 = engine->i915;
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, i915);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		i915_gem_set_wedged(i915);
		err = -EIO;
	}

	if (err == 0) {
		struct igt_wedge_me w;

		igt_wedge_on_timeout(&w, i915, HZ / 20 /* 50ms timeout*/)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (i915_reset_failed(i915))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engines resets are usable from atomic context */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (USES_GUC_SUBMISSION(i915))
		return 0;

	igt_global_reset_lock(i915);
	mutex_lock(&i915->drm.struct_mutex);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(i915))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(i915);

unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	return err;
}

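/*
 * Entry point: run the reset/hang selftests with hangcheck temporarily
 * disabled, so that only the tests themselves trigger resets, and restore
 * the modparam afterwards.
 */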
int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	if (i915_terminally_wedged(i915))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&i915->gpu_error.hangcheck_work); /* flush param */

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(&i915->runtime_pm, wakeref);

	return err;
}