/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "intel_engine_pm.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_gem_utils.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_wedge_me.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_context.h"
#include "selftests/mock_drm.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

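/*
 * Fixture shared by the hang tests: a recursive batch (obj/batch) that
 * spins forever until rewritten with MI_BATCH_BUFFER_END, plus a scratch
 * page (hws/seqno) into which each request writes its seqno so we can
 * tell when the hanging batch has actually started executing.
 */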
struct hang {
	struct drm_i915_private *i915;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct drm_i915_private *i915)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->i915 = i915;

	h->ctx = kernel_context(i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

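/*
 * Each context is assigned its own u32 slot in the HWS page (indexed by
 * its fence context, wrapped to the page), so concurrent requests do not
 * overwrite each other's breadcrumbs.
 */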
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32) * rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	err = i915_vma_move_to_active(vma, rq, flags);
	if (err)
		return err;

	if (!i915_gem_object_has_active_reference(vma->obj)) {
		i915_gem_object_get(vma->obj);
		i915_gem_object_set_active_reference(vma->obj);
	}

	return 0;
}

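/*
 * Build a request whose batch writes rq->fence.seqno into the HWS slot for
 * its context and then branches back to itself with MI_BATCH_BUFFER_START,
 * spinning until the batch is overwritten with MI_BATCH_BUFFER_END.
 */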
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct drm_i915_private *i915 = h->i915;
	struct i915_address_space *vm =
		h->ctx->ppgtt ? &h->ctx->ppgtt->vm : &i915->ggtt.vm;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	u32 *batch;
	int err;

	if (i915_gem_object_is_active(h->obj)) {
		struct drm_i915_gem_object *obj;
		void *vaddr;

		obj = i915_gem_object_create_internal(h->i915, PAGE_SIZE);
		if (IS_ERR(obj))
			return ERR_CAST(obj);

		vaddr = i915_gem_object_pin_map(obj,
						i915_coherent_map_type(h->i915));
		if (IS_ERR(vaddr)) {
			i915_gem_object_put(obj);
			return ERR_CAST(vaddr);
		}

		i915_gem_object_unpin_map(h->obj);
		i915_gem_object_put(h->obj);

		h->obj = obj;
		h->batch = vaddr;
	}

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma))
		return ERR_CAST(vma);

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws))
		return ERR_CAST(hws);

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err)
		return ERR_PTR(err);

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	i915_gem_chipset_flush(h->i915);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(vm->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_skip(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	i915_gem_chipset_flush(h->i915);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->i915, I915_WAIT_LOCKED);
}

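/*
 * The hanging batch announces itself by writing its seqno into the HWS
 * page; poll that slot (a short busy-wait, then a sleeping wait of up to
 * a second) to confirm the request has started executing on the GPU.
 */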
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct igt_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_add(rq);

		timeout = 0;
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout */)
			timeout = i915_request_wait(rq,
						    I915_WAIT_LOCKED,
						    MAX_SCHEDULE_TIMEOUT);
		if (i915_reset_failed(i915))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}


static int igt_reset_nop(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;
	struct drm_file *file;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	wakeref = intel_runtime_pm_get(i915);
	reset_count = i915_reset_count(&i915->gpu_error);
	count = 0;
	do {
		mutex_lock(&i915->drm.struct_mutex);
		for_each_engine(engine, i915, id) {
			int i;

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
		}
		mutex_unlock(&i915->drm.struct_mutex);

		igt_global_reset_lock(i915);
		i915_reset(i915, ALL_ENGINES, NULL);
		igt_global_reset_unlock(i915);
		if (i915_reset_failed(i915)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(&i915->gpu_error) !=
		    reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	mutex_lock(&i915->drm.struct_mutex);
	err = igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	intel_runtime_pm_put(i915, wakeref);

out:
	mock_file_free(i915, file);
	if (i915_reset_failed(i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	struct i915_gem_context *ctx;
	enum intel_engine_id id;
	intel_wakeref_t wakeref;
	struct drm_file *file;
	int err = 0;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(i915))
		return 0;

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	i915_gem_context_clear_bannable(ctx);
	wakeref = intel_runtime_pm_get(i915);
	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned int count;
		IGT_TIMEOUT(end_time);

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);
		count = 0;

		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			mutex_lock(&i915->drm.struct_mutex);
			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = igt_request_alloc(ctx, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			mutex_unlock(&i915->drm.struct_mutex);

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	mutex_lock(&i915->drm.struct_mutex);
	err = igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	intel_runtime_pm_put(i915, wakeref);
out:
	mock_file_free(i915, file);
	if (i915_reset_failed(i915))
		err = -EIO;
	return err;
}

static int __igt_reset_engine(struct drm_i915_private *i915, bool active)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;
	}

	for_each_engine(engine, i915, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(&i915->gpu_error);
		reset_engine_count = i915_reset_engine_count(&i915->gpu_error,
							     engine);

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			if (active) {
				struct i915_request *rq;

				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine failed\n");
				break;
			}

			if (i915_reset_count(&i915->gpu_error) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(&i915->gpu_error, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		intel_engine_pm_put(engine);

		if (err)
			break;

		err = igt_flush_test(i915, 0);
		if (err)
			break;
	}

	if (i915_reset_failed(i915))
		err = -EIO;

	if (active) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		i915_gem_set_wedged(rq->i915);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

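/*
 * Background kthread that keeps its engine busy by continuously submitting
 * requests across a small ring of contexts (optionally with randomised
 * priorities), so that resets of another engine can be checked for
 * collateral damage.
 */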
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct i915_gem_context *ctx[ARRAY_SIZE(rq)];
	struct drm_file *file;
	unsigned long count = 0;
	int err = 0;

	file = mock_file(engine->i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	for (count = 0; count < ARRAY_SIZE(ctx); count++) {
		mutex_lock(&engine->i915->drm.struct_mutex);
		ctx[count] = live_context(engine->i915, file);
		mutex_unlock(&engine->i915->drm.struct_mutex);
		if (IS_ERR(ctx[count])) {
			err = PTR_ERR(ctx[count]);
			while (--count)
				i915_gem_context_put(ctx[count]);
			goto err_file;
		}
	}

	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		mutex_lock(&engine->i915->drm.struct_mutex);
		new = igt_request_alloc(ctx[idx], engine);
		if (IS_ERR(new)) {
			mutex_unlock(&engine->i915->drm.struct_mutex);
			err = PTR_ERR(new);
			break;
		}

		if (arg->flags & TEST_PRIORITY)
			ctx[idx]->sched.priority =
				i915_prandom_u32_max_state(512, &prng);

		rq[idx] = i915_request_get(new);
		i915_request_add(new);
		mutex_unlock(&engine->i915->drm.struct_mutex);

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;
	}

err_file:
	mock_file_free(engine->i915, file);
	return err;
}

static int __igt_reset_engines(struct drm_i915_private *i915,
			       const char *test_name,
			       unsigned int flags)
{
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		err = hang_init(&h, i915);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, i915, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long global = i915_reset_count(&i915->gpu_error);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, i915, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(&i915->gpu_error,
							other);

			if (!(flags & TEST_OTHERS))
				continue;

			if (other == engine && !(flags & TEST_SELF))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		intel_engine_pm_get(engine);
		set_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				mutex_lock(&i915->drm.struct_mutex);
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					mutex_unlock(&i915->drm.struct_mutex);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);
				mutex_unlock(&i915->drm.struct_mutex);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = i915_reset_engine(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request after reset\n",
					       engine->name, test_name);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					i915_gem_set_wedged(i915);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &i915->gpu_error.flags);
		intel_engine_pm_put(engine);
		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(&i915->gpu_error, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, i915, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other != engine &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(&i915->gpu_error, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(&i915->gpu_error,
							       other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (global != i915_reset_count(&i915->gpu_error)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(&i915->gpu_error) - global);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		mutex_lock(&i915->drm.struct_mutex);
		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		mutex_unlock(&i915->drm.struct_mutex);
		if (err)
			break;
	}

	if (i915_reset_failed(i915))
		err = -EIO;

	if (flags & TEST_ACTIVE) {
		mutex_lock(&i915->drm.struct_mutex);
		hang_fini(&h);
		mutex_unlock(&i915->drm.struct_mutex);
	}

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct drm_i915_private *i915 = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

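/*
 * Pretend hangcheck fired: perform the reset directly and return the
 * global reset count sampled beforehand so callers can check it advanced.
 */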
static u32 fake_hangcheck(struct drm_i915_private *i915,
			  intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&i915->gpu_error);

	i915_reset(i915, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS0]))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, i915->engine[RCS0]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(i915, ALL_ENGINES);

	timeout = i915_request_wait(rq, I915_WAIT_LOCKED, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(&i915->gpu_error) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

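/* Run from a kthread: try to evict the target vma while it is still busy. */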
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_i915_private *i915 = vm->i915;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

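/*
 * Run from a kthread: try to claim a fence register for the target vma
 * while it is still busy with the hanging request.
 */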
static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	struct drm_i915_private *i915 = arg->vma->vm->i915;
	int err;

	complete(&arg->completion);

	mutex_lock(&i915->drm.struct_mutex);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		goto out_unlock;
	}

	err = i915_vma_pin_fence(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		goto out_unlock;
	}

	i915_vma_unpin_fence(arg->vma);

out_unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	return err;
}

static int __igt_reset_evict_vma(struct drm_i915_private *i915,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	int err;

	if (!intel_engine_can_store_dword(i915->engine[RCS0]))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	obj = i915_gem_object_create_internal(i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, i915->engine[RCS0]);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	err = i915_vma_pin(arg.vma, 0, 0,
			   i915_vma_is_ggtt(arg.vma) ?
			   PIN_GLOBAL | PIN_MAPPABLE :
			   PIN_USER);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	err = i915_vma_move_to_active(arg.vma, rq, flags);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	mutex_unlock(&i915->drm.struct_mutex);


	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(i915);
	fake_hangcheck(rq->i915, rq->engine->mask);
	igt_global_reset_unlock(i915);

	if (tsk) {
		struct igt_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		igt_wedge_on_timeout(&w, i915, HZ / 10 /* 100ms timeout */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

	mutex_lock(&i915->drm.struct_mutex);
out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct i915_gem_context *ctx;
	struct drm_file *file;
	int err;

	file = mock_file(i915);
	if (IS_ERR(file))
		return PTR_ERR(file);

	mutex_lock(&i915->drm.struct_mutex);
	ctx = live_context(i915, file);
	mutex_unlock(&i915->drm.struct_mutex);
	if (IS_ERR(ctx)) {
		err = PTR_ERR(ctx);
		goto out;
	}

	err = 0;
	if (ctx->ppgtt) /* aliasing == global gtt locking, covered above */
		err = __igt_reset_evict_vma(i915, &ctx->ppgtt->vm,
					    evict_vma, EXEC_OBJECT_WRITE);

out:
	mock_file_free(i915, file);
	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct drm_i915_private *i915 = arg;

	return __igt_reset_evict_vma(i915, &i915->ggtt.vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

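/* Wait for every engine except @exclude to become idle. */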
static int wait_for_others(struct drm_i915_private *i915,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, i915, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(i915);

	mutex_lock(&i915->drm.struct_mutex);
	err = hang_init(&h, i915);
	if (err)
		goto unlock;

	for_each_engine(engine, i915, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(i915, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				i915_gem_set_wedged(i915);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				i915_gem_set_wedged(i915);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(i915, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(&i915->gpu_error) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		i915_gem_chipset_flush(i915);

		i915_request_put(prev);

		err = igt_flush_test(i915, I915_WAIT_LOCKED);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	if (i915_reset_failed(i915))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct drm_i915_private *i915 = arg;
	struct intel_engine_cs *engine = i915->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_state *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	mutex_lock(&i915->drm.struct_mutex);

	err = hang_init(&h, i915);
	if (err)
		goto err_unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		i915_gem_set_wedged(i915);

		err = -EIO;
		goto err_request;
	}

	mutex_unlock(&i915->drm.struct_mutex);

	/* Temporarily disable error capture */
	error = xchg(&i915->gpu_error.first_error, (void *)-1);

	i915_handle_error(i915, engine->mask, 0, NULL);

	xchg(&i915->gpu_error.first_error, error);

	mutex_lock(&i915->drm.struct_mutex);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
err_unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	return err;
}

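/*
 * Perform an engine reset from inside the atomic section set up by the
 * phase (irqs or preemption disabled, depending on the phase), with the
 * execlists submission tasklet disabled so it cannot run concurrently
 * with the reset.
 */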
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable_nosync(t);
	p->critical_section_begin();

	err = i915_reset_engine(engine, NULL);

	p->critical_section_end();
	tasklet_enable(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct drm_i915_private *i915 = engine->i915;
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, i915);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		i915_gem_set_wedged(i915);
		err = -EIO;
	}

	if (err == 0) {
		struct igt_wedge_me w;

		igt_wedge_on_timeout(&w, i915, HZ / 20 /* 50ms timeout */)
			i915_request_wait(rq,
					  I915_WAIT_LOCKED,
					  MAX_SCHEDULE_TIMEOUT);
		if (i915_reset_failed(i915))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct drm_i915_private *i915 = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that engine resets are usable from atomic context */

	if (!intel_has_reset_engine(i915))
		return 0;

	if (USES_GUC_SUBMISSION(i915))
		return 0;

	igt_global_reset_lock(i915);
	mutex_lock(&i915->drm.struct_mutex);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(i915))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, i915, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(i915);

unlock:
	mutex_unlock(&i915->drm.struct_mutex);
	igt_global_reset_unlock(i915);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	intel_wakeref_t wakeref;
	bool saved_hangcheck;
	int err;

	if (!intel_has_gpu_reset(i915))
		return 0;

	if (i915_terminally_wedged(i915))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(i915);
	saved_hangcheck = fetch_and_zero(&i915_modparams.enable_hangcheck);
	drain_delayed_work(&i915->gpu_error.hangcheck_work); /* flush param */

	err = i915_subtests(tests, i915);

	mutex_lock(&i915->drm.struct_mutex);
	igt_flush_test(i915, I915_WAIT_LOCKED);
	mutex_unlock(&i915->drm.struct_mutex);

	i915_modparams.enable_hangcheck = saved_hangcheck;
	intel_runtime_pm_put(i915, wakeref);

	return err;
}