1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2016 Intel Corporation
4  */
5 
6 #include <linux/kthread.h>
7 
8 #include "gem/i915_gem_context.h"
9 
10 #include "intel_gt.h"
11 #include "intel_engine_heartbeat.h"
12 #include "intel_engine_pm.h"
13 #include "selftest_engine_heartbeat.h"
14 
15 #include "i915_selftest.h"
16 #include "selftests/i915_random.h"
17 #include "selftests/igt_flush_test.h"
18 #include "selftests/igt_reset.h"
19 #include "selftests/igt_atomic.h"
20 
21 #include "selftests/mock_drm.h"
22 
23 #include "gem/selftests/mock_context.h"
24 #include "gem/selftests/igt_gem_utils.h"
25 
26 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
27 
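/*
 * The hang fixture: a kernel context plus two internal objects. h->hws is a
 * page of per-context seqno slots that each hanging batch writes to once it
 * starts executing, and h->obj holds the batch itself, which then spins in
 * place (branching back to its own start) until it is rewritten with
 * MI_BATCH_BUFFER_END or the engine is reset.
 */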
28 struct hang {
29 	struct intel_gt *gt;
30 	struct drm_i915_gem_object *hws;
31 	struct drm_i915_gem_object *obj;
32 	struct i915_gem_context *ctx;
33 	u32 *seqno;
34 	u32 *batch;
35 };
36 
37 static int hang_init(struct hang *h, struct intel_gt *gt)
38 {
39 	void *vaddr;
40 	int err;
41 
42 	memset(h, 0, sizeof(*h));
43 	h->gt = gt;
44 
45 	h->ctx = kernel_context(gt->i915, NULL);
46 	if (IS_ERR(h->ctx))
47 		return PTR_ERR(h->ctx);
48 
49 	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
50 
51 	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
52 	if (IS_ERR(h->hws)) {
53 		err = PTR_ERR(h->hws);
54 		goto err_ctx;
55 	}
56 
57 	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
58 	if (IS_ERR(h->obj)) {
59 		err = PTR_ERR(h->obj);
60 		goto err_hws;
61 	}
62 
63 	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
64 	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
65 	if (IS_ERR(vaddr)) {
66 		err = PTR_ERR(vaddr);
67 		goto err_obj;
68 	}
69 	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
70 
71 	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
72 						 i915_coherent_map_type(gt->i915, h->obj, false));
73 	if (IS_ERR(vaddr)) {
74 		err = PTR_ERR(vaddr);
75 		goto err_unpin_hws;
76 	}
77 	h->batch = vaddr;
78 
79 	return 0;
80 
81 err_unpin_hws:
82 	i915_gem_object_unpin_map(h->hws);
83 err_obj:
84 	i915_gem_object_put(h->obj);
85 err_hws:
86 	i915_gem_object_put(h->hws);
87 err_ctx:
88 	kernel_context_close(h->ctx);
89 	return err;
90 }
91 
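/*
 * Each request writes its seqno into a per-context slot of the HWS page, so
 * hws_address()/hws_seqno() let us observe from the CPU when a given hanging
 * batch has actually started running on the GPU.
 */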
92 static u64 hws_address(const struct i915_vma *hws,
93 		       const struct i915_request *rq)
94 {
95 	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
96 }
97 
98 static int move_to_active(struct i915_vma *vma,
99 			  struct i915_request *rq,
100 			  unsigned int flags)
101 {
102 	int err;
103 
104 	i915_vma_lock(vma);
105 	err = i915_request_await_object(rq, vma->obj,
106 					flags & EXEC_OBJECT_WRITE);
107 	if (err == 0)
108 		err = i915_vma_move_to_active(vma, rq, flags);
109 	i915_vma_unlock(vma);
110 
111 	return err;
112 }
113 
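/*
 * Build a request whose batch never completes on its own: the batch stores
 * the request's seqno into its HWS slot (so wait_until_running() can see it
 * has begun executing), pads with 1024 bytes of noops, and then issues
 * MI_BATCH_BUFFER_START back to its own start, spinning forever. A fresh
 * batch object replaces h->obj for every request, and the spin only ends
 * when the first dword is overwritten with MI_BATCH_BUFFER_END (as done by
 * hang_fini() and igt_hang_sanitycheck()) or when the engine is reset.
 */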
114 static struct i915_request *
115 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
116 {
117 	struct intel_gt *gt = h->gt;
118 	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
119 	struct drm_i915_gem_object *obj;
120 	struct i915_request *rq = NULL;
121 	struct i915_vma *hws, *vma;
122 	unsigned int flags;
123 	void *vaddr;
124 	u32 *batch;
125 	int err;
126 
127 	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
128 	if (IS_ERR(obj)) {
129 		i915_vm_put(vm);
130 		return ERR_CAST(obj);
131 	}
132 
133 	vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
134 	if (IS_ERR(vaddr)) {
135 		i915_gem_object_put(obj);
136 		i915_vm_put(vm);
137 		return ERR_CAST(vaddr);
138 	}
139 
140 	i915_gem_object_unpin_map(h->obj);
141 	i915_gem_object_put(h->obj);
142 
143 	h->obj = obj;
144 	h->batch = vaddr;
145 
146 	vma = i915_vma_instance(h->obj, vm, NULL);
147 	if (IS_ERR(vma)) {
148 		i915_vm_put(vm);
149 		return ERR_CAST(vma);
150 	}
151 
152 	hws = i915_vma_instance(h->hws, vm, NULL);
153 	if (IS_ERR(hws)) {
154 		i915_vm_put(vm);
155 		return ERR_CAST(hws);
156 	}
157 
158 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
159 	if (err) {
160 		i915_vm_put(vm);
161 		return ERR_PTR(err);
162 	}
163 
164 	err = i915_vma_pin(hws, 0, 0, PIN_USER);
165 	if (err)
166 		goto unpin_vma;
167 
168 	rq = igt_request_alloc(h->ctx, engine);
169 	if (IS_ERR(rq)) {
170 		err = PTR_ERR(rq);
171 		goto unpin_hws;
172 	}
173 
174 	err = move_to_active(vma, rq, 0);
175 	if (err)
176 		goto cancel_rq;
177 
178 	err = move_to_active(hws, rq, 0);
179 	if (err)
180 		goto cancel_rq;
181 
182 	batch = h->batch;
183 	if (GRAPHICS_VER(gt->i915) >= 8) {
184 		*batch++ = MI_STORE_DWORD_IMM_GEN4;
185 		*batch++ = lower_32_bits(hws_address(hws, rq));
186 		*batch++ = upper_32_bits(hws_address(hws, rq));
187 		*batch++ = rq->fence.seqno;
188 		*batch++ = MI_NOOP;
189 
190 		memset(batch, 0, 1024);
191 		batch += 1024 / sizeof(*batch);
192 
193 		*batch++ = MI_NOOP;
194 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
195 		*batch++ = lower_32_bits(vma->node.start);
196 		*batch++ = upper_32_bits(vma->node.start);
197 	} else if (GRAPHICS_VER(gt->i915) >= 6) {
198 		*batch++ = MI_STORE_DWORD_IMM_GEN4;
199 		*batch++ = 0;
200 		*batch++ = lower_32_bits(hws_address(hws, rq));
201 		*batch++ = rq->fence.seqno;
202 		*batch++ = MI_NOOP;
203 
204 		memset(batch, 0, 1024);
205 		batch += 1024 / sizeof(*batch);
206 
207 		*batch++ = MI_NOOP;
208 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
209 		*batch++ = lower_32_bits(vma->node.start);
210 	} else if (GRAPHICS_VER(gt->i915) >= 4) {
211 		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
212 		*batch++ = 0;
213 		*batch++ = lower_32_bits(hws_address(hws, rq));
214 		*batch++ = rq->fence.seqno;
215 		*batch++ = MI_NOOP;
216 
217 		memset(batch, 0, 1024);
218 		batch += 1024 / sizeof(*batch);
219 
220 		*batch++ = MI_NOOP;
221 		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
222 		*batch++ = lower_32_bits(vma->node.start);
223 	} else {
224 		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
225 		*batch++ = lower_32_bits(hws_address(hws, rq));
226 		*batch++ = rq->fence.seqno;
227 		*batch++ = MI_NOOP;
228 
229 		memset(batch, 0, 1024);
230 		batch += 1024 / sizeof(*batch);
231 
232 		*batch++ = MI_NOOP;
233 		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
234 		*batch++ = lower_32_bits(vma->node.start);
235 	}
236 	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
237 	intel_gt_chipset_flush(engine->gt);
238 
239 	if (rq->engine->emit_init_breadcrumb) {
240 		err = rq->engine->emit_init_breadcrumb(rq);
241 		if (err)
242 			goto cancel_rq;
243 	}
244 
245 	flags = 0;
246 	if (GRAPHICS_VER(gt->i915) <= 5)
247 		flags |= I915_DISPATCH_SECURE;
248 
249 	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
250 
251 cancel_rq:
252 	if (err) {
253 		i915_request_set_error_once(rq, err);
254 		i915_request_add(rq);
255 	}
256 unpin_hws:
257 	i915_vma_unpin(hws);
258 unpin_vma:
259 	i915_vma_unpin(vma);
260 	i915_vm_put(vm);
261 	return err ? ERR_PTR(err) : rq;
262 }
263 
264 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
265 {
266 	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
267 }
268 
269 static void hang_fini(struct hang *h)
270 {
271 	*h->batch = MI_BATCH_BUFFER_END;
272 	intel_gt_chipset_flush(h->gt);
273 
274 	i915_gem_object_unpin_map(h->obj);
275 	i915_gem_object_put(h->obj);
276 
277 	i915_gem_object_unpin_map(h->hws);
278 	i915_gem_object_put(h->hws);
279 
280 	kernel_context_close(h->ctx);
281 
282 	igt_flush_test(h->gt->i915);
283 }
284 
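/*
 * Poll the request's HWS slot to see whether the hanging batch has started:
 * first a short 10us busy-wait for the common case, then a sleeping wait of
 * up to 1000ms before giving up. Returns true once the seqno write from the
 * batch is observed.
 */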
285 static bool wait_until_running(struct hang *h, struct i915_request *rq)
286 {
287 	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
288 					       rq->fence.seqno),
289 			     10) &&
290 		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
291 					    rq->fence.seqno),
292 			  1000));
293 }
294 
295 static int igt_hang_sanitycheck(void *arg)
296 {
297 	struct intel_gt *gt = arg;
298 	struct i915_request *rq;
299 	struct intel_engine_cs *engine;
300 	enum intel_engine_id id;
301 	struct hang h;
302 	int err;
303 
304 	/* Basic check that we can execute our hanging batch */
305 
306 	err = hang_init(&h, gt);
307 	if (err)
308 		return err;
309 
310 	for_each_engine(engine, gt, id) {
311 		struct intel_wedge_me w;
312 		long timeout;
313 
314 		if (!intel_engine_can_store_dword(engine))
315 			continue;
316 
317 		rq = hang_create_request(&h, engine);
318 		if (IS_ERR(rq)) {
319 			err = PTR_ERR(rq);
320 			pr_err("Failed to create request for %s, err=%d\n",
321 			       engine->name, err);
322 			goto fini;
323 		}
324 
325 		i915_request_get(rq);
326 
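		/*
		 * Cancel the spin up front by terminating the batch at its
		 * first instruction, so this "hanging" request completes
		 * normally and merely exercises submission.
		 */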
327 		*h.batch = MI_BATCH_BUFFER_END;
328 		intel_gt_chipset_flush(engine->gt);
329 
330 		i915_request_add(rq);
331 
332 		timeout = 0;
333 		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
334 			timeout = i915_request_wait(rq, 0,
335 						    MAX_SCHEDULE_TIMEOUT);
336 		if (intel_gt_is_wedged(gt))
337 			timeout = -EIO;
338 
339 		i915_request_put(rq);
340 
341 		if (timeout < 0) {
342 			err = timeout;
343 			pr_err("Wait for request failed on %s, err=%d\n",
344 			       engine->name, err);
345 			goto fini;
346 		}
347 	}
348 
349 fini:
350 	hang_fini(&h);
351 	return err;
352 }
353 
354 static bool wait_for_idle(struct intel_engine_cs *engine)
355 {
356 	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
357 }
358 
359 static int igt_reset_nop(void *arg)
360 {
361 	struct intel_gt *gt = arg;
362 	struct i915_gpu_error *global = &gt->i915->gpu_error;
363 	struct intel_engine_cs *engine;
364 	unsigned int reset_count, count;
365 	enum intel_engine_id id;
366 	IGT_TIMEOUT(end_time);
367 	int err = 0;
368 
369 	/* Check that we can reset during non-user portions of requests */
370 
371 	reset_count = i915_reset_count(global);
372 	count = 0;
373 	do {
374 		for_each_engine(engine, gt, id) {
375 			struct intel_context *ce;
376 			int i;
377 
378 			ce = intel_context_create(engine);
379 			if (IS_ERR(ce)) {
380 				err = PTR_ERR(ce);
381 				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
382 				break;
383 			}
384 
385 			for (i = 0; i < 16; i++) {
386 				struct i915_request *rq;
387 
388 				rq = intel_context_create_request(ce);
389 				if (IS_ERR(rq)) {
390 					err = PTR_ERR(rq);
391 					pr_err("[%s] Create request failed: %d!\n",
392 					       engine->name, err);
393 					break;
394 				}
395 
396 				i915_request_add(rq);
397 			}
398 
399 			intel_context_put(ce);
400 		}
401 
402 		igt_global_reset_lock(gt);
403 		intel_gt_reset(gt, ALL_ENGINES, NULL);
404 		igt_global_reset_unlock(gt);
405 
406 		if (intel_gt_is_wedged(gt)) {
407 			pr_err("GT is wedged!\n");
408 			err = -EIO;
409 			break;
410 		}
411 
412 		if (i915_reset_count(global) != reset_count + ++count) {
413 			pr_err("Reset not recorded: %d vs %d + %d!\n",
414 			       i915_reset_count(global), reset_count, count);
415 			err = -EINVAL;
416 			break;
417 		}
418 
419 		err = igt_flush_test(gt->i915);
420 		if (err) {
421 			pr_err("Flush failed: %d!\n", err);
422 			break;
423 		}
424 	} while (time_before(jiffies, end_time));
425 	pr_info("%s: %d resets\n", __func__, count);
426 
427 	if (igt_flush_test(gt->i915)) {
428 		pr_err("Post flush failed!\n");
429 		err = -EIO;
430 	}
431 
432 	return err;
433 }
434 
435 static int igt_reset_nop_engine(void *arg)
436 {
437 	struct intel_gt *gt = arg;
438 	struct i915_gpu_error *global = &gt->i915->gpu_error;
439 	struct intel_engine_cs *engine;
440 	enum intel_engine_id id;
441 
442 	/* Check that we can engine-reset during non-user portions */
443 
444 	if (!intel_has_reset_engine(gt))
445 		return 0;
446 
447 	for_each_engine(engine, gt, id) {
448 		unsigned int reset_count, reset_engine_count, count;
449 		struct intel_context *ce;
450 		IGT_TIMEOUT(end_time);
451 		int err;
452 
453 		ce = intel_context_create(engine);
454 		if (IS_ERR(ce)) {
455 			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
456 			return PTR_ERR(ce);
457 		}
458 
459 		reset_count = i915_reset_count(global);
460 		reset_engine_count = i915_reset_engine_count(global, engine);
461 		count = 0;
462 
463 		st_engine_heartbeat_disable(engine);
464 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
465 		do {
466 			int i;
467 
468 			if (!wait_for_idle(engine)) {
469 				pr_err("%s failed to idle before reset\n",
470 				       engine->name);
471 				err = -EIO;
472 				break;
473 			}
474 
475 			for (i = 0; i < 16; i++) {
476 				struct i915_request *rq;
477 
478 				rq = intel_context_create_request(ce);
479 				if (IS_ERR(rq)) {
480 					struct drm_printer p =
481 						drm_info_printer(gt->i915->drm.dev);
482 					intel_engine_dump(engine, &p,
483 							  "%s(%s): failed to submit request\n",
484 							  __func__,
485 							  engine->name);
486 
487 					GEM_TRACE("%s(%s): failed to submit request\n",
488 						  __func__,
489 						  engine->name);
490 					GEM_TRACE_DUMP();
491 
492 					intel_gt_set_wedged(gt);
493 
494 					err = PTR_ERR(rq);
495 					break;
496 				}
497 
498 				i915_request_add(rq);
499 			}
500 			err = intel_engine_reset(engine, NULL);
501 			if (err) {
502 				pr_err("intel_engine_reset(%s) failed, err:%d\n",
503 				       engine->name, err);
504 				break;
505 			}
506 
507 			if (i915_reset_count(global) != reset_count) {
508 				pr_err("Full GPU reset recorded! (engine reset expected)\n");
509 				err = -EINVAL;
510 				break;
511 			}
512 
513 			if (i915_reset_engine_count(global, engine) !=
514 			    reset_engine_count + ++count) {
515 				pr_err("%s engine reset not recorded!\n",
516 				       engine->name);
517 				err = -EINVAL;
518 				break;
519 			}
520 		} while (time_before(jiffies, end_time));
521 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
522 		st_engine_heartbeat_enable(engine);
523 
524 		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
525 
526 		intel_context_put(ce);
527 		if (igt_flush_test(gt->i915))
528 			err = -EIO;
529 		if (err)
530 			return err;
531 	}
532 
533 	return 0;
534 }
535 
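/*
 * engine->reset_timeout is a selftest fault-injection knob: bumping the
 * probability makes the next engine reset attempt time out (-ETIMEDOUT)
 * instead of succeeding, letting igt_reset_fail_engine() check that we can
 * recover afterwards. The injection point only exists on the gen8+ reset
 * path, hence the "timeouts only generated on gen8+" skip below.
 */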
536 static void force_reset_timeout(struct intel_engine_cs *engine)
537 {
538 	engine->reset_timeout.probability = 999;
539 	atomic_set(&engine->reset_timeout.times, -1);
540 }
541 
542 static void cancel_reset_timeout(struct intel_engine_cs *engine)
543 {
544 	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
545 }
546 
547 static int igt_reset_fail_engine(void *arg)
548 {
549 	struct intel_gt *gt = arg;
550 	struct intel_engine_cs *engine;
551 	enum intel_engine_id id;
552 
553 	/* Check that we can recover from engine-reset failures */
554 
555 	if (!intel_has_reset_engine(gt))
556 		return 0;
557 
558 	for_each_engine(engine, gt, id) {
559 		unsigned int count;
560 		struct intel_context *ce;
561 		IGT_TIMEOUT(end_time);
562 		int err;
563 
564 		ce = intel_context_create(engine);
565 		if (IS_ERR(ce)) {
566 			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
567 			return PTR_ERR(ce);
568 		}
569 
570 		st_engine_heartbeat_disable(engine);
571 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
572 
573 		force_reset_timeout(engine);
574 		err = intel_engine_reset(engine, NULL);
575 		cancel_reset_timeout(engine);
576 		if (err == 0) /* timeouts only generated on gen8+ */
577 			goto skip;
578 
579 		count = 0;
580 		do {
581 			struct i915_request *last = NULL;
582 			int i;
583 
584 			if (!wait_for_idle(engine)) {
585 				pr_err("%s failed to idle before reset\n",
586 				       engine->name);
587 				err = -EIO;
588 				break;
589 			}
590 
591 			for (i = 0; i < count % 15; i++) {
592 				struct i915_request *rq;
593 
594 				rq = intel_context_create_request(ce);
595 				if (IS_ERR(rq)) {
596 					struct drm_printer p =
597 						drm_info_printer(gt->i915->drm.dev);
598 					intel_engine_dump(engine, &p,
599 							  "%s(%s): failed to submit request\n",
600 							  __func__,
601 							  engine->name);
602 
603 					GEM_TRACE("%s(%s): failed to submit request\n",
604 						  __func__,
605 						  engine->name);
606 					GEM_TRACE_DUMP();
607 
608 					intel_gt_set_wedged(gt);
609 					if (last)
610 						i915_request_put(last);
611 
612 					err = PTR_ERR(rq);
613 					goto out;
614 				}
615 
616 				if (last)
617 					i915_request_put(last);
618 				last = i915_request_get(rq);
619 				i915_request_add(rq);
620 			}
621 
622 			if (count & 1) {
623 				err = intel_engine_reset(engine, NULL);
624 				if (err) {
625 					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
626 						      engine->name, err);
627 					GEM_TRACE_DUMP();
628 					i915_request_put(last);
629 					break;
630 				}
631 			} else {
632 				force_reset_timeout(engine);
633 				err = intel_engine_reset(engine, NULL);
634 				cancel_reset_timeout(engine);
635 				if (err != -ETIMEDOUT) {
636 					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
637 					       engine->name, err);
638 					i915_request_put(last);
639 					break;
640 				}
641 			}
642 
643 			err = 0;
644 			if (last) {
645 				if (i915_request_wait(last, 0, HZ / 2) < 0) {
646 					struct drm_printer p =
647 						drm_info_printer(gt->i915->drm.dev);
648 
649 					intel_engine_dump(engine, &p,
650 							  "%s(%s): failed to complete request\n",
651 							  __func__,
652 							  engine->name);
653 
654 					GEM_TRACE("%s(%s): failed to complete request\n",
655 						  __func__,
656 						  engine->name);
657 					GEM_TRACE_DUMP();
658 
659 					err = -EIO;
660 				}
661 				i915_request_put(last);
662 			}
663 			count++;
664 		} while (err == 0 && time_before(jiffies, end_time));
665 out:
666 		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
667 skip:
668 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
669 		st_engine_heartbeat_enable(engine);
670 		intel_context_put(ce);
671 
672 		if (igt_flush_test(gt->i915))
673 			err = -EIO;
674 		if (err)
675 			return err;
676 	}
677 
678 	return 0;
679 }
680 
681 static int __igt_reset_engine(struct intel_gt *gt, bool active)
682 {
683 	struct i915_gpu_error *global = &gt->i915->gpu_error;
684 	struct intel_engine_cs *engine;
685 	enum intel_engine_id id;
686 	struct hang h;
687 	int err = 0;
688 
689 	/* Check that we can reset an idle engine (no-op) or one running our hanging batch */
690 
691 	if (!intel_has_reset_engine(gt))
692 		return 0;
693 
694 	if (active) {
695 		err = hang_init(&h, gt);
696 		if (err)
697 			return err;
698 	}
699 
700 	for_each_engine(engine, gt, id) {
701 		unsigned int reset_count, reset_engine_count;
702 		unsigned long count;
703 		IGT_TIMEOUT(end_time);
704 
705 		if (active && !intel_engine_can_store_dword(engine))
706 			continue;
707 
708 		if (!wait_for_idle(engine)) {
709 			pr_err("%s failed to idle before reset\n",
710 			       engine->name);
711 			err = -EIO;
712 			break;
713 		}
714 
715 		reset_count = i915_reset_count(global);
716 		reset_engine_count = i915_reset_engine_count(global, engine);
717 
718 		st_engine_heartbeat_disable(engine);
719 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
720 		count = 0;
721 		do {
722 			if (active) {
723 				struct i915_request *rq;
724 
725 				rq = hang_create_request(&h, engine);
726 				if (IS_ERR(rq)) {
727 					err = PTR_ERR(rq);
728 					pr_err("[%s] Create hang request failed: %d!\n",
729 					       engine->name, err);
730 					break;
731 				}
732 
733 				i915_request_get(rq);
734 				i915_request_add(rq);
735 
736 				if (!wait_until_running(&h, rq)) {
737 					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
738 
739 					pr_err("%s: Failed to start request %llx, at %x\n",
740 					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
741 					intel_engine_dump(engine, &p,
742 							  "%s\n", engine->name);
743 
744 					i915_request_put(rq);
745 					err = -EIO;
746 					break;
747 				}
748 
749 				i915_request_put(rq);
750 			}
751 
752 			err = intel_engine_reset(engine, NULL);
753 			if (err) {
754 				pr_err("intel_engine_reset(%s) failed, err:%d\n",
755 				       engine->name, err);
756 				break;
757 			}
758 
759 			if (i915_reset_count(global) != reset_count) {
760 				pr_err("Full GPU reset recorded! (engine reset expected)\n");
761 				err = -EINVAL;
762 				break;
763 			}
764 
765 			if (i915_reset_engine_count(global, engine) !=
766 			    ++reset_engine_count) {
767 				pr_err("%s engine reset not recorded!\n",
768 				       engine->name);
769 				err = -EINVAL;
770 				break;
771 			}
772 
773 			count++;
774 		} while (time_before(jiffies, end_time));
775 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
776 		st_engine_heartbeat_enable(engine);
777 		pr_info("%s: Completed %lu %s resets\n",
778 			engine->name, count, active ? "active" : "idle");
779 
780 		if (err)
781 			break;
782 
783 		err = igt_flush_test(gt->i915);
784 		if (err) {
785 			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
786 			break;
787 		}
788 	}
789 
790 	if (intel_gt_is_wedged(gt)) {
791 		pr_err("GT is wedged!\n");
792 		err = -EIO;
793 	}
794 
795 	if (active)
796 		hang_fini(&h);
797 
798 	return err;
799 }
800 
801 static int igt_reset_idle_engine(void *arg)
802 {
803 	return __igt_reset_engine(arg, false);
804 }
805 
806 static int igt_reset_active_engine(void *arg)
807 {
808 	return __igt_reset_engine(arg, true);
809 }
810 
811 struct active_engine {
812 	struct task_struct *task;
813 	struct intel_engine_cs *engine;
814 	unsigned long resets;
815 	unsigned int flags;
816 };
817 
818 #define TEST_ACTIVE	BIT(0)
819 #define TEST_OTHERS	BIT(1)
820 #define TEST_SELF	BIT(2)
821 #define TEST_PRIORITY	BIT(3)
822 
823 static int active_request_put(struct i915_request *rq)
824 {
825 	int err = 0;
826 
827 	if (!rq)
828 		return 0;
829 
830 	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
831 		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
832 			  rq->engine->name,
833 			  rq->fence.context,
834 			  rq->fence.seqno);
835 		GEM_TRACE_DUMP();
836 
837 		intel_gt_set_wedged(rq->engine->gt);
838 		err = -EIO;
839 	}
840 
841 	i915_request_put(rq);
842 
843 	return err;
844 }
845 
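/*
 * Background load for the TEST_OTHERS/TEST_SELF phases: each thread keeps a
 * ring of up to 8 requests in flight on its engine (optionally with random
 * priorities) until kthread_stop(), checking that every request it submitted
 * still completes while another engine is being repeatedly reset.
 */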
846 static int active_engine(void *data)
847 {
848 	I915_RND_STATE(prng);
849 	struct active_engine *arg = data;
850 	struct intel_engine_cs *engine = arg->engine;
851 	struct i915_request *rq[8] = {};
852 	struct intel_context *ce[ARRAY_SIZE(rq)];
853 	unsigned long count;
854 	int err = 0;
855 
856 	for (count = 0; count < ARRAY_SIZE(ce); count++) {
857 		ce[count] = intel_context_create(engine);
858 		if (IS_ERR(ce[count])) {
859 			err = PTR_ERR(ce[count]);
860 			pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
861 			while (count--)
862 				intel_context_put(ce[count]);
863 			return err;
864 		}
865 	}
866 
867 	count = 0;
868 	while (!kthread_should_stop()) {
869 		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
870 		struct i915_request *old = rq[idx];
871 		struct i915_request *new;
872 
873 		new = intel_context_create_request(ce[idx]);
874 		if (IS_ERR(new)) {
875 			err = PTR_ERR(new);
876 			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
877 			break;
878 		}
879 
880 		rq[idx] = i915_request_get(new);
881 		i915_request_add(new);
882 
883 		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
884 			struct i915_sched_attr attr = {
885 				.priority =
886 					i915_prandom_u32_max_state(512, &prng),
887 			};
888 			engine->sched_engine->schedule(rq[idx], &attr);
889 		}
890 
891 		err = active_request_put(old);
892 		if (err) {
893 			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
894 			break;
895 		}
896 
897 		cond_resched();
898 	}
899 
900 	for (count = 0; count < ARRAY_SIZE(rq); count++) {
901 		int err__ = active_request_put(rq[count]);
902 
903 		if (err__)
904 			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);
905 
906 		/* Keep the first error */
907 		if (!err)
908 			err = err__;
909 
910 		intel_context_put(ce[count]);
911 	}
912 
913 	return err;
914 }
915 
916 static int __igt_reset_engines(struct intel_gt *gt,
917 			       const char *test_name,
918 			       unsigned int flags)
919 {
920 	struct i915_gpu_error *global = &gt->i915->gpu_error;
921 	struct intel_engine_cs *engine, *other;
922 	enum intel_engine_id id, tmp;
923 	struct hang h;
924 	int err = 0;
925 
926 	/* Check that issuing a reset on one engine does not interfere
927 	 * with any other engine.
928 	 */
929 
930 	if (!intel_has_reset_engine(gt))
931 		return 0;
932 
933 	if (flags & TEST_ACTIVE) {
934 		err = hang_init(&h, gt);
935 		if (err)
936 			return err;
937 
938 		if (flags & TEST_PRIORITY)
939 			h.ctx->sched.priority = 1024;
940 	}
941 
942 	for_each_engine(engine, gt, id) {
943 		struct active_engine threads[I915_NUM_ENGINES] = {};
944 		unsigned long device = i915_reset_count(global);
945 		unsigned long count = 0, reported;
946 		IGT_TIMEOUT(end_time);
947 
948 		if (flags & TEST_ACTIVE &&
949 		    !intel_engine_can_store_dword(engine))
950 			continue;
951 
952 		if (!wait_for_idle(engine)) {
953 			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
954 			       engine->name, test_name);
955 			err = -EIO;
956 			break;
957 		}
958 
959 		memset(threads, 0, sizeof(threads));
960 		for_each_engine(other, gt, tmp) {
961 			struct task_struct *tsk;
962 
963 			threads[tmp].resets =
964 				i915_reset_engine_count(global, other);
965 
966 			if (other == engine && !(flags & TEST_SELF))
967 				continue;
968 
969 			if (other != engine && !(flags & TEST_OTHERS))
970 				continue;
971 
972 			threads[tmp].engine = other;
973 			threads[tmp].flags = flags;
974 
975 			tsk = kthread_run(active_engine, &threads[tmp],
976 					  "igt/%s", other->name);
977 			if (IS_ERR(tsk)) {
978 				err = PTR_ERR(tsk);
979 				pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
980 				goto unwind;
981 			}
982 
983 			threads[tmp].task = tsk;
984 			get_task_struct(tsk);
985 		}
986 
987 		yield(); /* start all threads before we begin */
988 
989 		st_engine_heartbeat_disable(engine);
990 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
991 		do {
992 			struct i915_request *rq = NULL;
993 
994 			if (flags & TEST_ACTIVE) {
995 				rq = hang_create_request(&h, engine);
996 				if (IS_ERR(rq)) {
997 					err = PTR_ERR(rq);
998 					pr_err("[%s] Create hang request failed: %d!\n",
999 					       engine->name, err);
1000 					break;
1001 				}
1002 
1003 				i915_request_get(rq);
1004 				i915_request_add(rq);
1005 
1006 				if (!wait_until_running(&h, rq)) {
1007 					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1008 
1009 					pr_err("%s: Failed to start request %llx, at %x\n",
1010 					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1011 					intel_engine_dump(engine, &p,
1012 							  "%s\n", engine->name);
1013 
1014 					i915_request_put(rq);
1015 					err = -EIO;
1016 					break;
1017 				}
1018 			}
1019 
1020 			err = intel_engine_reset(engine, NULL);
1021 			if (err) {
1022 				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
1023 				       engine->name, test_name, err);
1024 				break;
1025 			}
1026 
1027 			count++;
1028 
1029 			if (rq) {
1030 				if (rq->fence.error != -EIO) {
1031 					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
1032 					       engine->name, test_name,
1033 					       rq->fence.context,
1034 					       rq->fence.seqno, rq->context->guc_id);
1035 					i915_request_put(rq);
1036 
1037 					GEM_TRACE_DUMP();
1038 					intel_gt_set_wedged(gt);
1039 					err = -EIO;
1040 					break;
1041 				}
1042 
1043 				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
1044 					struct drm_printer p =
1045 						drm_info_printer(gt->i915->drm.dev);
1046 
1047 					pr_err("i915_reset_engine(%s:%s):"
1048 					       " failed to complete request %llx:%lld after reset\n",
1049 					       engine->name, test_name,
1050 					       rq->fence.context,
1051 					       rq->fence.seqno);
1052 					intel_engine_dump(engine, &p,
1053 							  "%s\n", engine->name);
1054 					i915_request_put(rq);
1055 
1056 					GEM_TRACE_DUMP();
1057 					intel_gt_set_wedged(gt);
1058 					err = -EIO;
1059 					break;
1060 				}
1061 
1062 				i915_request_put(rq);
1063 			}
1064 
1065 			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
1066 				struct drm_printer p =
1067 					drm_info_printer(gt->i915->drm.dev);
1068 
1069 				pr_err("i915_reset_engine(%s:%s):"
1070 				       " failed to idle after reset\n",
1071 				       engine->name, test_name);
1072 				intel_engine_dump(engine, &p,
1073 						  "%s\n", engine->name);
1074 
1075 				err = -EIO;
1076 				break;
1077 			}
1078 		} while (time_before(jiffies, end_time));
1079 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
1080 		st_engine_heartbeat_enable(engine);
1081 
1082 		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
1083 			engine->name, test_name, count);
1084 
1085 		reported = i915_reset_engine_count(global, engine);
1086 		reported -= threads[engine->id].resets;
1087 		if (reported != count) {
1088 			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
1089 			       engine->name, test_name, count, reported);
1090 			if (!err)
1091 				err = -EINVAL;
1092 		}
1093 
1094 unwind:
1095 		for_each_engine(other, gt, tmp) {
1096 			int ret;
1097 
1098 			if (!threads[tmp].task)
1099 				continue;
1100 
1101 			ret = kthread_stop(threads[tmp].task);
1102 			if (ret) {
1103 				pr_err("kthread for other engine %s failed, err=%d\n",
1104 				       other->name, ret);
1105 				if (!err)
1106 					err = ret;
1107 			}
1108 			put_task_struct(threads[tmp].task);
1109 
1110 			if (other->uabi_class != engine->uabi_class &&
1111 			    threads[tmp].resets !=
1112 			    i915_reset_engine_count(global, other)) {
1113 				pr_err("Innocent engine %s was reset (count=%ld)\n",
1114 				       other->name,
1115 				       i915_reset_engine_count(global, other) -
1116 				       threads[tmp].resets);
1117 				if (!err)
1118 					err = -EINVAL;
1119 			}
1120 		}
1121 
1122 		if (device != i915_reset_count(global)) {
1123 			pr_err("Global reset (count=%ld)!\n",
1124 			       i915_reset_count(global) - device);
1125 			if (!err)
1126 				err = -EINVAL;
1127 		}
1128 
1129 		if (err)
1130 			break;
1131 
1132 		err = igt_flush_test(gt->i915);
1133 		if (err) {
1134 			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1135 			break;
1136 		}
1137 	}
1138 
1139 	if (intel_gt_is_wedged(gt))
1140 		err = -EIO;
1141 
1142 	if (flags & TEST_ACTIVE)
1143 		hang_fini(&h);
1144 
1145 	return err;
1146 }
1147 
1148 static int igt_reset_engines(void *arg)
1149 {
1150 	static const struct {
1151 		const char *name;
1152 		unsigned int flags;
1153 	} phases[] = {
1154 		{ "idle", 0 },
1155 		{ "active", TEST_ACTIVE },
1156 		{ "others-idle", TEST_OTHERS },
1157 		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
1158 		{
1159 			"others-priority",
1160 			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
1161 		},
1162 		{
1163 			"self-priority",
1164 			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1165 		},
1166 		{ }
1167 	};
1168 	struct intel_gt *gt = arg;
1169 	typeof(*phases) *p;
1170 	int err;
1171 
1172 	for (p = phases; p->name; p++) {
1173 		if (p->flags & TEST_PRIORITY) {
1174 			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1175 				continue;
1176 		}
1177 
1178 		err = __igt_reset_engines(arg, p->name, p->flags);
1179 		if (err)
1180 			return err;
1181 	}
1182 
1183 	return 0;
1184 }
1185 
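/*
 * Stand in for the hangcheck/error worker: perform the reset directly and
 * hand back the reset count sampled beforehand, so callers can verify that
 * a new global reset was (or was not) recorded.
 */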
1186 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1187 {
1188 	u32 count = i915_reset_count(&gt->i915->gpu_error);
1189 
1190 	intel_gt_reset(gt, mask, NULL);
1191 
1192 	return count;
1193 }
1194 
1195 static int igt_reset_wait(void *arg)
1196 {
1197 	struct intel_gt *gt = arg;
1198 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1199 	struct intel_engine_cs *engine = gt->engine[RCS0];
1200 	struct i915_request *rq;
1201 	unsigned int reset_count;
1202 	struct hang h;
1203 	long timeout;
1204 	int err;
1205 
1206 	if (!engine || !intel_engine_can_store_dword(engine))
1207 		return 0;
1208 
1209 	/* Check that we detect a stuck waiter and issue a reset */
1210 
1211 	igt_global_reset_lock(gt);
1212 
1213 	err = hang_init(&h, gt);
1214 	if (err) {
1215 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1216 		goto unlock;
1217 	}
1218 
1219 	rq = hang_create_request(&h, engine);
1220 	if (IS_ERR(rq)) {
1221 		err = PTR_ERR(rq);
1222 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1223 		goto fini;
1224 	}
1225 
1226 	i915_request_get(rq);
1227 	i915_request_add(rq);
1228 
1229 	if (!wait_until_running(&h, rq)) {
1230 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1231 
1232 		pr_err("%s: Failed to start request %llx, at %x\n",
1233 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1234 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1235 
1236 		intel_gt_set_wedged(gt);
1237 
1238 		err = -EIO;
1239 		goto out_rq;
1240 	}
1241 
1242 	reset_count = fake_hangcheck(gt, ALL_ENGINES);
1243 
1244 	timeout = i915_request_wait(rq, 0, 10);
1245 	if (timeout < 0) {
1246 		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1247 		       timeout);
1248 		err = timeout;
1249 		goto out_rq;
1250 	}
1251 
1252 	if (i915_reset_count(global) == reset_count) {
1253 		pr_err("No GPU reset recorded!\n");
1254 		err = -EINVAL;
1255 		goto out_rq;
1256 	}
1257 
1258 out_rq:
1259 	i915_request_put(rq);
1260 fini:
1261 	hang_fini(&h);
1262 unlock:
1263 	igt_global_reset_unlock(gt);
1264 
1265 	if (intel_gt_is_wedged(gt))
1266 		return -EIO;
1267 
1268 	return err;
1269 }
1270 
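/*
 * The evict_vma/evict_fence threads try to unbind (or re-fence) a vma that
 * is still in use by the hanging request; they should block until the fake
 * hangcheck resets the engine, at which point the eviction must be able to
 * proceed rather than deadlock.
 */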
1271 struct evict_vma {
1272 	struct completion completion;
1273 	struct i915_vma *vma;
1274 };
1275 
1276 static int evict_vma(void *data)
1277 {
1278 	struct evict_vma *arg = data;
1279 	struct i915_address_space *vm = arg->vma->vm;
1280 	struct drm_mm_node evict = arg->vma->node;
1281 	int err;
1282 
1283 	complete(&arg->completion);
1284 
1285 	mutex_lock(&vm->mutex);
1286 	err = i915_gem_evict_for_node(vm, &evict, 0);
1287 	mutex_unlock(&vm->mutex);
1288 
1289 	return err;
1290 }
1291 
1292 static int evict_fence(void *data)
1293 {
1294 	struct evict_vma *arg = data;
1295 	int err;
1296 
1297 	complete(&arg->completion);
1298 
1299 	/* Mark the fence register as dirty to force the mmio update. */
1300 	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1301 	if (err) {
1302 		pr_err("Invalid Y-tiling settings; err:%d\n", err);
1303 		return err;
1304 	}
1305 
1306 	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1307 	if (err) {
1308 		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1309 		return err;
1310 	}
1311 
1312 	err = i915_vma_pin_fence(arg->vma);
1313 	i915_vma_unpin(arg->vma);
1314 	if (err) {
1315 		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1316 		return err;
1317 	}
1318 
1319 	i915_vma_unpin_fence(arg->vma);
1320 
1321 	return 0;
1322 }
1323 
1324 static int __igt_reset_evict_vma(struct intel_gt *gt,
1325 				 struct i915_address_space *vm,
1326 				 int (*fn)(void *),
1327 				 unsigned int flags)
1328 {
1329 	struct intel_engine_cs *engine = gt->engine[RCS0];
1330 	struct drm_i915_gem_object *obj;
1331 	struct task_struct *tsk = NULL;
1332 	struct i915_request *rq;
1333 	struct evict_vma arg;
1334 	struct hang h;
1335 	unsigned int pin_flags;
1336 	int err;
1337 
1338 	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1339 		return 0;
1340 
1341 	if (!engine || !intel_engine_can_store_dword(engine))
1342 		return 0;
1343 
1344 	/* Check that we can recover an unbind stuck on a hanging request */
1345 
1346 	err = hang_init(&h, gt);
1347 	if (err) {
1348 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1349 		return err;
1350 	}
1351 
1352 	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1353 	if (IS_ERR(obj)) {
1354 		err = PTR_ERR(obj);
1355 		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
1356 		goto fini;
1357 	}
1358 
1359 	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1360 		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1361 		if (err) {
1362 			pr_err("Invalid X-tiling settings; err:%d\n", err);
1363 			goto out_obj;
1364 		}
1365 	}
1366 
1367 	arg.vma = i915_vma_instance(obj, vm, NULL);
1368 	if (IS_ERR(arg.vma)) {
1369 		err = PTR_ERR(arg.vma);
1370 		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
1371 		goto out_obj;
1372 	}
1373 
1374 	rq = hang_create_request(&h, engine);
1375 	if (IS_ERR(rq)) {
1376 		err = PTR_ERR(rq);
1377 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1378 		goto out_obj;
1379 	}
1380 
1381 	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1382 
1383 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
1384 		pin_flags |= PIN_MAPPABLE;
1385 
1386 	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1387 	if (err) {
1388 		i915_request_add(rq);
1389 		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
1390 		goto out_obj;
1391 	}
1392 
1393 	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1394 		err = i915_vma_pin_fence(arg.vma);
1395 		if (err) {
1396 			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1397 			i915_vma_unpin(arg.vma);
1398 			i915_request_add(rq);
1399 			goto out_obj;
1400 		}
1401 	}
1402 
1403 	i915_vma_lock(arg.vma);
1404 	err = i915_request_await_object(rq, arg.vma->obj,
1405 					flags & EXEC_OBJECT_WRITE);
1406 	if (err == 0) {
1407 		err = i915_vma_move_to_active(arg.vma, rq, flags);
1408 		if (err)
1409 			pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
1410 	} else {
1411 		pr_err("[%s] Request await failed: %d!\n", engine->name, err);
1412 	}
1413 
1414 	i915_vma_unlock(arg.vma);
1415 
1416 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
1417 		i915_vma_unpin_fence(arg.vma);
1418 	i915_vma_unpin(arg.vma);
1419 
1420 	i915_request_get(rq);
1421 	i915_request_add(rq);
1422 	if (err)
1423 		goto out_rq;
1424 
1425 	if (!wait_until_running(&h, rq)) {
1426 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1427 
1428 		pr_err("%s: Failed to start request %llx, at %x\n",
1429 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1430 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1431 
1432 		intel_gt_set_wedged(gt);
1433 		goto out_reset;
1434 	}
1435 
1436 	init_completion(&arg.completion);
1437 
1438 	tsk = kthread_run(fn, &arg, "igt/evict_vma");
1439 	if (IS_ERR(tsk)) {
1440 		err = PTR_ERR(tsk);
1441 		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1442 		tsk = NULL;
1443 		goto out_reset;
1444 	}
1445 	get_task_struct(tsk);
1446 
1447 	wait_for_completion(&arg.completion);
1448 
1449 	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1450 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1451 
1452 		pr_err("igt/evict_vma kthread did not wait\n");
1453 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1454 
1455 		intel_gt_set_wedged(gt);
1456 		goto out_reset;
1457 	}
1458 
1459 out_reset:
1460 	igt_global_reset_lock(gt);
1461 	fake_hangcheck(gt, rq->engine->mask);
1462 	igt_global_reset_unlock(gt);
1463 
1464 	if (tsk) {
1465 		struct intel_wedge_me w;
1466 
1467 		/* The reset, even indirectly, should take less than 10ms. */
1468 		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1469 			err = kthread_stop(tsk);
1470 
1471 		put_task_struct(tsk);
1472 	}
1473 
1474 out_rq:
1475 	i915_request_put(rq);
1476 out_obj:
1477 	i915_gem_object_put(obj);
1478 fini:
1479 	hang_fini(&h);
1480 	if (intel_gt_is_wedged(gt))
1481 		return -EIO;
1482 
1483 	return err;
1484 }
1485 
1486 static int igt_reset_evict_ggtt(void *arg)
1487 {
1488 	struct intel_gt *gt = arg;
1489 
1490 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1491 				     evict_vma, EXEC_OBJECT_WRITE);
1492 }
1493 
1494 static int igt_reset_evict_ppgtt(void *arg)
1495 {
1496 	struct intel_gt *gt = arg;
1497 	struct i915_ppgtt *ppgtt;
1498 	int err;
1499 
1500 	/* aliasing == global gtt locking, covered above */
1501 	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1502 		return 0;
1503 
1504 	ppgtt = i915_ppgtt_create(gt);
1505 	if (IS_ERR(ppgtt))
1506 		return PTR_ERR(ppgtt);
1507 
1508 	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1509 				    evict_vma, EXEC_OBJECT_WRITE);
1510 	i915_vm_put(&ppgtt->vm);
1511 
1512 	return err;
1513 }
1514 
1515 static int igt_reset_evict_fence(void *arg)
1516 {
1517 	struct intel_gt *gt = arg;
1518 
1519 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1520 				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1521 }
1522 
1523 static int wait_for_others(struct intel_gt *gt,
1524 			   struct intel_engine_cs *exclude)
1525 {
1526 	struct intel_engine_cs *engine;
1527 	enum intel_engine_id id;
1528 
1529 	for_each_engine(engine, gt, id) {
1530 		if (engine == exclude)
1531 			continue;
1532 
1533 		if (!wait_for_idle(engine))
1534 			return -EIO;
1535 	}
1536 
1537 	return 0;
1538 }
1539 
1540 static int igt_reset_queue(void *arg)
1541 {
1542 	struct intel_gt *gt = arg;
1543 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1544 	struct intel_engine_cs *engine;
1545 	enum intel_engine_id id;
1546 	struct hang h;
1547 	int err;
1548 
1549 	/* Check that we replay pending requests following a hang */
1550 
1551 	igt_global_reset_lock(gt);
1552 
1553 	err = hang_init(&h, gt);
1554 	if (err)
1555 		goto unlock;
1556 
1557 	for_each_engine(engine, gt, id) {
1558 		struct i915_request *prev;
1559 		IGT_TIMEOUT(end_time);
1560 		unsigned int count;
1561 
1562 		if (!intel_engine_can_store_dword(engine))
1563 			continue;
1564 
1565 		prev = hang_create_request(&h, engine);
1566 		if (IS_ERR(prev)) {
1567 			err = PTR_ERR(prev);
1568 			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
1569 			goto fini;
1570 		}
1571 
1572 		i915_request_get(prev);
1573 		i915_request_add(prev);
1574 
1575 		count = 0;
1576 		do {
1577 			struct i915_request *rq;
1578 			unsigned int reset_count;
1579 
1580 			rq = hang_create_request(&h, engine);
1581 			if (IS_ERR(rq)) {
1582 				err = PTR_ERR(rq);
1583 				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1584 				goto fini;
1585 			}
1586 
1587 			i915_request_get(rq);
1588 			i915_request_add(rq);
1589 
1590 			/*
1591 			 * XXX We don't handle resetting the kernel context
1592 			 * very well. If we trigger a device reset twice in
1593 			 * quick succession while the kernel context is
1594 			 * executing, we may end up skipping the breadcrumb.
1595 			 * This is really only a problem for the selftest as
1596 			 * normally there is a large interlude between resets
1597 			 * (hangcheck), or we focus on resetting just one
1598 			 * engine and so avoid repeatedly resetting innocents.
1599 			 */
1600 			err = wait_for_others(gt, engine);
1601 			if (err) {
1602 				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1603 				       __func__, engine->name);
1604 				i915_request_put(rq);
1605 				i915_request_put(prev);
1606 
1607 				GEM_TRACE_DUMP();
1608 				intel_gt_set_wedged(gt);
1609 				goto fini;
1610 			}
1611 
1612 			if (!wait_until_running(&h, prev)) {
1613 				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1614 
1615 				pr_err("%s(%s): Failed to start request %llx, at %x\n",
1616 				       __func__, engine->name,
1617 				       prev->fence.seqno, hws_seqno(&h, prev));
1618 				intel_engine_dump(engine, &p,
1619 						  "%s\n", engine->name);
1620 
1621 				i915_request_put(rq);
1622 				i915_request_put(prev);
1623 
1624 				intel_gt_set_wedged(gt);
1625 
1626 				err = -EIO;
1627 				goto fini;
1628 			}
1629 
1630 			reset_count = fake_hangcheck(gt, BIT(id));
1631 
1632 			if (prev->fence.error != -EIO) {
1633 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1634 				       prev->fence.error);
1635 				i915_request_put(rq);
1636 				i915_request_put(prev);
1637 				err = -EINVAL;
1638 				goto fini;
1639 			}
1640 
1641 			if (rq->fence.error) {
1642 				pr_err("Fence error status not zero [%d] after unrelated reset\n",
1643 				       rq->fence.error);
1644 				i915_request_put(rq);
1645 				i915_request_put(prev);
1646 				err = -EINVAL;
1647 				goto fini;
1648 			}
1649 
1650 			if (i915_reset_count(global) == reset_count) {
1651 				pr_err("No GPU reset recorded!\n");
1652 				i915_request_put(rq);
1653 				i915_request_put(prev);
1654 				err = -EINVAL;
1655 				goto fini;
1656 			}
1657 
1658 			i915_request_put(prev);
1659 			prev = rq;
1660 			count++;
1661 		} while (time_before(jiffies, end_time));
1662 		pr_info("%s: Completed %d queued resets\n",
1663 			engine->name, count);
1664 
1665 		*h.batch = MI_BATCH_BUFFER_END;
1666 		intel_gt_chipset_flush(engine->gt);
1667 
1668 		i915_request_put(prev);
1669 
1670 		err = igt_flush_test(gt->i915);
1671 		if (err) {
1672 			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1673 			break;
1674 		}
1675 	}
1676 
1677 fini:
1678 	hang_fini(&h);
1679 unlock:
1680 	igt_global_reset_unlock(gt);
1681 
1682 	if (intel_gt_is_wedged(gt))
1683 		return -EIO;
1684 
1685 	return err;
1686 }
1687 
1688 static int igt_handle_error(void *arg)
1689 {
1690 	struct intel_gt *gt = arg;
1691 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1692 	struct intel_engine_cs *engine = gt->engine[RCS0];
1693 	struct hang h;
1694 	struct i915_request *rq;
1695 	struct i915_gpu_coredump *error;
1696 	int err;
1697 
1698 	/* Check that we can issue a global GPU and engine reset */
1699 
1700 	if (!intel_has_reset_engine(gt))
1701 		return 0;
1702 
1703 	if (!engine || !intel_engine_can_store_dword(engine))
1704 		return 0;
1705 
1706 	err = hang_init(&h, gt);
1707 	if (err) {
1708 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1709 		return err;
1710 	}
1711 
1712 	rq = hang_create_request(&h, engine);
1713 	if (IS_ERR(rq)) {
1714 		err = PTR_ERR(rq);
1715 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1716 		goto err_fini;
1717 	}
1718 
1719 	i915_request_get(rq);
1720 	i915_request_add(rq);
1721 
1722 	if (!wait_until_running(&h, rq)) {
1723 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1724 
1725 		pr_err("%s: Failed to start request %llx, at %x\n",
1726 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1727 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1728 
1729 		intel_gt_set_wedged(gt);
1730 
1731 		err = -EIO;
1732 		goto err_request;
1733 	}
1734 
1735 	/* Temporarily disable error capture */
1736 	error = xchg(&global->first_error, (void *)-1);
1737 
1738 	intel_gt_handle_error(gt, engine->mask, 0, NULL);
1739 
1740 	xchg(&global->first_error, error);
1741 
1742 	if (rq->fence.error != -EIO) {
1743 		pr_err("Guilty request not identified!\n");
1744 		err = -EINVAL;
1745 		goto err_request;
1746 	}
1747 
1748 err_request:
1749 	i915_request_put(rq);
1750 err_fini:
1751 	hang_fini(&h);
1752 	return err;
1753 }
1754 
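/*
 * Run an engine reset from inside the atomic section provided by @p
 * (e.g. under preempt/softirq/hardirq disable), with the submission tasklet
 * suspended, to check that the reset path never sleeps in those contexts.
 */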
1755 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1756 				     const struct igt_atomic_section *p,
1757 				     const char *mode)
1758 {
1759 	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
1760 	int err;
1761 
1762 	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1763 		  engine->name, mode, p->name);
1764 
1765 	if (t->func)
1766 		tasklet_disable(t);
1767 	if (strcmp(p->name, "softirq"))
1768 		local_bh_disable();
1769 	p->critical_section_begin();
1770 
1771 	err = __intel_engine_reset_bh(engine, NULL);
1772 
1773 	p->critical_section_end();
1774 	if (strcmp(p->name, "softirq"))
1775 		local_bh_enable();
1776 	if (t->func) {
1777 		tasklet_enable(t);
1778 		tasklet_hi_schedule(t);
1779 	}
1780 
1781 	if (err)
1782 		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1783 		       engine->name, mode, p->name);
1784 
1785 	return err;
1786 }
1787 
1788 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1789 				   const struct igt_atomic_section *p)
1790 {
1791 	struct i915_request *rq;
1792 	struct hang h;
1793 	int err;
1794 
1795 	err = __igt_atomic_reset_engine(engine, p, "idle");
1796 	if (err)
1797 		return err;
1798 
1799 	err = hang_init(&h, engine->gt);
1800 	if (err) {
1801 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1802 		return err;
1803 	}
1804 
1805 	rq = hang_create_request(&h, engine);
1806 	if (IS_ERR(rq)) {
1807 		err = PTR_ERR(rq);
1808 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1809 		goto out;
1810 	}
1811 
1812 	i915_request_get(rq);
1813 	i915_request_add(rq);
1814 
1815 	if (wait_until_running(&h, rq)) {
1816 		err = __igt_atomic_reset_engine(engine, p, "active");
1817 	} else {
1818 		pr_err("%s(%s): Failed to start request %llx, at %x\n",
1819 		       __func__, engine->name,
1820 		       rq->fence.seqno, hws_seqno(&h, rq));
1821 		intel_gt_set_wedged(engine->gt);
1822 		err = -EIO;
1823 	}
1824 
1825 	if (err == 0) {
1826 		struct intel_wedge_me w;
1827 
1828 		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1829 			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1830 		if (intel_gt_is_wedged(engine->gt))
1831 			err = -EIO;
1832 	}
1833 
1834 	i915_request_put(rq);
1835 out:
1836 	hang_fini(&h);
1837 	return err;
1838 }
1839 
1840 static int igt_reset_engines_atomic(void *arg)
1841 {
1842 	struct intel_gt *gt = arg;
1843 	const typeof(*igt_atomic_phases) *p;
1844 	int err = 0;
1845 
1846 	/* Check that engine resets are usable from atomic context */
1847 
1848 	if (!intel_has_reset_engine(gt))
1849 		return 0;
1850 
1851 	if (intel_uc_uses_guc_submission(&gt->uc))
1852 		return 0;
1853 
1854 	igt_global_reset_lock(gt);
1855 
1856 	/* Flush any requests before we get started and check basics */
1857 	if (!igt_force_reset(gt))
1858 		goto unlock;
1859 
1860 	for (p = igt_atomic_phases; p->name; p++) {
1861 		struct intel_engine_cs *engine;
1862 		enum intel_engine_id id;
1863 
1864 		for_each_engine(engine, gt, id) {
1865 			err = igt_atomic_reset_engine(engine, p);
1866 			if (err)
1867 				goto out;
1868 		}
1869 	}
1870 
1871 out:
1872 	/* As we poke around the guts, do a full reset before continuing. */
1873 	igt_force_reset(gt);
1874 unlock:
1875 	igt_global_reset_unlock(gt);
1876 
1877 	return err;
1878 }
1879 
1880 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
1881 {
1882 	static const struct i915_subtest tests[] = {
1883 		SUBTEST(igt_hang_sanitycheck),
1884 		SUBTEST(igt_reset_nop),
1885 		SUBTEST(igt_reset_nop_engine),
1886 		SUBTEST(igt_reset_idle_engine),
1887 		SUBTEST(igt_reset_active_engine),
1888 		SUBTEST(igt_reset_fail_engine),
1889 		SUBTEST(igt_reset_engines),
1890 		SUBTEST(igt_reset_engines_atomic),
1891 		SUBTEST(igt_reset_queue),
1892 		SUBTEST(igt_reset_wait),
1893 		SUBTEST(igt_reset_evict_ggtt),
1894 		SUBTEST(igt_reset_evict_ppgtt),
1895 		SUBTEST(igt_reset_evict_fence),
1896 		SUBTEST(igt_handle_error),
1897 	};
1898 	struct intel_gt *gt = &i915->gt;
1899 	intel_wakeref_t wakeref;
1900 	int err;
1901 
1902 	if (!intel_has_gpu_reset(gt))
1903 		return 0;
1904 
1905 	if (intel_gt_is_wedged(gt))
1906 		return -EIO; /* we're long past hope of a successful reset */
1907 
1908 	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
1909 
1910 	err = intel_gt_live_subtests(tests, gt);
1911 
1912 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
1913 
1914 	return err;
1915 }
1916