1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2016 Intel Corporation
4  */
5 
6 #include <linux/kthread.h>
7 
8 #include "gem/i915_gem_context.h"
9 #include "gem/i915_gem_internal.h"
10 
11 #include "i915_gem_evict.h"
12 #include "intel_gt.h"
13 #include "intel_engine_heartbeat.h"
14 #include "intel_engine_pm.h"
15 #include "selftest_engine_heartbeat.h"
16 
17 #include "i915_selftest.h"
18 #include "selftests/i915_random.h"
19 #include "selftests/igt_flush_test.h"
20 #include "selftests/igt_reset.h"
21 #include "selftests/igt_atomic.h"
22 #include "selftests/igt_spinner.h"
23 #include "selftests/intel_scheduler_helpers.h"
24 
25 #include "selftests/mock_drm.h"
26 
27 #include "gem/selftests/mock_context.h"
28 #include "gem/selftests/igt_gem_utils.h"
29 
30 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
31 
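/*
 * State shared by the hang selftests: @hws is a page into which each hanging
 * batch writes its seqno (one dword slot per fence context), @obj backs the
 * current spinning batch, and @ctx is a kernel context (never bannable) used
 * to submit the deliberately hanging requests.
 */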
32 struct hang {
33 	struct intel_gt *gt;
34 	struct drm_i915_gem_object *hws;
35 	struct drm_i915_gem_object *obj;
36 	struct i915_gem_context *ctx;
37 	u32 *seqno;
38 	u32 *batch;
39 };
40 
41 static int hang_init(struct hang *h, struct intel_gt *gt)
42 {
43 	void *vaddr;
44 	int err;
45 
46 	memset(h, 0, sizeof(*h));
47 	h->gt = gt;
48 
49 	h->ctx = kernel_context(gt->i915, NULL);
50 	if (IS_ERR(h->ctx))
51 		return PTR_ERR(h->ctx);
52 
53 	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
54 
55 	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
56 	if (IS_ERR(h->hws)) {
57 		err = PTR_ERR(h->hws);
58 		goto err_ctx;
59 	}
60 
61 	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
62 	if (IS_ERR(h->obj)) {
63 		err = PTR_ERR(h->obj);
64 		goto err_hws;
65 	}
66 
67 	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
68 	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
69 	if (IS_ERR(vaddr)) {
70 		err = PTR_ERR(vaddr);
71 		goto err_obj;
72 	}
73 	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
74 
75 	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
76 						 i915_coherent_map_type(gt->i915, h->obj, false));
77 	if (IS_ERR(vaddr)) {
78 		err = PTR_ERR(vaddr);
79 		goto err_unpin_hws;
80 	}
81 	h->batch = vaddr;
82 
83 	return 0;
84 
85 err_unpin_hws:
86 	i915_gem_object_unpin_map(h->hws);
87 err_obj:
88 	i915_gem_object_put(h->obj);
89 err_hws:
90 	i915_gem_object_put(h->hws);
91 err_ctx:
92 	kernel_context_close(h->ctx);
93 	return err;
94 }
95 
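/*
 * Each fence context owns a dword slot in the hws page; the hanging batch
 * writes rq->fence.seqno there so wait_until_running() can poll for it.
 */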
96 static u64 hws_address(const struct i915_vma *hws,
97 		       const struct i915_request *rq)
98 {
99 	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
100 }
101 
102 static int move_to_active(struct i915_vma *vma,
103 			  struct i915_request *rq,
104 			  unsigned int flags)
105 {
106 	int err;
107 
108 	i915_vma_lock(vma);
109 	err = i915_vma_move_to_active(vma, rq, flags);
110 	i915_vma_unlock(vma);
111 
112 	return err;
113 }
114 
115 static struct i915_request *
116 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
117 {
118 	struct intel_gt *gt = h->gt;
119 	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
120 	struct drm_i915_gem_object *obj;
121 	struct i915_request *rq = NULL;
122 	struct i915_vma *hws, *vma;
123 	unsigned int flags;
124 	void *vaddr;
125 	u32 *batch;
126 	int err;
127 
128 	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
129 	if (IS_ERR(obj)) {
130 		i915_vm_put(vm);
131 		return ERR_CAST(obj);
132 	}
133 
134 	vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
135 	if (IS_ERR(vaddr)) {
136 		i915_gem_object_put(obj);
137 		i915_vm_put(vm);
138 		return ERR_CAST(vaddr);
139 	}
140 
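	/*
	 * Each request gets a fresh batch object; drop our reference to the
	 * old one (any earlier request still spinning on it keeps it alive
	 * via its move_to_active() reference).
	 */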
141 	i915_gem_object_unpin_map(h->obj);
142 	i915_gem_object_put(h->obj);
143 
144 	h->obj = obj;
145 	h->batch = vaddr;
146 
147 	vma = i915_vma_instance(h->obj, vm, NULL);
148 	if (IS_ERR(vma)) {
149 		i915_vm_put(vm);
150 		return ERR_CAST(vma);
151 	}
152 
153 	hws = i915_vma_instance(h->hws, vm, NULL);
154 	if (IS_ERR(hws)) {
155 		i915_vm_put(vm);
156 		return ERR_CAST(hws);
157 	}
158 
159 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
160 	if (err) {
161 		i915_vm_put(vm);
162 		return ERR_PTR(err);
163 	}
164 
165 	err = i915_vma_pin(hws, 0, 0, PIN_USER);
166 	if (err)
167 		goto unpin_vma;
168 
169 	rq = igt_request_alloc(h->ctx, engine);
170 	if (IS_ERR(rq)) {
171 		err = PTR_ERR(rq);
172 		goto unpin_hws;
173 	}
174 
175 	err = move_to_active(vma, rq, 0);
176 	if (err)
177 		goto cancel_rq;
178 
179 	err = move_to_active(hws, rq, 0);
180 	if (err)
181 		goto cancel_rq;
182 
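	/*
	 * Build the hanging batch: store the request's seqno into its hws
	 * slot, pad, then spin forever by jumping back to the start of the
	 * batch with MI_BATCH_BUFFER_START. The spin is broken by rewriting
	 * the first dword to MI_BATCH_BUFFER_END (see hang_fini()).
	 */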
183 	batch = h->batch;
184 	if (GRAPHICS_VER(gt->i915) >= 8) {
185 		*batch++ = MI_STORE_DWORD_IMM_GEN4;
186 		*batch++ = lower_32_bits(hws_address(hws, rq));
187 		*batch++ = upper_32_bits(hws_address(hws, rq));
188 		*batch++ = rq->fence.seqno;
189 		*batch++ = MI_NOOP;
190 
191 		memset(batch, 0, 1024);
192 		batch += 1024 / sizeof(*batch);
193 
194 		*batch++ = MI_NOOP;
195 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
196 		*batch++ = lower_32_bits(vma->node.start);
197 		*batch++ = upper_32_bits(vma->node.start);
198 	} else if (GRAPHICS_VER(gt->i915) >= 6) {
199 		*batch++ = MI_STORE_DWORD_IMM_GEN4;
200 		*batch++ = 0;
201 		*batch++ = lower_32_bits(hws_address(hws, rq));
202 		*batch++ = rq->fence.seqno;
203 		*batch++ = MI_NOOP;
204 
205 		memset(batch, 0, 1024);
206 		batch += 1024 / sizeof(*batch);
207 
208 		*batch++ = MI_NOOP;
209 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
210 		*batch++ = lower_32_bits(vma->node.start);
211 	} else if (GRAPHICS_VER(gt->i915) >= 4) {
212 		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
213 		*batch++ = 0;
214 		*batch++ = lower_32_bits(hws_address(hws, rq));
215 		*batch++ = rq->fence.seqno;
216 		*batch++ = MI_NOOP;
217 
218 		memset(batch, 0, 1024);
219 		batch += 1024 / sizeof(*batch);
220 
221 		*batch++ = MI_NOOP;
222 		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
223 		*batch++ = lower_32_bits(vma->node.start);
224 	} else {
225 		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
226 		*batch++ = lower_32_bits(hws_address(hws, rq));
227 		*batch++ = rq->fence.seqno;
228 		*batch++ = MI_NOOP;
229 
230 		memset(batch, 0, 1024);
231 		batch += 1024 / sizeof(*batch);
232 
233 		*batch++ = MI_NOOP;
234 		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
235 		*batch++ = lower_32_bits(vma->node.start);
236 	}
237 	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
238 	intel_gt_chipset_flush(engine->gt);
239 
240 	if (rq->engine->emit_init_breadcrumb) {
241 		err = rq->engine->emit_init_breadcrumb(rq);
242 		if (err)
243 			goto cancel_rq;
244 	}
245 
246 	flags = 0;
247 	if (GRAPHICS_VER(gt->i915) <= 5)
248 		flags |= I915_DISPATCH_SECURE;
249 
250 	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
251 
252 cancel_rq:
253 	if (err) {
254 		i915_request_set_error_once(rq, err);
255 		i915_request_add(rq);
256 	}
257 unpin_hws:
258 	i915_vma_unpin(hws);
259 unpin_vma:
260 	i915_vma_unpin(vma);
261 	i915_vm_put(vm);
262 	return err ? ERR_PTR(err) : rq;
263 }
264 
265 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
266 {
267 	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
268 }
269 
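/*
 * Terminate any still-spinning batch by overwriting its first instruction
 * with MI_BATCH_BUFFER_END before releasing the objects and context.
 */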
270 static void hang_fini(struct hang *h)
271 {
272 	*h->batch = MI_BATCH_BUFFER_END;
273 	intel_gt_chipset_flush(h->gt);
274 
275 	i915_gem_object_unpin_map(h->obj);
276 	i915_gem_object_put(h->obj);
277 
278 	i915_gem_object_unpin_map(h->hws);
279 	i915_gem_object_put(h->hws);
280 
281 	kernel_context_close(h->ctx);
282 
283 	igt_flush_test(h->gt->i915);
284 }
285 
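/*
 * A request counts as running once its batch has written the seqno to the
 * hws page: busy-wait briefly (10us), then fall back to a sleeping wait of
 * up to a second.
 */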
286 static bool wait_until_running(struct hang *h, struct i915_request *rq)
287 {
288 	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
289 					       rq->fence.seqno),
290 			     10) &&
291 		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
292 					    rq->fence.seqno),
293 			  1000));
294 }
295 
296 static int igt_hang_sanitycheck(void *arg)
297 {
298 	struct intel_gt *gt = arg;
299 	struct i915_request *rq;
300 	struct intel_engine_cs *engine;
301 	enum intel_engine_id id;
302 	struct hang h;
303 	int err;
304 
305 	/* Basic check that we can execute our hanging batch */
306 
307 	err = hang_init(&h, gt);
308 	if (err)
309 		return err;
310 
311 	for_each_engine(engine, gt, id) {
312 		struct intel_wedge_me w;
313 		long timeout;
314 
315 		if (!intel_engine_can_store_dword(engine))
316 			continue;
317 
318 		rq = hang_create_request(&h, engine);
319 		if (IS_ERR(rq)) {
320 			err = PTR_ERR(rq);
321 			pr_err("Failed to create request for %s, err=%d\n",
322 			       engine->name, err);
323 			goto fini;
324 		}
325 
326 		i915_request_get(rq);
327 
328 		*h.batch = MI_BATCH_BUFFER_END;
329 		intel_gt_chipset_flush(engine->gt);
330 
331 		i915_request_add(rq);
332 
333 		timeout = 0;
334 		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
335 			timeout = i915_request_wait(rq, 0,
336 						    MAX_SCHEDULE_TIMEOUT);
337 		if (intel_gt_is_wedged(gt))
338 			timeout = -EIO;
339 
340 		i915_request_put(rq);
341 
342 		if (timeout < 0) {
343 			err = timeout;
344 			pr_err("Wait for request failed on %s, err=%d\n",
345 			       engine->name, err);
346 			goto fini;
347 		}
348 	}
349 
350 fini:
351 	hang_fini(&h);
352 	return err;
353 }
354 
355 static bool wait_for_idle(struct intel_engine_cs *engine)
356 {
357 	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
358 }
359 
360 static int igt_reset_nop(void *arg)
361 {
362 	struct intel_gt *gt = arg;
363 	struct i915_gpu_error *global = &gt->i915->gpu_error;
364 	struct intel_engine_cs *engine;
365 	unsigned int reset_count, count;
366 	enum intel_engine_id id;
367 	IGT_TIMEOUT(end_time);
368 	int err = 0;
369 
370 	/* Check that we can reset during non-user portions of requests */
371 
372 	reset_count = i915_reset_count(global);
373 	count = 0;
374 	do {
375 		for_each_engine(engine, gt, id) {
376 			struct intel_context *ce;
377 			int i;
378 
379 			ce = intel_context_create(engine);
380 			if (IS_ERR(ce)) {
381 				err = PTR_ERR(ce);
382 				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
383 				break;
384 			}
385 
386 			for (i = 0; i < 16; i++) {
387 				struct i915_request *rq;
388 
389 				rq = intel_context_create_request(ce);
390 				if (IS_ERR(rq)) {
391 					err = PTR_ERR(rq);
392 					pr_err("[%s] Create request failed: %d!\n",
393 					       engine->name, err);
394 					break;
395 				}
396 
397 				i915_request_add(rq);
398 			}
399 
400 			intel_context_put(ce);
401 		}
402 
403 		igt_global_reset_lock(gt);
404 		intel_gt_reset(gt, ALL_ENGINES, NULL);
405 		igt_global_reset_unlock(gt);
406 
407 		if (intel_gt_is_wedged(gt)) {
408 			pr_err("[%s] GT is wedged!\n", engine->name);
409 			err = -EIO;
410 			break;
411 		}
412 
413 		if (i915_reset_count(global) != reset_count + ++count) {
414 			pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
415 			       engine->name, i915_reset_count(global), reset_count, count);
416 			err = -EINVAL;
417 			break;
418 		}
419 
420 		err = igt_flush_test(gt->i915);
421 		if (err) {
422 			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
423 			break;
424 		}
425 	} while (time_before(jiffies, end_time));
426 	pr_info("%s: %d resets\n", __func__, count);
427 
428 	if (igt_flush_test(gt->i915)) {
429 		pr_err("Post flush failed: %d!\n", err);
430 		err = -EIO;
431 	}
432 
433 	return err;
434 }
435 
436 static int igt_reset_nop_engine(void *arg)
437 {
438 	struct intel_gt *gt = arg;
439 	struct i915_gpu_error *global = &gt->i915->gpu_error;
440 	struct intel_engine_cs *engine;
441 	enum intel_engine_id id;
442 
443 	/* Check that we can engine-reset during non-user portions */
444 
445 	if (!intel_has_reset_engine(gt))
446 		return 0;
447 
448 	for_each_engine(engine, gt, id) {
449 		unsigned int reset_count, reset_engine_count, count;
450 		struct intel_context *ce;
451 		IGT_TIMEOUT(end_time);
452 		int err;
453 
454 		if (intel_engine_uses_guc(engine)) {
			/*
			 * Engine level resets are triggered by GuC when a hang
			 * is detected. They can't be triggered by the KMD any
			 * more. Thus a nop batch cannot be used as a reset test.
			 */
459 			continue;
460 		}
461 
462 		ce = intel_context_create(engine);
463 		if (IS_ERR(ce)) {
464 			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
465 			return PTR_ERR(ce);
466 		}
467 
468 		reset_count = i915_reset_count(global);
469 		reset_engine_count = i915_reset_engine_count(global, engine);
470 		count = 0;
471 
472 		st_engine_heartbeat_disable(engine);
473 		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
474 					    &gt->reset.flags));
475 		do {
476 			int i;
477 
478 			if (!wait_for_idle(engine)) {
479 				pr_err("%s failed to idle before reset\n",
480 				       engine->name);
481 				err = -EIO;
482 				break;
483 			}
484 
485 			for (i = 0; i < 16; i++) {
486 				struct i915_request *rq;
487 
488 				rq = intel_context_create_request(ce);
489 				if (IS_ERR(rq)) {
490 					struct drm_printer p =
491 						drm_info_printer(gt->i915->drm.dev);
492 					intel_engine_dump(engine, &p,
493 							  "%s(%s): failed to submit request\n",
494 							  __func__,
495 							  engine->name);
496 
497 					GEM_TRACE("%s(%s): failed to submit request\n",
498 						  __func__,
499 						  engine->name);
500 					GEM_TRACE_DUMP();
501 
502 					intel_gt_set_wedged(gt);
503 
504 					err = PTR_ERR(rq);
505 					break;
506 				}
507 
508 				i915_request_add(rq);
509 			}
510 			err = intel_engine_reset(engine, NULL);
511 			if (err) {
512 				pr_err("intel_engine_reset(%s) failed, err:%d\n",
513 				       engine->name, err);
514 				break;
515 			}
516 
517 			if (i915_reset_count(global) != reset_count) {
518 				pr_err("Full GPU reset recorded! (engine reset expected)\n");
519 				err = -EINVAL;
520 				break;
521 			}
522 
523 			if (i915_reset_engine_count(global, engine) !=
524 			    reset_engine_count + ++count) {
525 				pr_err("%s engine reset not recorded!\n",
526 				       engine->name);
527 				err = -EINVAL;
528 				break;
529 			}
530 		} while (time_before(jiffies, end_time));
531 		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
532 		st_engine_heartbeat_enable(engine);
533 
534 		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
535 
536 		intel_context_put(ce);
537 		if (igt_flush_test(gt->i915))
538 			err = -EIO;
539 		if (err)
540 			return err;
541 	}
542 
543 	return 0;
544 }
545 
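/*
 * Error injection for engine resets: force_reset_timeout() tweaks the
 * engine's reset_timeout knobs so subsequent resets are made to fail with
 * -ETIMEDOUT (only generated on gen8+); cancel_reset_timeout() clears the
 * injection again.
 */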
546 static void force_reset_timeout(struct intel_engine_cs *engine)
547 {
548 	engine->reset_timeout.probability = 999;
549 	atomic_set(&engine->reset_timeout.times, -1);
550 }
551 
552 static void cancel_reset_timeout(struct intel_engine_cs *engine)
553 {
554 	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
555 }
556 
557 static int igt_reset_fail_engine(void *arg)
558 {
559 	struct intel_gt *gt = arg;
560 	struct intel_engine_cs *engine;
561 	enum intel_engine_id id;
562 
	/* Check that we can recover from engine-reset failures */
564 
565 	if (!intel_has_reset_engine(gt))
566 		return 0;
567 
568 	for_each_engine(engine, gt, id) {
569 		unsigned int count;
570 		struct intel_context *ce;
571 		IGT_TIMEOUT(end_time);
572 		int err;
573 
574 		/* Can't manually break the reset if i915 doesn't perform it */
575 		if (intel_engine_uses_guc(engine))
576 			continue;
577 
578 		ce = intel_context_create(engine);
579 		if (IS_ERR(ce)) {
580 			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
581 			return PTR_ERR(ce);
582 		}
583 
584 		st_engine_heartbeat_disable(engine);
585 		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
586 					    &gt->reset.flags));
587 
588 		force_reset_timeout(engine);
589 		err = intel_engine_reset(engine, NULL);
590 		cancel_reset_timeout(engine);
591 		if (err == 0) /* timeouts only generated on gen8+ */
592 			goto skip;
593 
594 		count = 0;
595 		do {
596 			struct i915_request *last = NULL;
597 			int i;
598 
599 			if (!wait_for_idle(engine)) {
600 				pr_err("%s failed to idle before reset\n",
601 				       engine->name);
602 				err = -EIO;
603 				break;
604 			}
605 
606 			for (i = 0; i < count % 15; i++) {
607 				struct i915_request *rq;
608 
609 				rq = intel_context_create_request(ce);
610 				if (IS_ERR(rq)) {
611 					struct drm_printer p =
612 						drm_info_printer(gt->i915->drm.dev);
613 					intel_engine_dump(engine, &p,
614 							  "%s(%s): failed to submit request\n",
615 							  __func__,
616 							  engine->name);
617 
618 					GEM_TRACE("%s(%s): failed to submit request\n",
619 						  __func__,
620 						  engine->name);
621 					GEM_TRACE_DUMP();
622 
623 					intel_gt_set_wedged(gt);
624 					if (last)
625 						i915_request_put(last);
626 
627 					err = PTR_ERR(rq);
628 					goto out;
629 				}
630 
631 				if (last)
632 					i915_request_put(last);
633 				last = i915_request_get(rq);
634 				i915_request_add(rq);
635 			}
636 
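			/*
			 * Alternate between a normal engine reset and one with
			 * the timeout injected, checking the engine recovers
			 * and completes requests either way.
			 */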
637 			if (count & 1) {
638 				err = intel_engine_reset(engine, NULL);
639 				if (err) {
640 					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
641 						      engine->name, err);
642 					GEM_TRACE_DUMP();
643 					i915_request_put(last);
644 					break;
645 				}
646 			} else {
647 				force_reset_timeout(engine);
648 				err = intel_engine_reset(engine, NULL);
649 				cancel_reset_timeout(engine);
650 				if (err != -ETIMEDOUT) {
651 					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
652 					       engine->name, err);
653 					i915_request_put(last);
654 					break;
655 				}
656 			}
657 
658 			err = 0;
659 			if (last) {
660 				if (i915_request_wait(last, 0, HZ / 2) < 0) {
661 					struct drm_printer p =
662 						drm_info_printer(gt->i915->drm.dev);
663 
664 					intel_engine_dump(engine, &p,
665 							  "%s(%s): failed to complete request\n",
666 							  __func__,
667 							  engine->name);
668 
669 					GEM_TRACE("%s(%s): failed to complete request\n",
670 						  __func__,
671 						  engine->name);
672 					GEM_TRACE_DUMP();
673 
674 					err = -EIO;
675 				}
676 				i915_request_put(last);
677 			}
678 			count++;
679 		} while (err == 0 && time_before(jiffies, end_time));
680 out:
681 		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
682 skip:
683 		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
684 		st_engine_heartbeat_enable(engine);
685 		intel_context_put(ce);
686 
687 		if (igt_flush_test(gt->i915))
688 			err = -EIO;
689 		if (err)
690 			return err;
691 	}
692 
693 	return 0;
694 }
695 
696 static int __igt_reset_engine(struct intel_gt *gt, bool active)
697 {
698 	struct i915_gpu_error *global = &gt->i915->gpu_error;
699 	struct intel_engine_cs *engine;
700 	enum intel_engine_id id;
701 	struct hang h;
702 	int err = 0;
703 
704 	/* Check that we can issue an engine reset on an idle engine (no-op) */
705 
706 	if (!intel_has_reset_engine(gt))
707 		return 0;
708 
709 	if (active) {
710 		err = hang_init(&h, gt);
711 		if (err)
712 			return err;
713 	}
714 
715 	for_each_engine(engine, gt, id) {
716 		unsigned int reset_count, reset_engine_count;
717 		unsigned long count;
718 		bool using_guc = intel_engine_uses_guc(engine);
719 		IGT_TIMEOUT(end_time);
720 
721 		if (using_guc && !active)
722 			continue;
723 
724 		if (active && !intel_engine_can_store_dword(engine))
725 			continue;
726 
727 		if (!wait_for_idle(engine)) {
728 			pr_err("%s failed to idle before reset\n",
729 			       engine->name);
730 			err = -EIO;
731 			break;
732 		}
733 
734 		reset_count = i915_reset_count(global);
735 		reset_engine_count = i915_reset_engine_count(global, engine);
736 
737 		st_engine_heartbeat_disable(engine);
738 		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
739 					    &gt->reset.flags));
740 		count = 0;
741 		do {
742 			struct i915_request *rq = NULL;
743 			struct intel_selftest_saved_policy saved;
744 			int err2;
745 
746 			err = intel_selftest_modify_policy(engine, &saved,
747 							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
748 			if (err) {
749 				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
750 				break;
751 			}
752 
753 			if (active) {
754 				rq = hang_create_request(&h, engine);
755 				if (IS_ERR(rq)) {
756 					err = PTR_ERR(rq);
757 					pr_err("[%s] Create hang request failed: %d!\n",
758 					       engine->name, err);
759 					goto restore;
760 				}
761 
762 				i915_request_get(rq);
763 				i915_request_add(rq);
764 
765 				if (!wait_until_running(&h, rq)) {
766 					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
767 
768 					pr_err("%s: Failed to start request %llx, at %x\n",
769 					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
770 					intel_engine_dump(engine, &p,
771 							  "%s\n", engine->name);
772 
773 					i915_request_put(rq);
774 					err = -EIO;
775 					goto restore;
776 				}
777 			}
778 
779 			if (!using_guc) {
780 				err = intel_engine_reset(engine, NULL);
781 				if (err) {
782 					pr_err("intel_engine_reset(%s) failed, err:%d\n",
783 					       engine->name, err);
784 					goto skip;
785 				}
786 			}
787 
788 			if (rq) {
789 				/* Ensure the reset happens and kills the engine */
790 				err = intel_selftest_wait_for_rq(rq);
791 				if (err)
792 					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
793 					       engine->name, rq->fence.context,
794 					       rq->fence.seqno, rq->context->guc_id.id, err);
795 			}
796 
797 skip:
798 			if (rq)
799 				i915_request_put(rq);
800 
801 			if (i915_reset_count(global) != reset_count) {
802 				pr_err("Full GPU reset recorded! (engine reset expected)\n");
803 				err = -EINVAL;
804 				goto restore;
805 			}
806 
807 			/* GuC based resets are not logged per engine */
808 			if (!using_guc) {
809 				if (i915_reset_engine_count(global, engine) !=
810 				    ++reset_engine_count) {
811 					pr_err("%s engine reset not recorded!\n",
812 					       engine->name);
813 					err = -EINVAL;
814 					goto restore;
815 				}
816 			}
817 
818 			count++;
819 
820 restore:
821 			err2 = intel_selftest_restore_policy(engine, &saved);
822 			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
824 			if (err == 0)
825 				err = err2;
826 			if (err)
827 				break;
828 		} while (time_before(jiffies, end_time));
829 		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
830 		st_engine_heartbeat_enable(engine);
831 		pr_info("%s: Completed %lu %s resets\n",
832 			engine->name, count, active ? "active" : "idle");
833 
834 		if (err)
835 			break;
836 
837 		err = igt_flush_test(gt->i915);
838 		if (err) {
839 			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
840 			break;
841 		}
842 	}
843 
844 	if (intel_gt_is_wedged(gt)) {
845 		pr_err("GT is wedged!\n");
846 		err = -EIO;
847 	}
848 
849 	if (active)
850 		hang_fini(&h);
851 
852 	return err;
853 }
854 
855 static int igt_reset_idle_engine(void *arg)
856 {
857 	return __igt_reset_engine(arg, false);
858 }
859 
860 static int igt_reset_active_engine(void *arg)
861 {
862 	return __igt_reset_engine(arg, true);
863 }
864 
865 struct active_engine {
866 	struct kthread_worker *worker;
867 	struct kthread_work work;
868 	struct intel_engine_cs *engine;
869 	unsigned long resets;
870 	unsigned int flags;
871 	bool stop;
872 	int result;
873 };
874 
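/*
 * Flags for __igt_reset_engines(): TEST_ACTIVE hangs the engine being reset,
 * TEST_OTHERS/TEST_SELF run background requests on the other engines / also
 * on the engine under reset, and TEST_PRIORITY applies random priorities to
 * that background work.
 */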
875 #define TEST_ACTIVE	BIT(0)
876 #define TEST_OTHERS	BIT(1)
877 #define TEST_SELF	BIT(2)
878 #define TEST_PRIORITY	BIT(3)
879 
880 static int active_request_put(struct i915_request *rq)
881 {
882 	int err = 0;
883 
884 	if (!rq)
885 		return 0;
886 
887 	if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
888 		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
889 			  rq->engine->name,
890 			  rq->fence.context,
891 			  rq->fence.seqno);
892 		GEM_TRACE_DUMP();
893 
894 		intel_gt_set_wedged(rq->engine->gt);
895 		err = -EIO;
896 	}
897 
898 	i915_request_put(rq);
899 
900 	return err;
901 }
902 
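/*
 * Background worker: keep a rolling window of requests in flight on an
 * engine, cycling through a handful of contexts, so the reset tests can
 * check that concurrent submission survives the resets.
 */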
903 static void active_engine(struct kthread_work *work)
904 {
905 	I915_RND_STATE(prng);
906 	struct active_engine *arg = container_of(work, typeof(*arg), work);
907 	struct intel_engine_cs *engine = arg->engine;
908 	struct i915_request *rq[8] = {};
909 	struct intel_context *ce[ARRAY_SIZE(rq)];
910 	unsigned long count;
911 	int err = 0;
912 
913 	for (count = 0; count < ARRAY_SIZE(ce); count++) {
914 		ce[count] = intel_context_create(engine);
915 		if (IS_ERR(ce[count])) {
916 			arg->result = PTR_ERR(ce[count]);
917 			pr_err("[%s] Create context #%ld failed: %d!\n",
918 			       engine->name, count, arg->result);
			while (count--)
				intel_context_put(ce[count]);
921 			return;
922 		}
923 	}
924 
925 	count = 0;
926 	while (!READ_ONCE(arg->stop)) {
927 		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
928 		struct i915_request *old = rq[idx];
929 		struct i915_request *new;
930 
931 		new = intel_context_create_request(ce[idx]);
932 		if (IS_ERR(new)) {
933 			err = PTR_ERR(new);
934 			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
935 			break;
936 		}
937 
938 		rq[idx] = i915_request_get(new);
939 		i915_request_add(new);
940 
941 		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
942 			struct i915_sched_attr attr = {
943 				.priority =
944 					i915_prandom_u32_max_state(512, &prng),
945 			};
946 			engine->sched_engine->schedule(rq[idx], &attr);
947 		}
948 
949 		err = active_request_put(old);
950 		if (err) {
951 			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
952 			break;
953 		}
954 
955 		cond_resched();
956 	}
957 
958 	for (count = 0; count < ARRAY_SIZE(rq); count++) {
959 		int err__ = active_request_put(rq[count]);
960 
		if (err__)
			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);
963 
964 		/* Keep the first error */
965 		if (!err)
966 			err = err__;
967 
968 		intel_context_put(ce[count]);
969 	}
970 
971 	arg->result = err;
972 }
973 
974 static int __igt_reset_engines(struct intel_gt *gt,
975 			       const char *test_name,
976 			       unsigned int flags)
977 {
978 	struct i915_gpu_error *global = &gt->i915->gpu_error;
979 	struct intel_engine_cs *engine, *other;
980 	struct active_engine *threads;
981 	enum intel_engine_id id, tmp;
982 	struct hang h;
983 	int err = 0;
984 
985 	/* Check that issuing a reset on one engine does not interfere
986 	 * with any other engine.
987 	 */
988 
989 	if (!intel_has_reset_engine(gt))
990 		return 0;
991 
992 	if (flags & TEST_ACTIVE) {
993 		err = hang_init(&h, gt);
994 		if (err)
995 			return err;
996 
997 		if (flags & TEST_PRIORITY)
998 			h.ctx->sched.priority = 1024;
999 	}
1000 
1001 	threads = kmalloc_array(I915_NUM_ENGINES, sizeof(*threads), GFP_KERNEL);
1002 	if (!threads)
1003 		return -ENOMEM;
1004 
1005 	for_each_engine(engine, gt, id) {
1006 		unsigned long device = i915_reset_count(global);
1007 		unsigned long count = 0, reported;
1008 		bool using_guc = intel_engine_uses_guc(engine);
1009 		IGT_TIMEOUT(end_time);
1010 
1011 		if (flags & TEST_ACTIVE) {
1012 			if (!intel_engine_can_store_dword(engine))
1013 				continue;
1014 		} else if (using_guc)
1015 			continue;
1016 
1017 		if (!wait_for_idle(engine)) {
1018 			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
1019 			       engine->name, test_name);
1020 			err = -EIO;
1021 			break;
1022 		}
1023 
1024 		memset(threads, 0, sizeof(*threads) * I915_NUM_ENGINES);
1025 		for_each_engine(other, gt, tmp) {
1026 			struct kthread_worker *worker;
1027 
1028 			threads[tmp].resets =
1029 				i915_reset_engine_count(global, other);
1030 
1031 			if (other == engine && !(flags & TEST_SELF))
1032 				continue;
1033 
1034 			if (other != engine && !(flags & TEST_OTHERS))
1035 				continue;
1036 
1037 			threads[tmp].engine = other;
1038 			threads[tmp].flags = flags;
1039 
1040 			worker = kthread_create_worker(0, "igt/%s",
1041 						       other->name);
1042 			if (IS_ERR(worker)) {
1043 				err = PTR_ERR(worker);
1044 				pr_err("[%s] Worker create failed: %d!\n",
1045 				       engine->name, err);
1046 				goto unwind;
1047 			}
1048 
1049 			threads[tmp].worker = worker;
1050 
1051 			kthread_init_work(&threads[tmp].work, active_engine);
1052 			kthread_queue_work(threads[tmp].worker,
1053 					   &threads[tmp].work);
1054 		}
1055 
1056 		st_engine_heartbeat_disable_no_pm(engine);
1057 		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
1058 					    &gt->reset.flags));
1059 		do {
1060 			struct i915_request *rq = NULL;
1061 			struct intel_selftest_saved_policy saved;
1062 			int err2;
1063 
1064 			err = intel_selftest_modify_policy(engine, &saved,
1065 							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
1066 			if (err) {
1067 				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1068 				break;
1069 			}
1070 
1071 			if (flags & TEST_ACTIVE) {
1072 				rq = hang_create_request(&h, engine);
1073 				if (IS_ERR(rq)) {
1074 					err = PTR_ERR(rq);
1075 					pr_err("[%s] Create hang request failed: %d!\n",
1076 					       engine->name, err);
1077 					goto restore;
1078 				}
1079 
1080 				i915_request_get(rq);
1081 				i915_request_add(rq);
1082 
1083 				if (!wait_until_running(&h, rq)) {
1084 					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1085 
1086 					pr_err("%s: Failed to start request %llx, at %x\n",
1087 					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1088 					intel_engine_dump(engine, &p,
1089 							  "%s\n", engine->name);
1090 
1091 					i915_request_put(rq);
1092 					err = -EIO;
1093 					goto restore;
1094 				}
1095 			} else {
1096 				intel_engine_pm_get(engine);
1097 			}
1098 
1099 			if (!using_guc) {
1100 				err = intel_engine_reset(engine, NULL);
1101 				if (err) {
1102 					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
1103 					       engine->name, test_name, err);
1104 					goto restore;
1105 				}
1106 			}
1107 
1108 			if (rq) {
1109 				/* Ensure the reset happens and kills the engine */
1110 				err = intel_selftest_wait_for_rq(rq);
1111 				if (err)
1112 					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
1113 					       engine->name, rq->fence.context,
1114 					       rq->fence.seqno, rq->context->guc_id.id, err);
1115 			}
1116 
1117 			count++;
1118 
1119 			if (rq) {
1120 				if (rq->fence.error != -EIO) {
1121 					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
1122 					       engine->name, test_name,
1123 					       rq->fence.context,
1124 					       rq->fence.seqno, rq->context->guc_id.id);
1125 					i915_request_put(rq);
1126 
1127 					GEM_TRACE_DUMP();
1128 					intel_gt_set_wedged(gt);
1129 					err = -EIO;
1130 					goto restore;
1131 				}
1132 
1133 				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
1134 					struct drm_printer p =
1135 						drm_info_printer(gt->i915->drm.dev);
1136 
1137 					pr_err("i915_reset_engine(%s:%s):"
1138 					       " failed to complete request %llx:%lld after reset\n",
1139 					       engine->name, test_name,
1140 					       rq->fence.context,
1141 					       rq->fence.seqno);
1142 					intel_engine_dump(engine, &p,
1143 							  "%s\n", engine->name);
1144 					i915_request_put(rq);
1145 
1146 					GEM_TRACE_DUMP();
1147 					intel_gt_set_wedged(gt);
1148 					err = -EIO;
1149 					goto restore;
1150 				}
1151 
1152 				i915_request_put(rq);
1153 			}
1154 
1155 			if (!(flags & TEST_ACTIVE))
1156 				intel_engine_pm_put(engine);
1157 
1158 			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
1159 				struct drm_printer p =
1160 					drm_info_printer(gt->i915->drm.dev);
1161 
1162 				pr_err("i915_reset_engine(%s:%s):"
1163 				       " failed to idle after reset\n",
1164 				       engine->name, test_name);
1165 				intel_engine_dump(engine, &p,
1166 						  "%s\n", engine->name);
1167 
1168 				err = -EIO;
1169 				goto restore;
1170 			}
1171 
1172 restore:
1173 			err2 = intel_selftest_restore_policy(engine, &saved);
1174 			if (err2)
1175 				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
1176 			if (err == 0)
1177 				err = err2;
1178 			if (err)
1179 				break;
1180 		} while (time_before(jiffies, end_time));
1181 		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
1182 		st_engine_heartbeat_enable_no_pm(engine);
1183 
1184 		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
1185 			engine->name, test_name, count);
1186 
1187 		/* GuC based resets are not logged per engine */
1188 		if (!using_guc) {
1189 			reported = i915_reset_engine_count(global, engine);
1190 			reported -= threads[engine->id].resets;
1191 			if (reported != count) {
1192 				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
1193 				       engine->name, test_name, count, reported);
1194 				if (!err)
1195 					err = -EINVAL;
1196 			}
1197 		}
1198 
1199 unwind:
1200 		for_each_engine(other, gt, tmp) {
1201 			int ret;
1202 
1203 			if (!threads[tmp].worker)
1204 				continue;
1205 
1206 			WRITE_ONCE(threads[tmp].stop, true);
1207 			kthread_flush_work(&threads[tmp].work);
1208 			ret = READ_ONCE(threads[tmp].result);
1209 			if (ret) {
1210 				pr_err("kthread for other engine %s failed, err=%d\n",
1211 				       other->name, ret);
1212 				if (!err)
1213 					err = ret;
1214 			}
1215 
1216 			kthread_destroy_worker(threads[tmp].worker);
1217 
1218 			/* GuC based resets are not logged per engine */
1219 			if (!using_guc) {
1220 				if (other->uabi_class != engine->uabi_class &&
1221 				    threads[tmp].resets !=
1222 				    i915_reset_engine_count(global, other)) {
1223 					pr_err("Innocent engine %s was reset (count=%ld)\n",
1224 					       other->name,
1225 					       i915_reset_engine_count(global, other) -
1226 					       threads[tmp].resets);
1227 					if (!err)
1228 						err = -EINVAL;
1229 				}
1230 			}
1231 		}
1232 
1233 		if (device != i915_reset_count(global)) {
1234 			pr_err("Global reset (count=%ld)!\n",
1235 			       i915_reset_count(global) - device);
1236 			if (!err)
1237 				err = -EINVAL;
1238 		}
1239 
1240 		if (err)
1241 			break;
1242 
1243 		err = igt_flush_test(gt->i915);
1244 		if (err) {
1245 			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1246 			break;
1247 		}
1248 	}
1249 	kfree(threads);
1250 
1251 	if (intel_gt_is_wedged(gt))
1252 		err = -EIO;
1253 
1254 	if (flags & TEST_ACTIVE)
1255 		hang_fini(&h);
1256 
1257 	return err;
1258 }
1259 
1260 static int igt_reset_engines(void *arg)
1261 {
1262 	static const struct {
1263 		const char *name;
1264 		unsigned int flags;
1265 	} phases[] = {
1266 		{ "idle", 0 },
1267 		{ "active", TEST_ACTIVE },
1268 		{ "others-idle", TEST_OTHERS },
1269 		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
1270 		{
1271 			"others-priority",
1272 			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
1273 		},
1274 		{
1275 			"self-priority",
1276 			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1277 		},
1278 		{ }
1279 	};
1280 	struct intel_gt *gt = arg;
1281 	typeof(*phases) *p;
1282 	int err;
1283 
1284 	for (p = phases; p->name; p++) {
1285 		if (p->flags & TEST_PRIORITY) {
1286 			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1287 				continue;
1288 		}
1289 
1290 		err = __igt_reset_engines(arg, p->name, p->flags);
1291 		if (err)
1292 			return err;
1293 	}
1294 
1295 	return 0;
1296 }
1297 
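/*
 * Pretend hangcheck fired: trigger the reset directly and return the global
 * reset count from beforehand so callers can check a reset was recorded.
 */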
1298 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1299 {
1300 	u32 count = i915_reset_count(&gt->i915->gpu_error);
1301 
1302 	intel_gt_reset(gt, mask, NULL);
1303 
1304 	return count;
1305 }
1306 
1307 static int igt_reset_wait(void *arg)
1308 {
1309 	struct intel_gt *gt = arg;
1310 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1311 	struct intel_engine_cs *engine;
1312 	struct i915_request *rq;
1313 	unsigned int reset_count;
1314 	struct hang h;
1315 	long timeout;
1316 	int err;
1317 
1318 	engine = intel_selftest_find_any_engine(gt);
1319 
1320 	if (!engine || !intel_engine_can_store_dword(engine))
1321 		return 0;
1322 
1323 	/* Check that we detect a stuck waiter and issue a reset */
1324 
1325 	igt_global_reset_lock(gt);
1326 
1327 	err = hang_init(&h, gt);
1328 	if (err) {
1329 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1330 		goto unlock;
1331 	}
1332 
1333 	rq = hang_create_request(&h, engine);
1334 	if (IS_ERR(rq)) {
1335 		err = PTR_ERR(rq);
1336 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1337 		goto fini;
1338 	}
1339 
1340 	i915_request_get(rq);
1341 	i915_request_add(rq);
1342 
1343 	if (!wait_until_running(&h, rq)) {
1344 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1345 
1346 		pr_err("%s: Failed to start request %llx, at %x\n",
1347 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1348 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1349 
1350 		intel_gt_set_wedged(gt);
1351 
1352 		err = -EIO;
1353 		goto out_rq;
1354 	}
1355 
1356 	reset_count = fake_hangcheck(gt, ALL_ENGINES);
1357 
1358 	timeout = i915_request_wait(rq, 0, 10);
1359 	if (timeout < 0) {
1360 		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1361 		       timeout);
1362 		err = timeout;
1363 		goto out_rq;
1364 	}
1365 
1366 	if (i915_reset_count(global) == reset_count) {
1367 		pr_err("No GPU reset recorded!\n");
1368 		err = -EINVAL;
1369 		goto out_rq;
1370 	}
1371 
1372 out_rq:
1373 	i915_request_put(rq);
1374 fini:
1375 	hang_fini(&h);
1376 unlock:
1377 	igt_global_reset_unlock(gt);
1378 
1379 	if (intel_gt_is_wedged(gt))
1380 		return -EIO;
1381 
1382 	return err;
1383 }
1384 
1385 struct evict_vma {
1386 	struct completion completion;
1387 	struct i915_vma *vma;
1388 };
1389 
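/*
 * Runs in a kthread: try to evict the node used by the hanging request;
 * this is expected to block until the reset aborts that request.
 */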
1390 static int evict_vma(void *data)
1391 {
1392 	struct evict_vma *arg = data;
1393 	struct i915_address_space *vm = arg->vma->vm;
1394 	struct drm_mm_node evict = arg->vma->node;
1395 	int err;
1396 
1397 	complete(&arg->completion);
1398 
1399 	mutex_lock(&vm->mutex);
1400 	err = i915_gem_evict_for_node(vm, NULL, &evict, 0);
1401 	mutex_unlock(&vm->mutex);
1402 
1403 	return err;
1404 }
1405 
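/*
 * Runs in a kthread: switch the object to Y-tiling and re-pin its fence
 * while it is still busy in the hanging request; this should not complete
 * until the reset has killed that request.
 */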
1406 static int evict_fence(void *data)
1407 {
1408 	struct evict_vma *arg = data;
1409 	int err;
1410 
1411 	complete(&arg->completion);
1412 
1413 	/* Mark the fence register as dirty to force the mmio update. */
1414 	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1415 	if (err) {
1416 		pr_err("Invalid Y-tiling settings; err:%d\n", err);
1417 		return err;
1418 	}
1419 
1420 	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1421 	if (err) {
1422 		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1423 		return err;
1424 	}
1425 
1426 	err = i915_vma_pin_fence(arg->vma);
1427 	i915_vma_unpin(arg->vma);
1428 	if (err) {
1429 		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1430 		return err;
1431 	}
1432 
1433 	i915_vma_unpin_fence(arg->vma);
1434 
1435 	return 0;
1436 }
1437 
1438 static int __igt_reset_evict_vma(struct intel_gt *gt,
1439 				 struct i915_address_space *vm,
1440 				 int (*fn)(void *),
1441 				 unsigned int flags)
1442 {
1443 	struct intel_engine_cs *engine;
1444 	struct drm_i915_gem_object *obj;
1445 	struct task_struct *tsk = NULL;
1446 	struct i915_request *rq;
1447 	struct evict_vma arg;
1448 	struct hang h;
1449 	unsigned int pin_flags;
1450 	int err;
1451 
1452 	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1453 		return 0;
1454 
1455 	engine = intel_selftest_find_any_engine(gt);
1456 
1457 	if (!engine || !intel_engine_can_store_dword(engine))
1458 		return 0;
1459 
1460 	/* Check that we can recover an unbind stuck on a hanging request */
1461 
1462 	err = hang_init(&h, gt);
1463 	if (err) {
1464 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1465 		return err;
1466 	}
1467 
1468 	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1469 	if (IS_ERR(obj)) {
1470 		err = PTR_ERR(obj);
1471 		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
1472 		goto fini;
1473 	}
1474 
1475 	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1476 		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1477 		if (err) {
1478 			pr_err("Invalid X-tiling settings; err:%d\n", err);
1479 			goto out_obj;
1480 		}
1481 	}
1482 
1483 	arg.vma = i915_vma_instance(obj, vm, NULL);
1484 	if (IS_ERR(arg.vma)) {
1485 		err = PTR_ERR(arg.vma);
1486 		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
1487 		goto out_obj;
1488 	}
1489 
1490 	rq = hang_create_request(&h, engine);
1491 	if (IS_ERR(rq)) {
1492 		err = PTR_ERR(rq);
1493 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1494 		goto out_obj;
1495 	}
1496 
1497 	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1498 
1499 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
1500 		pin_flags |= PIN_MAPPABLE;
1501 
1502 	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1503 	if (err) {
1504 		i915_request_add(rq);
1505 		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
1506 		goto out_obj;
1507 	}
1508 
1509 	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1510 		err = i915_vma_pin_fence(arg.vma);
1511 		if (err) {
1512 			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1513 			i915_vma_unpin(arg.vma);
1514 			i915_request_add(rq);
1515 			goto out_obj;
1516 		}
1517 	}
1518 
1519 	i915_vma_lock(arg.vma);
1520 	err = i915_vma_move_to_active(arg.vma, rq, flags);
1521 	if (err)
1522 		pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
1523 
1524 	i915_vma_unlock(arg.vma);
1525 
1526 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
1527 		i915_vma_unpin_fence(arg.vma);
1528 	i915_vma_unpin(arg.vma);
1529 
1530 	i915_request_get(rq);
1531 	i915_request_add(rq);
1532 	if (err)
1533 		goto out_rq;
1534 
1535 	if (!wait_until_running(&h, rq)) {
1536 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1537 
1538 		pr_err("%s: Failed to start request %llx, at %x\n",
1539 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1540 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1541 
1542 		intel_gt_set_wedged(gt);
1543 		goto out_reset;
1544 	}
1545 
1546 	init_completion(&arg.completion);
1547 
1548 	tsk = kthread_run(fn, &arg, "igt/evict_vma");
1549 	if (IS_ERR(tsk)) {
1550 		err = PTR_ERR(tsk);
1551 		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1552 		tsk = NULL;
1553 		goto out_reset;
1554 	}
1555 	get_task_struct(tsk);
1556 
1557 	wait_for_completion(&arg.completion);
1558 
1559 	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1560 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1561 
1562 		pr_err("igt/evict_vma kthread did not wait\n");
1563 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1564 
1565 		intel_gt_set_wedged(gt);
1566 		goto out_reset;
1567 	}
1568 
1569 out_reset:
1570 	igt_global_reset_lock(gt);
1571 	fake_hangcheck(gt, rq->engine->mask);
1572 	igt_global_reset_unlock(gt);
1573 
1574 	if (tsk) {
1575 		struct intel_wedge_me w;
1576 
1577 		/* The reset, even indirectly, should take less than 10ms. */
1578 		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1579 			err = kthread_stop(tsk);
1580 
1581 		put_task_struct(tsk);
1582 	}
1583 
1584 out_rq:
1585 	i915_request_put(rq);
1586 out_obj:
1587 	i915_gem_object_put(obj);
1588 fini:
1589 	hang_fini(&h);
1590 	if (intel_gt_is_wedged(gt))
1591 		return -EIO;
1592 
1593 	return err;
1594 }
1595 
1596 static int igt_reset_evict_ggtt(void *arg)
1597 {
1598 	struct intel_gt *gt = arg;
1599 
1600 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1601 				     evict_vma, EXEC_OBJECT_WRITE);
1602 }
1603 
1604 static int igt_reset_evict_ppgtt(void *arg)
1605 {
1606 	struct intel_gt *gt = arg;
1607 	struct i915_ppgtt *ppgtt;
1608 	int err;
1609 
1610 	/* aliasing == global gtt locking, covered above */
1611 	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1612 		return 0;
1613 
1614 	ppgtt = i915_ppgtt_create(gt, 0);
1615 	if (IS_ERR(ppgtt))
1616 		return PTR_ERR(ppgtt);
1617 
1618 	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1619 				    evict_vma, EXEC_OBJECT_WRITE);
1620 	i915_vm_put(&ppgtt->vm);
1621 
1622 	return err;
1623 }
1624 
1625 static int igt_reset_evict_fence(void *arg)
1626 {
1627 	struct intel_gt *gt = arg;
1628 
1629 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1630 				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1631 }
1632 
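/* Wait for every engine other than @exclude to settle back to idle. */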
1633 static int wait_for_others(struct intel_gt *gt,
1634 			   struct intel_engine_cs *exclude)
1635 {
1636 	struct intel_engine_cs *engine;
1637 	enum intel_engine_id id;
1638 
1639 	for_each_engine(engine, gt, id) {
1640 		if (engine == exclude)
1641 			continue;
1642 
1643 		if (!wait_for_idle(engine))
1644 			return -EIO;
1645 	}
1646 
1647 	return 0;
1648 }
1649 
1650 static int igt_reset_queue(void *arg)
1651 {
1652 	struct intel_gt *gt = arg;
1653 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1654 	struct intel_engine_cs *engine;
1655 	enum intel_engine_id id;
1656 	struct hang h;
1657 	int err;
1658 
1659 	/* Check that we replay pending requests following a hang */
1660 
1661 	igt_global_reset_lock(gt);
1662 
1663 	err = hang_init(&h, gt);
1664 	if (err)
1665 		goto unlock;
1666 
1667 	for_each_engine(engine, gt, id) {
1668 		struct intel_selftest_saved_policy saved;
1669 		struct i915_request *prev;
1670 		IGT_TIMEOUT(end_time);
1671 		unsigned int count;
1672 		bool using_guc = intel_engine_uses_guc(engine);
1673 
1674 		if (!intel_engine_can_store_dword(engine))
1675 			continue;
1676 
1677 		if (using_guc) {
1678 			err = intel_selftest_modify_policy(engine, &saved,
1679 							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
1680 			if (err) {
1681 				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1682 				goto fini;
1683 			}
1684 		}
1685 
1686 		prev = hang_create_request(&h, engine);
1687 		if (IS_ERR(prev)) {
1688 			err = PTR_ERR(prev);
1689 			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
1690 			goto restore;
1691 		}
1692 
1693 		i915_request_get(prev);
1694 		i915_request_add(prev);
1695 
1696 		count = 0;
1697 		do {
1698 			struct i915_request *rq;
1699 			unsigned int reset_count;
1700 
1701 			rq = hang_create_request(&h, engine);
1702 			if (IS_ERR(rq)) {
1703 				err = PTR_ERR(rq);
1704 				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1705 				goto restore;
1706 			}
1707 
1708 			i915_request_get(rq);
1709 			i915_request_add(rq);
1710 
1711 			/*
1712 			 * XXX We don't handle resetting the kernel context
1713 			 * very well. If we trigger a device reset twice in
1714 			 * quick succession while the kernel context is
1715 			 * executing, we may end up skipping the breadcrumb.
1716 			 * This is really only a problem for the selftest as
1717 			 * normally there is a large interlude between resets
1718 			 * (hangcheck), or we focus on resetting just one
1719 			 * engine and so avoid repeatedly resetting innocents.
1720 			 */
1721 			err = wait_for_others(gt, engine);
1722 			if (err) {
1723 				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1724 				       __func__, engine->name);
1725 				i915_request_put(rq);
1726 				i915_request_put(prev);
1727 
1728 				GEM_TRACE_DUMP();
1729 				intel_gt_set_wedged(gt);
1730 				goto restore;
1731 			}
1732 
1733 			if (!wait_until_running(&h, prev)) {
1734 				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1735 
1736 				pr_err("%s(%s): Failed to start request %llx, at %x\n",
1737 				       __func__, engine->name,
1738 				       prev->fence.seqno, hws_seqno(&h, prev));
1739 				intel_engine_dump(engine, &p,
1740 						  "%s\n", engine->name);
1741 
1742 				i915_request_put(rq);
1743 				i915_request_put(prev);
1744 
1745 				intel_gt_set_wedged(gt);
1746 
1747 				err = -EIO;
1748 				goto restore;
1749 			}
1750 
1751 			reset_count = fake_hangcheck(gt, BIT(id));
1752 
1753 			if (prev->fence.error != -EIO) {
1754 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1755 				       prev->fence.error);
1756 				i915_request_put(rq);
1757 				i915_request_put(prev);
1758 				err = -EINVAL;
1759 				goto restore;
1760 			}
1761 
1762 			if (rq->fence.error) {
1763 				pr_err("Fence error status not zero [%d] after unrelated reset\n",
1764 				       rq->fence.error);
1765 				i915_request_put(rq);
1766 				i915_request_put(prev);
1767 				err = -EINVAL;
1768 				goto restore;
1769 			}
1770 
1771 			if (i915_reset_count(global) == reset_count) {
1772 				pr_err("No GPU reset recorded!\n");
1773 				i915_request_put(rq);
1774 				i915_request_put(prev);
1775 				err = -EINVAL;
1776 				goto restore;
1777 			}
1778 
1779 			i915_request_put(prev);
1780 			prev = rq;
1781 			count++;
1782 		} while (time_before(jiffies, end_time));
1783 		pr_info("%s: Completed %d queued resets\n",
1784 			engine->name, count);
1785 
1786 		*h.batch = MI_BATCH_BUFFER_END;
1787 		intel_gt_chipset_flush(engine->gt);
1788 
1789 		i915_request_put(prev);
1790 
1791 restore:
1792 		if (using_guc) {
1793 			int err2 = intel_selftest_restore_policy(engine, &saved);
1794 
1795 			if (err2)
1796 				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
1797 				       __func__, __LINE__, engine->name, err2);
1798 			if (err == 0)
1799 				err = err2;
1800 		}
1801 		if (err)
1802 			goto fini;
1803 
1804 		err = igt_flush_test(gt->i915);
1805 		if (err) {
1806 			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1807 			break;
1808 		}
1809 	}
1810 
1811 fini:
1812 	hang_fini(&h);
1813 unlock:
1814 	igt_global_reset_unlock(gt);
1815 
1816 	if (intel_gt_is_wedged(gt))
1817 		return -EIO;
1818 
1819 	return err;
1820 }
1821 
1822 static int igt_handle_error(void *arg)
1823 {
1824 	struct intel_gt *gt = arg;
1825 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1826 	struct intel_engine_cs *engine;
1827 	struct hang h;
1828 	struct i915_request *rq;
1829 	struct i915_gpu_coredump *error;
1830 	int err;
1831 
1832 	engine = intel_selftest_find_any_engine(gt);
1833 
1834 	/* Check that we can issue a global GPU and engine reset */
1835 
1836 	if (!intel_has_reset_engine(gt))
1837 		return 0;
1838 
1839 	if (!engine || !intel_engine_can_store_dword(engine))
1840 		return 0;
1841 
1842 	err = hang_init(&h, gt);
1843 	if (err) {
1844 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1845 		return err;
1846 	}
1847 
1848 	rq = hang_create_request(&h, engine);
1849 	if (IS_ERR(rq)) {
1850 		err = PTR_ERR(rq);
1851 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1852 		goto err_fini;
1853 	}
1854 
1855 	i915_request_get(rq);
1856 	i915_request_add(rq);
1857 
1858 	if (!wait_until_running(&h, rq)) {
1859 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1860 
1861 		pr_err("%s: Failed to start request %llx, at %x\n",
1862 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1863 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1864 
1865 		intel_gt_set_wedged(gt);
1866 
1867 		err = -EIO;
1868 		goto err_request;
1869 	}
1870 
1871 	/* Temporarily disable error capture */
1872 	error = xchg(&global->first_error, (void *)-1);
1873 
1874 	intel_gt_handle_error(gt, engine->mask, 0, NULL);
1875 
1876 	xchg(&global->first_error, error);
1877 
1878 	if (rq->fence.error != -EIO) {
1879 		pr_err("Guilty request not identified!\n");
1880 		err = -EINVAL;
1881 		goto err_request;
1882 	}
1883 
1884 err_request:
1885 	i915_request_put(rq);
1886 err_fini:
1887 	hang_fini(&h);
1888 	return err;
1889 }
1890 
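/*
 * Reset an engine from (pseudo-)atomic context: disable the submission
 * tasklet and, unless the phase being tested is softirq itself, softirqs,
 * then perform the reset inside the phase's critical section.
 */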
1891 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1892 				     const struct igt_atomic_section *p,
1893 				     const char *mode)
1894 {
1895 	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
1896 	int err;
1897 
1898 	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1899 		  engine->name, mode, p->name);
1900 
1901 	if (t->func)
1902 		tasklet_disable(t);
1903 	if (strcmp(p->name, "softirq"))
1904 		local_bh_disable();
1905 	p->critical_section_begin();
1906 
1907 	err = __intel_engine_reset_bh(engine, NULL);
1908 
1909 	p->critical_section_end();
1910 	if (strcmp(p->name, "softirq"))
1911 		local_bh_enable();
1912 	if (t->func) {
1913 		tasklet_enable(t);
1914 		tasklet_hi_schedule(t);
1915 	}
1916 
1917 	if (err)
1918 		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1919 		       engine->name, mode, p->name);
1920 
1921 	return err;
1922 }
1923 
1924 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1925 				   const struct igt_atomic_section *p)
1926 {
1927 	struct i915_request *rq;
1928 	struct hang h;
1929 	int err;
1930 
1931 	err = __igt_atomic_reset_engine(engine, p, "idle");
1932 	if (err)
1933 		return err;
1934 
1935 	err = hang_init(&h, engine->gt);
1936 	if (err) {
1937 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1938 		return err;
1939 	}
1940 
1941 	rq = hang_create_request(&h, engine);
1942 	if (IS_ERR(rq)) {
1943 		err = PTR_ERR(rq);
1944 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1945 		goto out;
1946 	}
1947 
1948 	i915_request_get(rq);
1949 	i915_request_add(rq);
1950 
1951 	if (wait_until_running(&h, rq)) {
1952 		err = __igt_atomic_reset_engine(engine, p, "active");
1953 	} else {
1954 		pr_err("%s(%s): Failed to start request %llx, at %x\n",
1955 		       __func__, engine->name,
1956 		       rq->fence.seqno, hws_seqno(&h, rq));
1957 		intel_gt_set_wedged(engine->gt);
1958 		err = -EIO;
1959 	}
1960 
1961 	if (err == 0) {
1962 		struct intel_wedge_me w;
1963 
1964 		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1965 			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1966 		if (intel_gt_is_wedged(engine->gt))
1967 			err = -EIO;
1968 	}
1969 
1970 	i915_request_put(rq);
1971 out:
1972 	hang_fini(&h);
1973 	return err;
1974 }
1975 
1976 static int igt_reset_engines_atomic(void *arg)
1977 {
1978 	struct intel_gt *gt = arg;
1979 	const typeof(*igt_atomic_phases) *p;
1980 	int err = 0;
1981 
	/* Check that engine resets are usable from atomic context */
1983 
1984 	if (!intel_has_reset_engine(gt))
1985 		return 0;
1986 
1987 	if (intel_uc_uses_guc_submission(&gt->uc))
1988 		return 0;
1989 
1990 	igt_global_reset_lock(gt);
1991 
1992 	/* Flush any requests before we get started and check basics */
1993 	if (!igt_force_reset(gt))
1994 		goto unlock;
1995 
1996 	for (p = igt_atomic_phases; p->name; p++) {
1997 		struct intel_engine_cs *engine;
1998 		enum intel_engine_id id;
1999 
2000 		for_each_engine(engine, gt, id) {
2001 			err = igt_atomic_reset_engine(engine, p);
2002 			if (err)
2003 				goto out;
2004 		}
2005 	}
2006 
2007 out:
2008 	/* As we poke around the guts, do a full reset before continuing. */
2009 	igt_force_reset(gt);
2010 unlock:
2011 	igt_global_reset_unlock(gt);
2012 
2013 	return err;
2014 }
2015 
2016 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
2017 {
2018 	static const struct i915_subtest tests[] = {
2019 		SUBTEST(igt_hang_sanitycheck),
2020 		SUBTEST(igt_reset_nop),
2021 		SUBTEST(igt_reset_nop_engine),
2022 		SUBTEST(igt_reset_idle_engine),
2023 		SUBTEST(igt_reset_active_engine),
2024 		SUBTEST(igt_reset_fail_engine),
2025 		SUBTEST(igt_reset_engines),
2026 		SUBTEST(igt_reset_engines_atomic),
2027 		SUBTEST(igt_reset_queue),
2028 		SUBTEST(igt_reset_wait),
2029 		SUBTEST(igt_reset_evict_ggtt),
2030 		SUBTEST(igt_reset_evict_ppgtt),
2031 		SUBTEST(igt_reset_evict_fence),
2032 		SUBTEST(igt_handle_error),
2033 	};
2034 	struct intel_gt *gt = to_gt(i915);
2035 	intel_wakeref_t wakeref;
2036 	int err;
2037 
2038 	if (!intel_has_gpu_reset(gt))
2039 		return 0;
2040 
2041 	if (intel_gt_is_wedged(gt))
2042 		return -EIO; /* we're long past hope of a successful reset */
2043 
2044 	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
2045 
2046 	err = intel_gt_live_subtests(tests, gt);
2047 
2048 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
2049 
2050 	return err;
2051 }
2052