// SPDX-License-Identifier: MIT
/*
 * Copyright © 2016 Intel Corporation
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "i915_gem_evict.h"
#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"
#include "selftests/igt_spinner.h"
#include "selftests/intel_scheduler_helpers.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

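/*
 * Shared state for the hanging-batch helpers below: a kernel context and
 * two internal objects, one serving as a makeshift hangcheck page (hws)
 * into which each request writes its seqno, and one holding the spinning
 * batch itself.
 */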
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915, NULL);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
						 i915_coherent_map_type(gt->i915, h->obj, false));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

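/*
 * Each request reports in through its own u32 slot in the hws page,
 * indexed by the fence context id, wrapped to stay within the page.
 */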
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32) * rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

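/*
 * Build a request whose batch spins forever: it writes the request seqno
 * to the hws page (so wait_until_running() can observe that it has
 * started) and then branches back to its own start with
 * MI_BATCH_BUFFER_START. The loop is only broken by overwriting the
 * first dword with MI_BATCH_BUFFER_END (see hang_fini()) or by resetting
 * the engine.
 */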
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (GRAPHICS_VER(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (GRAPHICS_VER(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_ARB_CHECK;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_ARB_CHECK;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (GRAPHICS_VER(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE / sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

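/*
 * A hanging request is deemed running once it has written its seqno to
 * the hws page: poll tightly for 10us, then back off to a 1s timeout.
 */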
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

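/* Engines are given IGT_IDLE_TIMEOUT (50ms) to settle between subtests. */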
static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create request failed: %d!\n",
					       engine->name, err);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			pr_err("[%s] GT is wedged!\n", engine->name);
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("[%s] Reset not recorded: %d vs %d + %d!\n",
			       engine->name, i915_reset_count(global), reset_count, count);
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915)) {
		pr_err("Post flush failed: %d!\n", err);
		err = -EIO;
	}

	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		if (intel_engine_uses_guc(engine)) {
			/* Engine level resets are triggered by GuC when a hang
			 * is detected. They can't be triggered by the KMD any
			 * more. Thus a nop batch cannot be used as a reset test
			 */
			continue;
		}

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

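/*
 * Fault injection: persuade the next engine reset attempts to report
 * -ETIMEDOUT (the injection only fires on the gen8+ reset path), so
 * that igt_reset_fail_engine() can exercise the failure handling on
 * demand.
 */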
static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */
	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		/* Can't manually break the reset if i915 doesn't perform it */
		if (intel_engine_uses_guc(engine))
			continue;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
			return PTR_ERR(ce);
		}

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long count;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (using_guc && !active)
			continue;

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		count = 0;
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (active) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("intel_engine_reset(%s) failed, err:%d\n",
					       engine->name, err);
					goto skip;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

skip:
			if (rq)
				i915_request_put(rq);

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				goto restore;
			}

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (i915_reset_engine_count(global, engine) !=
				    ++reset_engine_count) {
					pr_err("%s engine reset not recorded!\n",
					       engine->name);
					err = -EINVAL;
					goto restore;
				}
			}

			count++;

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt)) {
		pr_err("GT is wedged!\n");
		err = -EIO;
	}

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

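/*
 * Retire a background request, allowing it 10s to complete; if it does
 * not, declare the GT wedged and return -EIO so the test fails loudly.
 */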
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

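/*
 * kthread body used by __igt_reset_engines(): keep a sliding window of
 * eight requests in flight on its engine, optionally at randomised
 * priorities, until asked to stop.
 */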
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
			while (count--)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->sched_engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err) {
			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
			break;
		}

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		if (err__)
			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		bool using_guc = intel_engine_uses_guc(engine);
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE) {
			if (!intel_engine_can_store_dword(engine))
				continue;
		} else if (using_guc)
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		st_engine_heartbeat_disable_no_pm(engine);
		GEM_BUG_ON(test_and_set_bit(I915_RESET_ENGINE + id,
					    &gt->reset.flags));
		do {
			struct i915_request *rq = NULL;
			struct intel_selftest_saved_policy saved;
			int err2;

			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				break;
			}

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					pr_err("[%s] Create hang request failed: %d!\n",
					       engine->name, err);
					goto restore;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					goto restore;
				}
			} else {
				intel_engine_pm_get(engine);
			}

			if (!using_guc) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
					       engine->name, test_name, err);
					goto restore;
				}
			}

			if (rq) {
				/* Ensure the reset happens and kills the engine */
				err = intel_selftest_wait_for_rq(rq);
				if (err)
					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
					       engine->name, rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id, err);
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno, rq->context->guc_id.id);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					goto restore;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_ACTIVE))
				intel_engine_pm_put(engine);

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				goto restore;
			}

restore:
			err2 = intel_selftest_restore_policy(engine, &saved);
			if (err2)
				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
			if (err == 0)
				err = err2;
			if (err)
				break;
		} while (time_before(jiffies, end_time));
		clear_and_wake_up_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable_no_pm(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		/* GuC based resets are not logged per engine */
		if (!using_guc) {
			reported = i915_reset_engine_count(global, engine);
			reported -= threads[engine->id].resets;
			if (reported != count) {
				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
				       engine->name, test_name, count, reported);
				if (!err)
					err = -EINVAL;
			}
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			/* GuC based resets are not logged per engine */
			if (!using_guc) {
				if (other->uabi_class != engine->uabi_class &&
				    threads[tmp].resets !=
				    i915_reset_engine_count(global, other)) {
					pr_err("Innocent engine %s was reset (count=%ld)\n",
					       other->name,
					       i915_reset_engine_count(global, other) -
					       threads[tmp].resets);
					if (!err)
						err = -EINVAL;
				}
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

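/*
 * Pretend hangcheck fired: reset the given engines directly and return
 * the global reset count sampled beforehand, so callers can verify
 * whether a reset was recorded.
 */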
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		goto unlock;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

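/*
 * Kthread bodies for __igt_reset_evict_vma(): while a hanging batch
 * keeps the vma busy, evict_vma() tries to evict its node under the vm
 * mutex and evict_fence() tries to reassign a fence register; both are
 * expected to block on the hung request until the reset releases it.
 */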
struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0) {
		err = i915_vma_move_to_active(arg.vma, rq, flags);
		if (err)
			pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
	} else {
		pr_err("[%s] Request await failed: %d!\n", engine->name, err);
	}

	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt, 0);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

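/*
 * Idle every engine other than the one about to be reset, so that the
 * device reset does not catch the kernel context mid-flight on an
 * innocent engine (see the XXX comment in igt_reset_queue()).
 */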
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct intel_selftest_saved_policy saved;
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;
		bool using_guc = intel_engine_uses_guc(engine);

		if (!intel_engine_can_store_dword(engine))
			continue;

		if (using_guc) {
			err = intel_selftest_modify_policy(engine, &saved,
							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
			if (err) {
				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
				goto fini;
			}
		}

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
			goto restore;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
				goto restore;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto restore;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto restore;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto restore;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

restore:
		if (using_guc) {
			int err2 = intel_selftest_restore_policy(engine, &saved);

			if (err2)
				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
				       __func__, __LINE__, engine->name, err2);
			if (err == 0)
				err = err2;
		}
		if (err)
			goto fini;

		err = igt_flush_test(gt->i915);
		if (err) {
			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
			break;
		}
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

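/*
 * Issue an engine reset from (simulated) atomic context: with the
 * submission tasklet disabled and the requested critical section held,
 * the reset must neither sleep nor reschedule.
 */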
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err) {
		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
		return err;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that the engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = to_gt(i915);
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}