1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2016 Intel Corporation
4  */
5 
6 #include <linux/kthread.h>
7 
8 #include "gem/i915_gem_context.h"
9 
10 #include "i915_gem_evict.h"
11 #include "intel_gt.h"
12 #include "intel_engine_heartbeat.h"
13 #include "intel_engine_pm.h"
14 #include "selftest_engine_heartbeat.h"
15 
16 #include "i915_selftest.h"
17 #include "selftests/i915_random.h"
18 #include "selftests/igt_flush_test.h"
19 #include "selftests/igt_reset.h"
20 #include "selftests/igt_atomic.h"
21 #include "selftests/igt_spinner.h"
22 #include "selftests/intel_scheduler_helpers.h"
23 
24 #include "selftests/mock_drm.h"
25 
26 #include "gem/selftests/mock_context.h"
27 #include "gem/selftests/igt_gem_utils.h"
28 
29 #define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */
30 
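/*
 * Fixture shared by the hang/reset tests: a kernel context plus two internal
 * objects. h->batch is CPU-mapped batch memory used to emit a spinning batch
 * (see hang_create_request()), and h->seqno is a write-back mapped page that
 * the spinner writes its request seqno into, so the CPU can tell when the
 * batch has actually started executing.
 */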
31 struct hang {
32 	struct intel_gt *gt;
33 	struct drm_i915_gem_object *hws;
34 	struct drm_i915_gem_object *obj;
35 	struct i915_gem_context *ctx;
36 	u32 *seqno;
37 	u32 *batch;
38 };
39 
40 static int hang_init(struct hang *h, struct intel_gt *gt)
41 {
42 	void *vaddr;
43 	int err;
44 
45 	memset(h, 0, sizeof(*h));
46 	h->gt = gt;
47 
48 	h->ctx = kernel_context(gt->i915, NULL);
49 	if (IS_ERR(h->ctx))
50 		return PTR_ERR(h->ctx);
51 
52 	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));
53 
54 	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
55 	if (IS_ERR(h->hws)) {
56 		err = PTR_ERR(h->hws);
57 		goto err_ctx;
58 	}
59 
60 	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
61 	if (IS_ERR(h->obj)) {
62 		err = PTR_ERR(h->obj);
63 		goto err_hws;
64 	}
65 
66 	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
67 	vaddr = i915_gem_object_pin_map_unlocked(h->hws, I915_MAP_WB);
68 	if (IS_ERR(vaddr)) {
69 		err = PTR_ERR(vaddr);
70 		goto err_obj;
71 	}
72 	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);
73 
74 	vaddr = i915_gem_object_pin_map_unlocked(h->obj,
75 						 i915_coherent_map_type(gt->i915, h->obj, false));
76 	if (IS_ERR(vaddr)) {
77 		err = PTR_ERR(vaddr);
78 		goto err_unpin_hws;
79 	}
80 	h->batch = vaddr;
81 
82 	return 0;
83 
84 err_unpin_hws:
85 	i915_gem_object_unpin_map(h->hws);
86 err_obj:
87 	i915_gem_object_put(h->obj);
88 err_hws:
89 	i915_gem_object_put(h->hws);
90 err_ctx:
91 	kernel_context_close(h->ctx);
92 	return err;
93 }
94 
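/*
 * GPU address of the per-context dword slot inside the seqno page; the
 * indexing (fence.context, wrapped at a page) must match hws_seqno() below.
 */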
95 static u64 hws_address(const struct i915_vma *hws,
96 		       const struct i915_request *rq)
97 {
98 	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
99 }
100 
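/*
 * Serialise the request against the object backing @vma and mark the vma
 * active (busy) for the lifetime of @rq, all under the vma lock.
 */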
101 static int move_to_active(struct i915_vma *vma,
102 			  struct i915_request *rq,
103 			  unsigned int flags)
104 {
105 	int err;
106 
107 	i915_vma_lock(vma);
108 	err = i915_request_await_object(rq, vma->obj,
109 					flags & EXEC_OBJECT_WRITE);
110 	if (err == 0)
111 		err = i915_vma_move_to_active(vma, rq, flags);
112 	i915_vma_unlock(vma);
113 
114 	return err;
115 }
116 
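/*
 * Build a request whose batch stores the request's seqno into its slot in the
 * seqno page and then branches back to the start of the batch, spinning until
 * either the first dword is overwritten with MI_BATCH_BUFFER_END or the
 * request is reset. A fresh backing object is allocated on every call and
 * replaces h->obj/h->batch, and the commands emitted depend on GRAPHICS_VER.
 */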
117 static struct i915_request *
118 hang_create_request(struct hang *h, struct intel_engine_cs *engine)
119 {
120 	struct intel_gt *gt = h->gt;
121 	struct i915_address_space *vm = i915_gem_context_get_eb_vm(h->ctx);
122 	struct drm_i915_gem_object *obj;
123 	struct i915_request *rq = NULL;
124 	struct i915_vma *hws, *vma;
125 	unsigned int flags;
126 	void *vaddr;
127 	u32 *batch;
128 	int err;
129 
130 	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
131 	if (IS_ERR(obj)) {
132 		i915_vm_put(vm);
133 		return ERR_CAST(obj);
134 	}
135 
136 	vaddr = i915_gem_object_pin_map_unlocked(obj, i915_coherent_map_type(gt->i915, obj, false));
137 	if (IS_ERR(vaddr)) {
138 		i915_gem_object_put(obj);
139 		i915_vm_put(vm);
140 		return ERR_CAST(vaddr);
141 	}
142 
143 	i915_gem_object_unpin_map(h->obj);
144 	i915_gem_object_put(h->obj);
145 
146 	h->obj = obj;
147 	h->batch = vaddr;
148 
149 	vma = i915_vma_instance(h->obj, vm, NULL);
150 	if (IS_ERR(vma)) {
151 		i915_vm_put(vm);
152 		return ERR_CAST(vma);
153 	}
154 
155 	hws = i915_vma_instance(h->hws, vm, NULL);
156 	if (IS_ERR(hws)) {
157 		i915_vm_put(vm);
158 		return ERR_CAST(hws);
159 	}
160 
161 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
162 	if (err) {
163 		i915_vm_put(vm);
164 		return ERR_PTR(err);
165 	}
166 
167 	err = i915_vma_pin(hws, 0, 0, PIN_USER);
168 	if (err)
169 		goto unpin_vma;
170 
171 	rq = igt_request_alloc(h->ctx, engine);
172 	if (IS_ERR(rq)) {
173 		err = PTR_ERR(rq);
174 		goto unpin_hws;
175 	}
176 
177 	err = move_to_active(vma, rq, 0);
178 	if (err)
179 		goto cancel_rq;
180 
181 	err = move_to_active(hws, rq, 0);
182 	if (err)
183 		goto cancel_rq;
184 
185 	batch = h->batch;
186 	if (GRAPHICS_VER(gt->i915) >= 8) {
187 		*batch++ = MI_STORE_DWORD_IMM_GEN4;
188 		*batch++ = lower_32_bits(hws_address(hws, rq));
189 		*batch++ = upper_32_bits(hws_address(hws, rq));
190 		*batch++ = rq->fence.seqno;
191 		*batch++ = MI_NOOP;
192 
193 		memset(batch, 0, 1024);
194 		batch += 1024 / sizeof(*batch);
195 
196 		*batch++ = MI_NOOP;
197 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
198 		*batch++ = lower_32_bits(vma->node.start);
199 		*batch++ = upper_32_bits(vma->node.start);
200 	} else if (GRAPHICS_VER(gt->i915) >= 6) {
201 		*batch++ = MI_STORE_DWORD_IMM_GEN4;
202 		*batch++ = 0;
203 		*batch++ = lower_32_bits(hws_address(hws, rq));
204 		*batch++ = rq->fence.seqno;
205 		*batch++ = MI_NOOP;
206 
207 		memset(batch, 0, 1024);
208 		batch += 1024 / sizeof(*batch);
209 
210 		*batch++ = MI_NOOP;
211 		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
212 		*batch++ = lower_32_bits(vma->node.start);
213 	} else if (GRAPHICS_VER(gt->i915) >= 4) {
214 		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
215 		*batch++ = 0;
216 		*batch++ = lower_32_bits(hws_address(hws, rq));
217 		*batch++ = rq->fence.seqno;
218 		*batch++ = MI_NOOP;
219 
220 		memset(batch, 0, 1024);
221 		batch += 1024 / sizeof(*batch);
222 
223 		*batch++ = MI_NOOP;
224 		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
225 		*batch++ = lower_32_bits(vma->node.start);
226 	} else {
227 		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
228 		*batch++ = lower_32_bits(hws_address(hws, rq));
229 		*batch++ = rq->fence.seqno;
230 		*batch++ = MI_NOOP;
231 
232 		memset(batch, 0, 1024);
233 		batch += 1024 / sizeof(*batch);
234 
235 		*batch++ = MI_NOOP;
236 		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
237 		*batch++ = lower_32_bits(vma->node.start);
238 	}
239 	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
240 	intel_gt_chipset_flush(engine->gt);
241 
242 	if (rq->engine->emit_init_breadcrumb) {
243 		err = rq->engine->emit_init_breadcrumb(rq);
244 		if (err)
245 			goto cancel_rq;
246 	}
247 
248 	flags = 0;
249 	if (GRAPHICS_VER(gt->i915) <= 5)
250 		flags |= I915_DISPATCH_SECURE;
251 
252 	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);
253 
254 cancel_rq:
255 	if (err) {
256 		i915_request_set_error_once(rq, err);
257 		i915_request_add(rq);
258 	}
259 unpin_hws:
260 	i915_vma_unpin(hws);
261 unpin_vma:
262 	i915_vma_unpin(vma);
263 	i915_vm_put(vm);
264 	return err ? ERR_PTR(err) : rq;
265 }
266 
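/* CPU read-back of the seqno last written by the spinner for this context. */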
267 static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
268 {
269 	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
270 }
271 
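/*
 * Terminate any still-spinning batch by writing MI_BATCH_BUFFER_END over its
 * first dword, then release the fixture's objects and context and flush.
 */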
272 static void hang_fini(struct hang *h)
273 {
274 	*h->batch = MI_BATCH_BUFFER_END;
275 	intel_gt_chipset_flush(h->gt);
276 
277 	i915_gem_object_unpin_map(h->obj);
278 	i915_gem_object_put(h->obj);
279 
280 	i915_gem_object_unpin_map(h->hws);
281 	i915_gem_object_put(h->hws);
282 
283 	kernel_context_close(h->ctx);
284 
285 	igt_flush_test(h->gt->i915);
286 }
287 
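/*
 * Wait for the spinner to report that it is running on the GPU: a quick 10us
 * busy-wait first, then a sleeping wait of up to a second on the seqno write.
 */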
288 static bool wait_until_running(struct hang *h, struct i915_request *rq)
289 {
290 	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
291 					       rq->fence.seqno),
292 			     10) &&
293 		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
294 					    rq->fence.seqno),
295 			  1000));
296 }
297 
298 static int igt_hang_sanitycheck(void *arg)
299 {
300 	struct intel_gt *gt = arg;
301 	struct i915_request *rq;
302 	struct intel_engine_cs *engine;
303 	enum intel_engine_id id;
304 	struct hang h;
305 	int err;
306 
307 	/* Basic check that we can execute our hanging batch */
308 
309 	err = hang_init(&h, gt);
310 	if (err)
311 		return err;
312 
313 	for_each_engine(engine, gt, id) {
314 		struct intel_wedge_me w;
315 		long timeout;
316 
317 		if (!intel_engine_can_store_dword(engine))
318 			continue;
319 
320 		rq = hang_create_request(&h, engine);
321 		if (IS_ERR(rq)) {
322 			err = PTR_ERR(rq);
323 			pr_err("Failed to create request for %s, err=%d\n",
324 			       engine->name, err);
325 			goto fini;
326 		}
327 
328 		i915_request_get(rq);
329 
330 		*h.batch = MI_BATCH_BUFFER_END;
331 		intel_gt_chipset_flush(engine->gt);
332 
333 		i915_request_add(rq);
334 
335 		timeout = 0;
336 		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
337 			timeout = i915_request_wait(rq, 0,
338 						    MAX_SCHEDULE_TIMEOUT);
339 		if (intel_gt_is_wedged(gt))
340 			timeout = -EIO;
341 
342 		i915_request_put(rq);
343 
344 		if (timeout < 0) {
345 			err = timeout;
346 			pr_err("Wait for request failed on %s, err=%d\n",
347 			       engine->name, err);
348 			goto fini;
349 		}
350 	}
351 
352 fini:
353 	hang_fini(&h);
354 	return err;
355 }
356 
357 static bool wait_for_idle(struct intel_engine_cs *engine)
358 {
359 	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
360 }
361 
362 static int igt_reset_nop(void *arg)
363 {
364 	struct intel_gt *gt = arg;
365 	struct i915_gpu_error *global = &gt->i915->gpu_error;
366 	struct intel_engine_cs *engine;
367 	unsigned int reset_count, count;
368 	enum intel_engine_id id;
369 	IGT_TIMEOUT(end_time);
370 	int err = 0;
371 
372 	/* Check that we can reset during non-user portions of requests */
373 
374 	reset_count = i915_reset_count(global);
375 	count = 0;
376 	do {
377 		for_each_engine(engine, gt, id) {
378 			struct intel_context *ce;
379 			int i;
380 
381 			ce = intel_context_create(engine);
382 			if (IS_ERR(ce)) {
383 				err = PTR_ERR(ce);
384 				pr_err("[%s] Create context failed: %d!\n", engine->name, err);
385 				break;
386 			}
387 
388 			for (i = 0; i < 16; i++) {
389 				struct i915_request *rq;
390 
391 				rq = intel_context_create_request(ce);
392 				if (IS_ERR(rq)) {
393 					err = PTR_ERR(rq);
394 					pr_err("[%s] Create request failed: %d!\n",
395 					       engine->name, err);
396 					break;
397 				}
398 
399 				i915_request_add(rq);
400 			}
401 
402 			intel_context_put(ce);
403 		}
404 
405 		igt_global_reset_lock(gt);
406 		intel_gt_reset(gt, ALL_ENGINES, NULL);
407 		igt_global_reset_unlock(gt);
408 
409 		if (intel_gt_is_wedged(gt)) {
410 			pr_err("GT is wedged!\n");
411 			err = -EIO;
412 			break;
413 		}
414 
415 		if (i915_reset_count(global) != reset_count + ++count) {
416 			pr_err("Reset not recorded: %d vs %d + %d!\n",
417 			       i915_reset_count(global), reset_count, count);
418 			err = -EINVAL;
419 			break;
420 		}
421 
422 		err = igt_flush_test(gt->i915);
423 		if (err) {
424 			pr_err("Flush failed: %d!\n", err);
425 			break;
426 		}
427 	} while (time_before(jiffies, end_time));
428 	pr_info("%s: %d resets\n", __func__, count);
429 
430 	if (igt_flush_test(gt->i915)) {
431 		pr_err("Post flush failed!\n");
432 		err = -EIO;
433 	}
434 
435 	return err;
436 }
437 
438 static int igt_reset_nop_engine(void *arg)
439 {
440 	struct intel_gt *gt = arg;
441 	struct i915_gpu_error *global = &gt->i915->gpu_error;
442 	struct intel_engine_cs *engine;
443 	enum intel_engine_id id;
444 
445 	/* Check that we can engine-reset during non-user portions */
446 
447 	if (!intel_has_reset_engine(gt))
448 		return 0;
449 
450 	for_each_engine(engine, gt, id) {
451 		unsigned int reset_count, reset_engine_count, count;
452 		struct intel_context *ce;
453 		IGT_TIMEOUT(end_time);
454 		int err;
455 
456 		if (intel_engine_uses_guc(engine)) {
457 			/* Engine level resets are triggered by GuC when a hang
458 			 * is detected. They can't be triggered by the KMD any
459 			 * more. Thus a nop batch cannot be used as a reset test
460 			 */
461 			continue;
462 		}
463 
464 		ce = intel_context_create(engine);
465 		if (IS_ERR(ce)) {
466 			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
467 			return PTR_ERR(ce);
468 		}
469 
470 		reset_count = i915_reset_count(global);
471 		reset_engine_count = i915_reset_engine_count(global, engine);
472 		count = 0;
473 
474 		st_engine_heartbeat_disable(engine);
475 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
476 		do {
477 			int i;
478 
479 			if (!wait_for_idle(engine)) {
480 				pr_err("%s failed to idle before reset\n",
481 				       engine->name);
482 				err = -EIO;
483 				break;
484 			}
485 
486 			for (i = 0; i < 16; i++) {
487 				struct i915_request *rq;
488 
489 				rq = intel_context_create_request(ce);
490 				if (IS_ERR(rq)) {
491 					struct drm_printer p =
492 						drm_info_printer(gt->i915->drm.dev);
493 					intel_engine_dump(engine, &p,
494 							  "%s(%s): failed to submit request\n",
495 							  __func__,
496 							  engine->name);
497 
498 					GEM_TRACE("%s(%s): failed to submit request\n",
499 						  __func__,
500 						  engine->name);
501 					GEM_TRACE_DUMP();
502 
503 					intel_gt_set_wedged(gt);
504 
505 					err = PTR_ERR(rq);
506 					break;
507 				}
508 
509 				i915_request_add(rq);
510 			}
511 			err = intel_engine_reset(engine, NULL);
512 			if (err) {
513 				pr_err("intel_engine_reset(%s) failed, err:%d\n",
514 				       engine->name, err);
515 				break;
516 			}
517 
518 			if (i915_reset_count(global) != reset_count) {
519 				pr_err("Full GPU reset recorded! (engine reset expected)\n");
520 				err = -EINVAL;
521 				break;
522 			}
523 
524 			if (i915_reset_engine_count(global, engine) !=
525 			    reset_engine_count + ++count) {
526 				pr_err("%s engine reset not recorded!\n",
527 				       engine->name);
528 				err = -EINVAL;
529 				break;
530 			}
531 		} while (time_before(jiffies, end_time));
532 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
533 		st_engine_heartbeat_enable(engine);
534 
535 		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
536 
537 		intel_context_put(ce);
538 		if (igt_flush_test(gt->i915))
539 			err = -EIO;
540 		if (err)
541 			return err;
542 	}
543 
544 	return 0;
545 }
546 
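/*
 * force_reset_timeout() pokes the engine's selftest-only fault-injection
 * attributes so that subsequent engine resets are made to report a timeout
 * (only generated on gen8+, see the note in igt_reset_fail_engine());
 * cancel_reset_timeout() clears the injection again.
 */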
547 static void force_reset_timeout(struct intel_engine_cs *engine)
548 {
549 	engine->reset_timeout.probability = 999;
550 	atomic_set(&engine->reset_timeout.times, -1);
551 }
552 
553 static void cancel_reset_timeout(struct intel_engine_cs *engine)
554 {
555 	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
556 }
557 
558 static int igt_reset_fail_engine(void *arg)
559 {
560 	struct intel_gt *gt = arg;
561 	struct intel_engine_cs *engine;
562 	enum intel_engine_id id;
563 
564 	/* Check that we can recover from engine-reset failures */
565 
566 	if (!intel_has_reset_engine(gt))
567 		return 0;
568 
569 	for_each_engine(engine, gt, id) {
570 		unsigned int count;
571 		struct intel_context *ce;
572 		IGT_TIMEOUT(end_time);
573 		int err;
574 
575 		/* Can't manually break the reset if i915 doesn't perform it */
576 		if (intel_engine_uses_guc(engine))
577 			continue;
578 
579 		ce = intel_context_create(engine);
580 		if (IS_ERR(ce)) {
581 			pr_err("[%s] Create context failed: %pe!\n", engine->name, ce);
582 			return PTR_ERR(ce);
583 		}
584 
585 		st_engine_heartbeat_disable(engine);
586 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
587 
588 		force_reset_timeout(engine);
589 		err = intel_engine_reset(engine, NULL);
590 		cancel_reset_timeout(engine);
591 		if (err == 0) /* timeouts only generated on gen8+ */
592 			goto skip;
593 
594 		count = 0;
595 		do {
596 			struct i915_request *last = NULL;
597 			int i;
598 
599 			if (!wait_for_idle(engine)) {
600 				pr_err("%s failed to idle before reset\n",
601 				       engine->name);
602 				err = -EIO;
603 				break;
604 			}
605 
606 			for (i = 0; i < count % 15; i++) {
607 				struct i915_request *rq;
608 
609 				rq = intel_context_create_request(ce);
610 				if (IS_ERR(rq)) {
611 					struct drm_printer p =
612 						drm_info_printer(gt->i915->drm.dev);
613 					intel_engine_dump(engine, &p,
614 							  "%s(%s): failed to submit request\n",
615 							  __func__,
616 							  engine->name);
617 
618 					GEM_TRACE("%s(%s): failed to submit request\n",
619 						  __func__,
620 						  engine->name);
621 					GEM_TRACE_DUMP();
622 
623 					intel_gt_set_wedged(gt);
624 					if (last)
625 						i915_request_put(last);
626 
627 					err = PTR_ERR(rq);
628 					goto out;
629 				}
630 
631 				if (last)
632 					i915_request_put(last);
633 				last = i915_request_get(rq);
634 				i915_request_add(rq);
635 			}
636 
637 			if (count & 1) {
638 				err = intel_engine_reset(engine, NULL);
639 				if (err) {
640 					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
641 						      engine->name, err);
642 					GEM_TRACE_DUMP();
643 					i915_request_put(last);
644 					break;
645 				}
646 			} else {
647 				force_reset_timeout(engine);
648 				err = intel_engine_reset(engine, NULL);
649 				cancel_reset_timeout(engine);
650 				if (err != -ETIMEDOUT) {
651 					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
652 					       engine->name, err);
653 					i915_request_put(last);
654 					break;
655 				}
656 			}
657 
658 			err = 0;
659 			if (last) {
660 				if (i915_request_wait(last, 0, HZ / 2) < 0) {
661 					struct drm_printer p =
662 						drm_info_printer(gt->i915->drm.dev);
663 
664 					intel_engine_dump(engine, &p,
665 							  "%s(%s): failed to complete request\n",
666 							  __func__,
667 							  engine->name);
668 
669 					GEM_TRACE("%s(%s): failed to complete request\n",
670 						  __func__,
671 						  engine->name);
672 					GEM_TRACE_DUMP();
673 
674 					err = -EIO;
675 				}
676 				i915_request_put(last);
677 			}
678 			count++;
679 		} while (err == 0 && time_before(jiffies, end_time));
680 out:
681 		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
682 skip:
683 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
684 		st_engine_heartbeat_enable(engine);
685 		intel_context_put(ce);
686 
687 		if (igt_flush_test(gt->i915))
688 			err = -EIO;
689 		if (err)
690 			return err;
691 	}
692 
693 	return 0;
694 }
695 
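/*
 * Common body for igt_reset_{idle,active}_engine(): for each engine, with the
 * heartbeat disabled, repeatedly reset the engine (manually, or by letting the
 * GuC catch a hanging batch when it owns engine resets) and check that only
 * the per-engine reset count moves, never the full-GPU reset count.
 */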
696 static int __igt_reset_engine(struct intel_gt *gt, bool active)
697 {
698 	struct i915_gpu_error *global = &gt->i915->gpu_error;
699 	struct intel_engine_cs *engine;
700 	enum intel_engine_id id;
701 	struct hang h;
702 	int err = 0;
703 
704 	/* Check that we can issue an engine reset on an idle engine (no-op) */
705 
706 	if (!intel_has_reset_engine(gt))
707 		return 0;
708 
709 	if (active) {
710 		err = hang_init(&h, gt);
711 		if (err)
712 			return err;
713 	}
714 
715 	for_each_engine(engine, gt, id) {
716 		unsigned int reset_count, reset_engine_count;
717 		unsigned long count;
718 		bool using_guc = intel_engine_uses_guc(engine);
719 		IGT_TIMEOUT(end_time);
720 
721 		if (using_guc && !active)
722 			continue;
723 
724 		if (active && !intel_engine_can_store_dword(engine))
725 			continue;
726 
727 		if (!wait_for_idle(engine)) {
728 			pr_err("%s failed to idle before reset\n",
729 			       engine->name);
730 			err = -EIO;
731 			break;
732 		}
733 
734 		reset_count = i915_reset_count(global);
735 		reset_engine_count = i915_reset_engine_count(global, engine);
736 
737 		st_engine_heartbeat_disable(engine);
738 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
739 		count = 0;
740 		do {
741 			struct i915_request *rq = NULL;
742 			struct intel_selftest_saved_policy saved;
743 			int err2;
744 
745 			err = intel_selftest_modify_policy(engine, &saved,
746 							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
747 			if (err) {
748 				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
749 				break;
750 			}
751 
752 			if (active) {
753 				rq = hang_create_request(&h, engine);
754 				if (IS_ERR(rq)) {
755 					err = PTR_ERR(rq);
756 					pr_err("[%s] Create hang request failed: %d!\n",
757 					       engine->name, err);
758 					goto restore;
759 				}
760 
761 				i915_request_get(rq);
762 				i915_request_add(rq);
763 
764 				if (!wait_until_running(&h, rq)) {
765 					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
766 
767 					pr_err("%s: Failed to start request %llx, at %x\n",
768 					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
769 					intel_engine_dump(engine, &p,
770 							  "%s\n", engine->name);
771 
772 					i915_request_put(rq);
773 					err = -EIO;
774 					goto restore;
775 				}
776 			}
777 
778 			if (!using_guc) {
779 				err = intel_engine_reset(engine, NULL);
780 				if (err) {
781 					pr_err("intel_engine_reset(%s) failed, err:%d\n",
782 					       engine->name, err);
783 					goto skip;
784 				}
785 			}
786 
787 			if (rq) {
788 				/* Ensure the reset happens and kills the engine */
789 				err = intel_selftest_wait_for_rq(rq);
790 				if (err)
791 					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
792 					       engine->name, rq->fence.context,
793 					       rq->fence.seqno, rq->context->guc_id.id, err);
794 			}
795 
796 skip:
797 			if (rq)
798 				i915_request_put(rq);
799 
800 			if (i915_reset_count(global) != reset_count) {
801 				pr_err("Full GPU reset recorded! (engine reset expected)\n");
802 				err = -EINVAL;
803 				goto restore;
804 			}
805 
806 			/* GuC based resets are not logged per engine */
807 			if (!using_guc) {
808 				if (i915_reset_engine_count(global, engine) !=
809 				    ++reset_engine_count) {
810 					pr_err("%s engine reset not recorded!\n",
811 					       engine->name);
812 					err = -EINVAL;
813 					goto restore;
814 				}
815 			}
816 
817 			count++;
818 
819 restore:
820 			err2 = intel_selftest_restore_policy(engine, &saved);
821 			if (err2)
822 				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
823 			if (err == 0)
824 				err = err2;
825 			if (err)
826 				break;
827 		} while (time_before(jiffies, end_time));
828 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
829 		st_engine_heartbeat_enable(engine);
830 		pr_info("%s: Completed %lu %s resets\n",
831 			engine->name, count, active ? "active" : "idle");
832 
833 		if (err)
834 			break;
835 
836 		err = igt_flush_test(gt->i915);
837 		if (err) {
838 			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
839 			break;
840 		}
841 	}
842 
843 	if (intel_gt_is_wedged(gt)) {
844 		pr_err("GT is wedged!\n");
845 		err = -EIO;
846 	}
847 
848 	if (active)
849 		hang_fini(&h);
850 
851 	return err;
852 }
853 
854 static int igt_reset_idle_engine(void *arg)
855 {
856 	return __igt_reset_engine(arg, false);
857 }
858 
859 static int igt_reset_active_engine(void *arg)
860 {
861 	return __igt_reset_engine(arg, true);
862 }
863 
864 struct active_engine {
865 	struct task_struct *task;
866 	struct intel_engine_cs *engine;
867 	unsigned long resets;
868 	unsigned int flags;
869 };
870 
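/*
 * Flags controlling __igt_reset_engines(): TEST_ACTIVE submits a hanging
 * batch on the engine being reset, TEST_OTHERS runs active_engine() load on
 * the other engines, TEST_SELF additionally loads the engine under reset,
 * and TEST_PRIORITY randomises request priorities on that background load.
 */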
871 #define TEST_ACTIVE	BIT(0)
872 #define TEST_OTHERS	BIT(1)
873 #define TEST_SELF	BIT(2)
874 #define TEST_PRIORITY	BIT(3)
875 
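/*
 * Wait (up to 10s) for a background request to complete, wedging the GT and
 * returning -EIO if it does not, then drop the reference. NULL is a no-op.
 */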
876 static int active_request_put(struct i915_request *rq)
877 {
878 	int err = 0;
879 
880 	if (!rq)
881 		return 0;
882 
883 	if (i915_request_wait(rq, 0, 10 * HZ) < 0) {
884 		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
885 			  rq->engine->name,
886 			  rq->fence.context,
887 			  rq->fence.seqno);
888 		GEM_TRACE_DUMP();
889 
890 		intel_gt_set_wedged(rq->engine->gt);
891 		err = -EIO;
892 	}
893 
894 	i915_request_put(rq);
895 
896 	return err;
897 }
898 
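/*
 * kthread body used to keep an engine busy while another engine is being
 * reset: cycle through a small pool of contexts, keeping a ring of in-flight
 * requests and optionally assigning each one a random priority.
 */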
899 static int active_engine(void *data)
900 {
901 	I915_RND_STATE(prng);
902 	struct active_engine *arg = data;
903 	struct intel_engine_cs *engine = arg->engine;
904 	struct i915_request *rq[8] = {};
905 	struct intel_context *ce[ARRAY_SIZE(rq)];
906 	unsigned long count;
907 	int err = 0;
908 
909 	for (count = 0; count < ARRAY_SIZE(ce); count++) {
910 		ce[count] = intel_context_create(engine);
911 		if (IS_ERR(ce[count])) {
912 			err = PTR_ERR(ce[count]);
913 			pr_err("[%s] Create context #%ld failed: %d!\n", engine->name, count, err);
914 			while (count--)
915 				intel_context_put(ce[count]);
916 			return err;
917 		}
918 	}
919 
920 	count = 0;
921 	while (!kthread_should_stop()) {
922 		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
923 		struct i915_request *old = rq[idx];
924 		struct i915_request *new;
925 
926 		new = intel_context_create_request(ce[idx]);
927 		if (IS_ERR(new)) {
928 			err = PTR_ERR(new);
929 			pr_err("[%s] Create request #%d failed: %d!\n", engine->name, idx, err);
930 			break;
931 		}
932 
933 		rq[idx] = i915_request_get(new);
934 		i915_request_add(new);
935 
936 		if (engine->sched_engine->schedule && arg->flags & TEST_PRIORITY) {
937 			struct i915_sched_attr attr = {
938 				.priority =
939 					i915_prandom_u32_max_state(512, &prng),
940 			};
941 			engine->sched_engine->schedule(rq[idx], &attr);
942 		}
943 
944 		err = active_request_put(old);
945 		if (err) {
946 			pr_err("[%s] Request put failed: %d!\n", engine->name, err);
947 			break;
948 		}
949 
950 		cond_resched();
951 	}
952 
953 	for (count = 0; count < ARRAY_SIZE(rq); count++) {
954 		int err__ = active_request_put(rq[count]);
955 
956 		if (err__)
957 			pr_err("[%s] Request put #%ld failed: %d!\n", engine->name, count, err__);
958 
959 		/* Keep the first error */
960 		if (!err)
961 			err = err__;
962 
963 		intel_context_put(ce[count]);
964 	}
965 
966 	return err;
967 }
968 
969 static int __igt_reset_engines(struct intel_gt *gt,
970 			       const char *test_name,
971 			       unsigned int flags)
972 {
973 	struct i915_gpu_error *global = &gt->i915->gpu_error;
974 	struct intel_engine_cs *engine, *other;
975 	enum intel_engine_id id, tmp;
976 	struct hang h;
977 	int err = 0;
978 
979 	/* Check that issuing a reset on one engine does not interfere
980 	 * with any other engine.
981 	 */
982 
983 	if (!intel_has_reset_engine(gt))
984 		return 0;
985 
986 	if (flags & TEST_ACTIVE) {
987 		err = hang_init(&h, gt);
988 		if (err)
989 			return err;
990 
991 		if (flags & TEST_PRIORITY)
992 			h.ctx->sched.priority = 1024;
993 	}
994 
995 	for_each_engine(engine, gt, id) {
996 		struct active_engine threads[I915_NUM_ENGINES] = {};
997 		unsigned long device = i915_reset_count(global);
998 		unsigned long count = 0, reported;
999 		bool using_guc = intel_engine_uses_guc(engine);
1000 		IGT_TIMEOUT(end_time);
1001 
1002 		if (flags & TEST_ACTIVE) {
1003 			if (!intel_engine_can_store_dword(engine))
1004 				continue;
1005 		} else if (using_guc)
1006 			continue;
1007 
1008 		if (!wait_for_idle(engine)) {
1009 			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
1010 			       engine->name, test_name);
1011 			err = -EIO;
1012 			break;
1013 		}
1014 
1015 		memset(threads, 0, sizeof(threads));
1016 		for_each_engine(other, gt, tmp) {
1017 			struct task_struct *tsk;
1018 
1019 			threads[tmp].resets =
1020 				i915_reset_engine_count(global, other);
1021 
1022 			if (other == engine && !(flags & TEST_SELF))
1023 				continue;
1024 
1025 			if (other != engine && !(flags & TEST_OTHERS))
1026 				continue;
1027 
1028 			threads[tmp].engine = other;
1029 			threads[tmp].flags = flags;
1030 
1031 			tsk = kthread_run(active_engine, &threads[tmp],
1032 					  "igt/%s", other->name);
1033 			if (IS_ERR(tsk)) {
1034 				err = PTR_ERR(tsk);
1035 				pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1036 				goto unwind;
1037 			}
1038 
1039 			threads[tmp].task = tsk;
1040 			get_task_struct(tsk);
1041 		}
1042 
1043 		yield(); /* start all threads before we begin */
1044 
1045 		st_engine_heartbeat_disable_no_pm(engine);
1046 		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
1047 		do {
1048 			struct i915_request *rq = NULL;
1049 			struct intel_selftest_saved_policy saved;
1050 			int err2;
1051 
1052 			err = intel_selftest_modify_policy(engine, &saved,
1053 							   SELFTEST_SCHEDULER_MODIFY_FAST_RESET);
1054 			if (err) {
1055 				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1056 				break;
1057 			}
1058 
1059 			if (flags & TEST_ACTIVE) {
1060 				rq = hang_create_request(&h, engine);
1061 				if (IS_ERR(rq)) {
1062 					err = PTR_ERR(rq);
1063 					pr_err("[%s] Create hang request failed: %d!\n",
1064 					       engine->name, err);
1065 					goto restore;
1066 				}
1067 
1068 				i915_request_get(rq);
1069 				i915_request_add(rq);
1070 
1071 				if (!wait_until_running(&h, rq)) {
1072 					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1073 
1074 					pr_err("%s: Failed to start request %llx, at %x\n",
1075 					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1076 					intel_engine_dump(engine, &p,
1077 							  "%s\n", engine->name);
1078 
1079 					i915_request_put(rq);
1080 					err = -EIO;
1081 					goto restore;
1082 				}
1083 			} else {
1084 				intel_engine_pm_get(engine);
1085 			}
1086 
1087 			if (!using_guc) {
1088 				err = intel_engine_reset(engine, NULL);
1089 				if (err) {
1090 					pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
1091 					       engine->name, test_name, err);
1092 					goto restore;
1093 				}
1094 			}
1095 
1096 			if (rq) {
1097 				/* Ensure the reset happens and kills the engine */
1098 				err = intel_selftest_wait_for_rq(rq);
1099 				if (err)
1100 					pr_err("[%s] Wait for request %lld:%lld [0x%04X] failed: %d!\n",
1101 					       engine->name, rq->fence.context,
1102 					       rq->fence.seqno, rq->context->guc_id.id, err);
1103 			}
1104 
1105 			count++;
1106 
1107 			if (rq) {
1108 				if (rq->fence.error != -EIO) {
1109 					pr_err("i915_reset_engine(%s:%s): failed to reset request %lld:%lld [0x%04X]\n",
1110 					       engine->name, test_name,
1111 					       rq->fence.context,
1112 					       rq->fence.seqno, rq->context->guc_id.id);
1113 					i915_request_put(rq);
1114 
1115 					GEM_TRACE_DUMP();
1116 					intel_gt_set_wedged(gt);
1117 					err = -EIO;
1118 					goto restore;
1119 				}
1120 
1121 				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
1122 					struct drm_printer p =
1123 						drm_info_printer(gt->i915->drm.dev);
1124 
1125 					pr_err("i915_reset_engine(%s:%s):"
1126 					       " failed to complete request %llx:%lld after reset\n",
1127 					       engine->name, test_name,
1128 					       rq->fence.context,
1129 					       rq->fence.seqno);
1130 					intel_engine_dump(engine, &p,
1131 							  "%s\n", engine->name);
1132 					i915_request_put(rq);
1133 
1134 					GEM_TRACE_DUMP();
1135 					intel_gt_set_wedged(gt);
1136 					err = -EIO;
1137 					goto restore;
1138 				}
1139 
1140 				i915_request_put(rq);
1141 			}
1142 
1143 			if (!(flags & TEST_ACTIVE))
1144 				intel_engine_pm_put(engine);
1145 
1146 			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
1147 				struct drm_printer p =
1148 					drm_info_printer(gt->i915->drm.dev);
1149 
1150 				pr_err("i915_reset_engine(%s:%s):"
1151 				       " failed to idle after reset\n",
1152 				       engine->name, test_name);
1153 				intel_engine_dump(engine, &p,
1154 						  "%s\n", engine->name);
1155 
1156 				err = -EIO;
1157 				goto restore;
1158 			}
1159 
1160 restore:
1161 			err2 = intel_selftest_restore_policy(engine, &saved);
1162 			if (err2)
1163 				pr_err("[%s] Restore policy failed: %d!\n", engine->name, err2);
1164 			if (err == 0)
1165 				err = err2;
1166 			if (err)
1167 				break;
1168 		} while (time_before(jiffies, end_time));
1169 		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
1170 		st_engine_heartbeat_enable_no_pm(engine);
1171 
1172 		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
1173 			engine->name, test_name, count);
1174 
1175 		/* GuC based resets are not logged per engine */
1176 		if (!using_guc) {
1177 			reported = i915_reset_engine_count(global, engine);
1178 			reported -= threads[engine->id].resets;
1179 			if (reported != count) {
1180 				pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
1181 				       engine->name, test_name, count, reported);
1182 				if (!err)
1183 					err = -EINVAL;
1184 			}
1185 		}
1186 
1187 unwind:
1188 		for_each_engine(other, gt, tmp) {
1189 			int ret;
1190 
1191 			if (!threads[tmp].task)
1192 				continue;
1193 
1194 			ret = kthread_stop(threads[tmp].task);
1195 			if (ret) {
1196 				pr_err("kthread for other engine %s failed, err=%d\n",
1197 				       other->name, ret);
1198 				if (!err)
1199 					err = ret;
1200 			}
1201 			put_task_struct(threads[tmp].task);
1202 
1203 			/* GuC based resets are not logged per engine */
1204 			if (!using_guc) {
1205 				if (other->uabi_class != engine->uabi_class &&
1206 				    threads[tmp].resets !=
1207 				    i915_reset_engine_count(global, other)) {
1208 					pr_err("Innocent engine %s was reset (count=%ld)\n",
1209 					       other->name,
1210 					       i915_reset_engine_count(global, other) -
1211 					       threads[tmp].resets);
1212 					if (!err)
1213 						err = -EINVAL;
1214 				}
1215 			}
1216 		}
1217 
1218 		if (device != i915_reset_count(global)) {
1219 			pr_err("Global reset (count=%ld)!\n",
1220 			       i915_reset_count(global) - device);
1221 			if (!err)
1222 				err = -EINVAL;
1223 		}
1224 
1225 		if (err)
1226 			break;
1227 
1228 		err = igt_flush_test(gt->i915);
1229 		if (err) {
1230 			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1231 			break;
1232 		}
1233 	}
1234 
1235 	if (intel_gt_is_wedged(gt))
1236 		err = -EIO;
1237 
1238 	if (flags & TEST_ACTIVE)
1239 		hang_fini(&h);
1240 
1241 	return err;
1242 }
1243 
1244 static int igt_reset_engines(void *arg)
1245 {
1246 	static const struct {
1247 		const char *name;
1248 		unsigned int flags;
1249 	} phases[] = {
1250 		{ "idle", 0 },
1251 		{ "active", TEST_ACTIVE },
1252 		{ "others-idle", TEST_OTHERS },
1253 		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
1254 		{
1255 			"others-priority",
1256 			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
1257 		},
1258 		{
1259 			"self-priority",
1260 			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
1261 		},
1262 		{ }
1263 	};
1264 	struct intel_gt *gt = arg;
1265 	typeof(*phases) *p;
1266 	int err;
1267 
1268 	for (p = phases; p->name; p++) {
1269 		if (p->flags & TEST_PRIORITY) {
1270 			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
1271 				continue;
1272 		}
1273 
1274 		err = __igt_reset_engines(arg, p->name, p->flags);
1275 		if (err)
1276 			return err;
1277 	}
1278 
1279 	return 0;
1280 }
1281 
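/*
 * Stand-in for hangcheck: sample the global reset count and then force an
 * immediate reset of the given engines. Callers compare the returned count
 * against i915_reset_count() afterwards to see whether a reset was recorded.
 */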
1282 static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
1283 {
1284 	u32 count = i915_reset_count(&gt->i915->gpu_error);
1285 
1286 	intel_gt_reset(gt, mask, NULL);
1287 
1288 	return count;
1289 }
1290 
1291 static int igt_reset_wait(void *arg)
1292 {
1293 	struct intel_gt *gt = arg;
1294 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1295 	struct intel_engine_cs *engine = gt->engine[RCS0];
1296 	struct i915_request *rq;
1297 	unsigned int reset_count;
1298 	struct hang h;
1299 	long timeout;
1300 	int err;
1301 
1302 	if (!engine || !intel_engine_can_store_dword(engine))
1303 		return 0;
1304 
1305 	/* Check that we detect a stuck waiter and issue a reset */
1306 
1307 	igt_global_reset_lock(gt);
1308 
1309 	err = hang_init(&h, gt);
1310 	if (err) {
1311 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1312 		goto unlock;
1313 	}
1314 
1315 	rq = hang_create_request(&h, engine);
1316 	if (IS_ERR(rq)) {
1317 		err = PTR_ERR(rq);
1318 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1319 		goto fini;
1320 	}
1321 
1322 	i915_request_get(rq);
1323 	i915_request_add(rq);
1324 
1325 	if (!wait_until_running(&h, rq)) {
1326 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1327 
1328 		pr_err("%s: Failed to start request %llx, at %x\n",
1329 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1330 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1331 
1332 		intel_gt_set_wedged(gt);
1333 
1334 		err = -EIO;
1335 		goto out_rq;
1336 	}
1337 
1338 	reset_count = fake_hangcheck(gt, ALL_ENGINES);
1339 
1340 	timeout = i915_request_wait(rq, 0, 10);
1341 	if (timeout < 0) {
1342 		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
1343 		       timeout);
1344 		err = timeout;
1345 		goto out_rq;
1346 	}
1347 
1348 	if (i915_reset_count(global) == reset_count) {
1349 		pr_err("No GPU reset recorded!\n");
1350 		err = -EINVAL;
1351 		goto out_rq;
1352 	}
1353 
1354 out_rq:
1355 	i915_request_put(rq);
1356 fini:
1357 	hang_fini(&h);
1358 unlock:
1359 	igt_global_reset_unlock(gt);
1360 
1361 	if (intel_gt_is_wedged(gt))
1362 		return -EIO;
1363 
1364 	return err;
1365 }
1366 
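/*
 * Handshake data for the eviction threads spawned by __igt_reset_evict_vma():
 * each thread signals the completion once it has started and then attempts an
 * operation (evicting the vma's node, or rewriting its fence register) that
 * must wait behind the still-hanging request keeping the vma active.
 */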
1367 struct evict_vma {
1368 	struct completion completion;
1369 	struct i915_vma *vma;
1370 };
1371 
1372 static int evict_vma(void *data)
1373 {
1374 	struct evict_vma *arg = data;
1375 	struct i915_address_space *vm = arg->vma->vm;
1376 	struct drm_mm_node evict = arg->vma->node;
1377 	int err;
1378 
1379 	complete(&arg->completion);
1380 
1381 	mutex_lock(&vm->mutex);
1382 	err = i915_gem_evict_for_node(vm, &evict, 0);
1383 	mutex_unlock(&vm->mutex);
1384 
1385 	return err;
1386 }
1387 
1388 static int evict_fence(void *data)
1389 {
1390 	struct evict_vma *arg = data;
1391 	int err;
1392 
1393 	complete(&arg->completion);
1394 
1395 	/* Mark the fence register as dirty to force the mmio update. */
1396 	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
1397 	if (err) {
1398 		pr_err("Invalid Y-tiling settings; err:%d\n", err);
1399 		return err;
1400 	}
1401 
1402 	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
1403 	if (err) {
1404 		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
1405 		return err;
1406 	}
1407 
1408 	err = i915_vma_pin_fence(arg->vma);
1409 	i915_vma_unpin(arg->vma);
1410 	if (err) {
1411 		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
1412 		return err;
1413 	}
1414 
1415 	i915_vma_unpin_fence(arg->vma);
1416 
1417 	return 0;
1418 }
1419 
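/*
 * Submit a hanging request that keeps a test vma active, spawn fn() (one of
 * evict_vma() or evict_fence()) in a kthread, confirm the kthread really is
 * blocked waiting on the request (its wait callback appears on the fence's
 * cb_list), then force a reset and expect the kthread to finish promptly.
 */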
1420 static int __igt_reset_evict_vma(struct intel_gt *gt,
1421 				 struct i915_address_space *vm,
1422 				 int (*fn)(void *),
1423 				 unsigned int flags)
1424 {
1425 	struct intel_engine_cs *engine = gt->engine[RCS0];
1426 	struct drm_i915_gem_object *obj;
1427 	struct task_struct *tsk = NULL;
1428 	struct i915_request *rq;
1429 	struct evict_vma arg;
1430 	struct hang h;
1431 	unsigned int pin_flags;
1432 	int err;
1433 
1434 	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
1435 		return 0;
1436 
1437 	if (!engine || !intel_engine_can_store_dword(engine))
1438 		return 0;
1439 
1440 	/* Check that we can recover an unbind stuck on a hanging request */
1441 
1442 	err = hang_init(&h, gt);
1443 	if (err) {
1444 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1445 		return err;
1446 	}
1447 
1448 	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
1449 	if (IS_ERR(obj)) {
1450 		err = PTR_ERR(obj);
1451 		pr_err("[%s] Create object failed: %d!\n", engine->name, err);
1452 		goto fini;
1453 	}
1454 
1455 	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1456 		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
1457 		if (err) {
1458 			pr_err("Invalid X-tiling settings; err:%d\n", err);
1459 			goto out_obj;
1460 		}
1461 	}
1462 
1463 	arg.vma = i915_vma_instance(obj, vm, NULL);
1464 	if (IS_ERR(arg.vma)) {
1465 		err = PTR_ERR(arg.vma);
1466 		pr_err("[%s] VMA instance failed: %d!\n", engine->name, err);
1467 		goto out_obj;
1468 	}
1469 
1470 	rq = hang_create_request(&h, engine);
1471 	if (IS_ERR(rq)) {
1472 		err = PTR_ERR(rq);
1473 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1474 		goto out_obj;
1475 	}
1476 
1477 	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;
1478 
1479 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
1480 		pin_flags |= PIN_MAPPABLE;
1481 
1482 	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
1483 	if (err) {
1484 		i915_request_add(rq);
1485 		pr_err("[%s] VMA pin failed: %d!\n", engine->name, err);
1486 		goto out_obj;
1487 	}
1488 
1489 	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
1490 		err = i915_vma_pin_fence(arg.vma);
1491 		if (err) {
1492 			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
1493 			i915_vma_unpin(arg.vma);
1494 			i915_request_add(rq);
1495 			goto out_obj;
1496 		}
1497 	}
1498 
1499 	i915_vma_lock(arg.vma);
1500 	err = i915_request_await_object(rq, arg.vma->obj,
1501 					flags & EXEC_OBJECT_WRITE);
1502 	if (err == 0) {
1503 		err = i915_vma_move_to_active(arg.vma, rq, flags);
1504 		if (err)
1505 			pr_err("[%s] Move to active failed: %d!\n", engine->name, err);
1506 	} else {
1507 		pr_err("[%s] Request await failed: %d!\n", engine->name, err);
1508 	}
1509 
1510 	i915_vma_unlock(arg.vma);
1511 
1512 	if (flags & EXEC_OBJECT_NEEDS_FENCE)
1513 		i915_vma_unpin_fence(arg.vma);
1514 	i915_vma_unpin(arg.vma);
1515 
1516 	i915_request_get(rq);
1517 	i915_request_add(rq);
1518 	if (err)
1519 		goto out_rq;
1520 
1521 	if (!wait_until_running(&h, rq)) {
1522 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1523 
1524 		pr_err("%s: Failed to start request %llx, at %x\n",
1525 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1526 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1527 
1528 		intel_gt_set_wedged(gt);
1529 		goto out_reset;
1530 	}
1531 
1532 	init_completion(&arg.completion);
1533 
1534 	tsk = kthread_run(fn, &arg, "igt/evict_vma");
1535 	if (IS_ERR(tsk)) {
1536 		err = PTR_ERR(tsk);
1537 		pr_err("[%s] Thread spawn failed: %d!\n", engine->name, err);
1538 		tsk = NULL;
1539 		goto out_reset;
1540 	}
1541 	get_task_struct(tsk);
1542 
1543 	wait_for_completion(&arg.completion);
1544 
1545 	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
1546 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1547 
1548 		pr_err("igt/evict_vma kthread did not wait\n");
1549 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1550 
1551 		intel_gt_set_wedged(gt);
1552 		goto out_reset;
1553 	}
1554 
1555 out_reset:
1556 	igt_global_reset_lock(gt);
1557 	fake_hangcheck(gt, rq->engine->mask);
1558 	igt_global_reset_unlock(gt);
1559 
1560 	if (tsk) {
1561 		struct intel_wedge_me w;
1562 
1563 		/* The reset, even indirectly, should take less than 10ms. */
1564 		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
1565 			err = kthread_stop(tsk);
1566 
1567 		put_task_struct(tsk);
1568 	}
1569 
1570 out_rq:
1571 	i915_request_put(rq);
1572 out_obj:
1573 	i915_gem_object_put(obj);
1574 fini:
1575 	hang_fini(&h);
1576 	if (intel_gt_is_wedged(gt))
1577 		return -EIO;
1578 
1579 	return err;
1580 }
1581 
1582 static int igt_reset_evict_ggtt(void *arg)
1583 {
1584 	struct intel_gt *gt = arg;
1585 
1586 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1587 				     evict_vma, EXEC_OBJECT_WRITE);
1588 }
1589 
1590 static int igt_reset_evict_ppgtt(void *arg)
1591 {
1592 	struct intel_gt *gt = arg;
1593 	struct i915_ppgtt *ppgtt;
1594 	int err;
1595 
1596 	/* aliasing == global gtt locking, covered above */
1597 	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
1598 		return 0;
1599 
1600 	ppgtt = i915_ppgtt_create(gt, 0);
1601 	if (IS_ERR(ppgtt))
1602 		return PTR_ERR(ppgtt);
1603 
1604 	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
1605 				    evict_vma, EXEC_OBJECT_WRITE);
1606 	i915_vm_put(&ppgtt->vm);
1607 
1608 	return err;
1609 }
1610 
1611 static int igt_reset_evict_fence(void *arg)
1612 {
1613 	struct intel_gt *gt = arg;
1614 
1615 	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
1616 				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
1617 }
1618 
1619 static int wait_for_others(struct intel_gt *gt,
1620 			   struct intel_engine_cs *exclude)
1621 {
1622 	struct intel_engine_cs *engine;
1623 	enum intel_engine_id id;
1624 
1625 	for_each_engine(engine, gt, id) {
1626 		if (engine == exclude)
1627 			continue;
1628 
1629 		if (!wait_for_idle(engine))
1630 			return -EIO;
1631 	}
1632 
1633 	return 0;
1634 }
1635 
1636 static int igt_reset_queue(void *arg)
1637 {
1638 	struct intel_gt *gt = arg;
1639 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1640 	struct intel_engine_cs *engine;
1641 	enum intel_engine_id id;
1642 	struct hang h;
1643 	int err;
1644 
1645 	/* Check that we replay pending requests following a hang */
1646 
1647 	igt_global_reset_lock(gt);
1648 
1649 	err = hang_init(&h, gt);
1650 	if (err)
1651 		goto unlock;
1652 
1653 	for_each_engine(engine, gt, id) {
1654 		struct intel_selftest_saved_policy saved;
1655 		struct i915_request *prev;
1656 		IGT_TIMEOUT(end_time);
1657 		unsigned int count;
1658 		bool using_guc = intel_engine_uses_guc(engine);
1659 
1660 		if (!intel_engine_can_store_dword(engine))
1661 			continue;
1662 
1663 		if (using_guc) {
1664 			err = intel_selftest_modify_policy(engine, &saved,
1665 							   SELFTEST_SCHEDULER_MODIFY_NO_HANGCHECK);
1666 			if (err) {
1667 				pr_err("[%s] Modify policy failed: %d!\n", engine->name, err);
1668 				goto fini;
1669 			}
1670 		}
1671 
1672 		prev = hang_create_request(&h, engine);
1673 		if (IS_ERR(prev)) {
1674 			err = PTR_ERR(prev);
1675 			pr_err("[%s] Create 'prev' hang request failed: %d!\n", engine->name, err);
1676 			goto restore;
1677 		}
1678 
1679 		i915_request_get(prev);
1680 		i915_request_add(prev);
1681 
1682 		count = 0;
1683 		do {
1684 			struct i915_request *rq;
1685 			unsigned int reset_count;
1686 
1687 			rq = hang_create_request(&h, engine);
1688 			if (IS_ERR(rq)) {
1689 				err = PTR_ERR(rq);
1690 				pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1691 				goto restore;
1692 			}
1693 
1694 			i915_request_get(rq);
1695 			i915_request_add(rq);
1696 
1697 			/*
1698 			 * XXX We don't handle resetting the kernel context
1699 			 * very well. If we trigger a device reset twice in
1700 			 * quick succession while the kernel context is
1701 			 * executing, we may end up skipping the breadcrumb.
1702 			 * This is really only a problem for the selftest as
1703 			 * normally there is a large interlude between resets
1704 			 * (hangcheck), or we focus on resetting just one
1705 			 * engine and so avoid repeatedly resetting innocents.
1706 			 */
1707 			err = wait_for_others(gt, engine);
1708 			if (err) {
1709 				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
1710 				       __func__, engine->name);
1711 				i915_request_put(rq);
1712 				i915_request_put(prev);
1713 
1714 				GEM_TRACE_DUMP();
1715 				intel_gt_set_wedged(gt);
1716 				goto restore;
1717 			}
1718 
1719 			if (!wait_until_running(&h, prev)) {
1720 				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1721 
1722 				pr_err("%s(%s): Failed to start request %llx, at %x\n",
1723 				       __func__, engine->name,
1724 				       prev->fence.seqno, hws_seqno(&h, prev));
1725 				intel_engine_dump(engine, &p,
1726 						  "%s\n", engine->name);
1727 
1728 				i915_request_put(rq);
1729 				i915_request_put(prev);
1730 
1731 				intel_gt_set_wedged(gt);
1732 
1733 				err = -EIO;
1734 				goto restore;
1735 			}
1736 
1737 			reset_count = fake_hangcheck(gt, BIT(id));
1738 
1739 			if (prev->fence.error != -EIO) {
1740 				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
1741 				       prev->fence.error);
1742 				i915_request_put(rq);
1743 				i915_request_put(prev);
1744 				err = -EINVAL;
1745 				goto restore;
1746 			}
1747 
1748 			if (rq->fence.error) {
1749 				pr_err("Fence error status not zero [%d] after unrelated reset\n",
1750 				       rq->fence.error);
1751 				i915_request_put(rq);
1752 				i915_request_put(prev);
1753 				err = -EINVAL;
1754 				goto restore;
1755 			}
1756 
1757 			if (i915_reset_count(global) == reset_count) {
1758 				pr_err("No GPU reset recorded!\n");
1759 				i915_request_put(rq);
1760 				i915_request_put(prev);
1761 				err = -EINVAL;
1762 				goto restore;
1763 			}
1764 
1765 			i915_request_put(prev);
1766 			prev = rq;
1767 			count++;
1768 		} while (time_before(jiffies, end_time));
1769 		pr_info("%s: Completed %d queued resets\n",
1770 			engine->name, count);
1771 
1772 		*h.batch = MI_BATCH_BUFFER_END;
1773 		intel_gt_chipset_flush(engine->gt);
1774 
1775 		i915_request_put(prev);
1776 
1777 restore:
1778 		if (using_guc) {
1779 			int err2 = intel_selftest_restore_policy(engine, &saved);
1780 
1781 			if (err2)
1782 				pr_err("%s:%d> [%s] Restore policy failed: %d!\n",
1783 				       __func__, __LINE__, engine->name, err2);
1784 			if (err == 0)
1785 				err = err2;
1786 		}
1787 		if (err)
1788 			goto fini;
1789 
1790 		err = igt_flush_test(gt->i915);
1791 		if (err) {
1792 			pr_err("[%s] Flush failed: %d!\n", engine->name, err);
1793 			break;
1794 		}
1795 	}
1796 
1797 fini:
1798 	hang_fini(&h);
1799 unlock:
1800 	igt_global_reset_unlock(gt);
1801 
1802 	if (intel_gt_is_wedged(gt))
1803 		return -EIO;
1804 
1805 	return err;
1806 }
1807 
1808 static int igt_handle_error(void *arg)
1809 {
1810 	struct intel_gt *gt = arg;
1811 	struct i915_gpu_error *global = &gt->i915->gpu_error;
1812 	struct intel_engine_cs *engine = gt->engine[RCS0];
1813 	struct hang h;
1814 	struct i915_request *rq;
1815 	struct i915_gpu_coredump *error;
1816 	int err;
1817 
1818 	/* Check that we can issue a global GPU and engine reset */
1819 
1820 	if (!intel_has_reset_engine(gt))
1821 		return 0;
1822 
1823 	if (!engine || !intel_engine_can_store_dword(engine))
1824 		return 0;
1825 
1826 	err = hang_init(&h, gt);
1827 	if (err) {
1828 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1829 		return err;
1830 	}
1831 
1832 	rq = hang_create_request(&h, engine);
1833 	if (IS_ERR(rq)) {
1834 		err = PTR_ERR(rq);
1835 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1836 		goto err_fini;
1837 	}
1838 
1839 	i915_request_get(rq);
1840 	i915_request_add(rq);
1841 
1842 	if (!wait_until_running(&h, rq)) {
1843 		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);
1844 
1845 		pr_err("%s: Failed to start request %llx, at %x\n",
1846 		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
1847 		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);
1848 
1849 		intel_gt_set_wedged(gt);
1850 
1851 		err = -EIO;
1852 		goto err_request;
1853 	}
1854 
1855 	/* Temporarily disable error capture */
1856 	error = xchg(&global->first_error, (void *)-1);
1857 
1858 	intel_gt_handle_error(gt, engine->mask, 0, NULL);
1859 
1860 	xchg(&global->first_error, error);
1861 
1862 	if (rq->fence.error != -EIO) {
1863 		pr_err("Guilty request not identified!\n");
1864 		err = -EINVAL;
1865 		goto err_request;
1866 	}
1867 
1868 err_request:
1869 	i915_request_put(rq);
1870 err_fini:
1871 	hang_fini(&h);
1872 	return err;
1873 }
1874 
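/*
 * Perform an engine reset from within the atomic context described by @p
 * (e.g. irq-off or preempt-off sections), with the submission tasklet
 * disabled so the reset cannot rely on it; the tasklet is re-enabled and
 * kicked once the critical section ends.
 */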
1875 static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
1876 				     const struct igt_atomic_section *p,
1877 				     const char *mode)
1878 {
1879 	struct tasklet_struct * const t = &engine->sched_engine->tasklet;
1880 	int err;
1881 
1882 	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
1883 		  engine->name, mode, p->name);
1884 
1885 	if (t->func)
1886 		tasklet_disable(t);
1887 	if (strcmp(p->name, "softirq"))
1888 		local_bh_disable();
1889 	p->critical_section_begin();
1890 
1891 	err = __intel_engine_reset_bh(engine, NULL);
1892 
1893 	p->critical_section_end();
1894 	if (strcmp(p->name, "softirq"))
1895 		local_bh_enable();
1896 	if (t->func) {
1897 		tasklet_enable(t);
1898 		tasklet_hi_schedule(t);
1899 	}
1900 
1901 	if (err)
1902 		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
1903 		       engine->name, mode, p->name);
1904 
1905 	return err;
1906 }
1907 
1908 static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
1909 				   const struct igt_atomic_section *p)
1910 {
1911 	struct i915_request *rq;
1912 	struct hang h;
1913 	int err;
1914 
1915 	err = __igt_atomic_reset_engine(engine, p, "idle");
1916 	if (err)
1917 		return err;
1918 
1919 	err = hang_init(&h, engine->gt);
1920 	if (err) {
1921 		pr_err("[%s] Hang init failed: %d!\n", engine->name, err);
1922 		return err;
1923 	}
1924 
1925 	rq = hang_create_request(&h, engine);
1926 	if (IS_ERR(rq)) {
1927 		err = PTR_ERR(rq);
1928 		pr_err("[%s] Create hang request failed: %d!\n", engine->name, err);
1929 		goto out;
1930 	}
1931 
1932 	i915_request_get(rq);
1933 	i915_request_add(rq);
1934 
1935 	if (wait_until_running(&h, rq)) {
1936 		err = __igt_atomic_reset_engine(engine, p, "active");
1937 	} else {
1938 		pr_err("%s(%s): Failed to start request %llx, at %x\n",
1939 		       __func__, engine->name,
1940 		       rq->fence.seqno, hws_seqno(&h, rq));
1941 		intel_gt_set_wedged(engine->gt);
1942 		err = -EIO;
1943 	}
1944 
1945 	if (err == 0) {
1946 		struct intel_wedge_me w;
1947 
1948 		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
1949 			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
1950 		if (intel_gt_is_wedged(engine->gt))
1951 			err = -EIO;
1952 	}
1953 
1954 	i915_request_put(rq);
1955 out:
1956 	hang_fini(&h);
1957 	return err;
1958 }
1959 
1960 static int igt_reset_engines_atomic(void *arg)
1961 {
1962 	struct intel_gt *gt = arg;
1963 	const typeof(*igt_atomic_phases) *p;
1964 	int err = 0;
1965 
1966 	/* Check that the engines resets are usable from atomic context */
1967 
1968 	if (!intel_has_reset_engine(gt))
1969 		return 0;
1970 
1971 	if (intel_uc_uses_guc_submission(&gt->uc))
1972 		return 0;
1973 
1974 	igt_global_reset_lock(gt);
1975 
1976 	/* Flush any requests before we get started and check basics */
1977 	if (!igt_force_reset(gt))
1978 		goto unlock;
1979 
1980 	for (p = igt_atomic_phases; p->name; p++) {
1981 		struct intel_engine_cs *engine;
1982 		enum intel_engine_id id;
1983 
1984 		for_each_engine(engine, gt, id) {
1985 			err = igt_atomic_reset_engine(engine, p);
1986 			if (err)
1987 				goto out;
1988 		}
1989 	}
1990 
1991 out:
1992 	/* As we poke around the guts, do a full reset before continuing. */
1993 	igt_force_reset(gt);
1994 unlock:
1995 	igt_global_reset_unlock(gt);
1996 
1997 	return err;
1998 }
1999 
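/*
 * Live selftest entry point: run the whole hang/reset suite above, holding a
 * runtime PM wakeref for the duration, provided the device supports reset and
 * is not already terminally wedged.
 */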
2000 int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
2001 {
2002 	static const struct i915_subtest tests[] = {
2003 		SUBTEST(igt_hang_sanitycheck),
2004 		SUBTEST(igt_reset_nop),
2005 		SUBTEST(igt_reset_nop_engine),
2006 		SUBTEST(igt_reset_idle_engine),
2007 		SUBTEST(igt_reset_active_engine),
2008 		SUBTEST(igt_reset_fail_engine),
2009 		SUBTEST(igt_reset_engines),
2010 		SUBTEST(igt_reset_engines_atomic),
2011 		SUBTEST(igt_reset_queue),
2012 		SUBTEST(igt_reset_wait),
2013 		SUBTEST(igt_reset_evict_ggtt),
2014 		SUBTEST(igt_reset_evict_ppgtt),
2015 		SUBTEST(igt_reset_evict_fence),
2016 		SUBTEST(igt_handle_error),
2017 	};
2018 	struct intel_gt *gt = &i915->gt;
2019 	intel_wakeref_t wakeref;
2020 	int err;
2021 
2022 	if (!intel_has_gpu_reset(gt))
2023 		return 0;
2024 
2025 	if (intel_gt_is_wedged(gt))
2026 		return -EIO; /* we're long past hope of a successful reset */
2027 
2028 	wakeref = intel_runtime_pm_get(gt->uncore->rpm);
2029 
2030 	err = intel_gt_live_subtests(tests, gt);
2031 
2032 	intel_runtime_pm_put(gt->uncore->rpm, wakeref);
2033 
2034 	return err;
2035 }
2036