/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

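/*
 * The hang fixture: a kernel context plus two internal objects. h->obj
 * holds a self-referencing batch that spins forever once submitted, and
 * h->hws is a scratch page (a stand-in hardware status page) into which
 * each spinning batch writes its seqno, so we can tell when the batch
 * has actually started executing on the GPU.
 */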
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

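/*
 * Each request reports into its own dword of the HWS page, indexed by
 * its fence context, so concurrent spinners do not clobber each other.
 */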
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

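/*
 * Build a request whose batch spins forever: it stores rq->fence.seqno
 * into the HWS page and then branches back to its own start. A fresh
 * backing object replaces h->obj each time, so an old batch that is
 * still spinning cannot be mistaken for the new one.
 */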
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

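	/*
	 * Emit the spinner, per gen: store the seqno into the HWS page,
	 * pad with 1024 bytes of zeroed dwords (MI_NOOP), then branch
	 * back to the start of the batch so it runs forever. The batch
	 * is terminated by overwriting its first dword with
	 * MI_BATCH_BUFFER_END (see hang_fini()).
	 */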
	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

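/*
 * Has the spinner started? Busy-wait up to 10us for the seqno to land
 * in the HWS page, then fall back to sleeping waits for up to 1s.
 */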
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

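/*
 * Fault injection for igt_reset_fail_engine(): with a probability of
 * 999 and an unlimited count (times == -1), every engine reset is
 * forced to time out. Per the check below, such injected timeouts are
 * only generated on gen8+.
 */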
static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

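/*
 * Retire a request submitted by a background thread, wedging the GT if
 * it fails to complete within 5s: the engine under reset should not be
 * able to starve its neighbours of execution.
 */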
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

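/*
 * Keep an engine busy from a kthread: cycle through a ring of 8
 * contexts, always keeping the last 8 requests in flight, and
 * optionally (TEST_PRIORITY) assign each request a random priority to
 * exercise the scheduler while resets occur on another engine.
 */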
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			while (count--)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s):"
					       " failed to reset request %llx:%lld\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

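/*
 * Pretend hangcheck fired: perform the reset directly and return the
 * global reset count sampled beforehand, so callers can verify that
 * the reset was recorded.
 */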
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

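/*
 * Runs in a kthread: try to evict the target node while it is still
 * busy with the hanging request, so that the wait-for-idle underneath
 * the vm mutex has to be rescued by the reset.
 */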
static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

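/*
 * Wait for every engine except @exclude to idle; used by
 * igt_reset_queue() so that we do not trigger a device reset while
 * other engines are still settling from the previous one (see the XXX
 * comment below).
 */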
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d resets\n", engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

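/*
 * Perform an engine reset from inside an atomic section (irq, softirq
 * or preemption disabled, as provided by igt_atomic_phases): the reset
 * path must not sleep. The execlists tasklet is disabled around the
 * reset and kicked afterwards to process anything queued meanwhile.
 */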
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	tasklet_enable(t);
	tasklet_hi_schedule(t);

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}