/*
 * Copyright © 2016 Intel Corporation
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice (including the next
 * paragraph) shall be included in all copies or substantial portions of the
 * Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
 * IN THE SOFTWARE.
 *
 */

#include <linux/kthread.h>

#include "gem/i915_gem_context.h"

#include "intel_gt.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "selftest_engine_heartbeat.h"

#include "i915_selftest.h"
#include "selftests/i915_random.h"
#include "selftests/igt_flush_test.h"
#include "selftests/igt_reset.h"
#include "selftests/igt_atomic.h"

#include "selftests/mock_drm.h"

#include "gem/selftests/mock_context.h"
#include "gem/selftests/igt_gem_utils.h"

#define IGT_IDLE_TIMEOUT 50 /* ms; time to wait after flushing between tests */

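/*
 * The hang fixture: @obj/@batch is a looping batch buffer that each
 * request spins in forever (until the loop is overwritten with
 * MI_BATCH_BUFFER_END), and @hws/@seqno is a status page into which the
 * batch first writes its fence seqno, so the CPU can tell when the
 * "hang" is actually executing on the GPU.
 */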
struct hang {
	struct intel_gt *gt;
	struct drm_i915_gem_object *hws;
	struct drm_i915_gem_object *obj;
	struct i915_gem_context *ctx;
	u32 *seqno;
	u32 *batch;
};

static int hang_init(struct hang *h, struct intel_gt *gt)
{
	void *vaddr;
	int err;

	memset(h, 0, sizeof(*h));
	h->gt = gt;

	h->ctx = kernel_context(gt->i915);
	if (IS_ERR(h->ctx))
		return PTR_ERR(h->ctx);

	GEM_BUG_ON(i915_gem_context_is_bannable(h->ctx));

	h->hws = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->hws)) {
		err = PTR_ERR(h->hws);
		goto err_ctx;
	}

	h->obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(h->obj)) {
		err = PTR_ERR(h->obj);
		goto err_hws;
	}

	i915_gem_object_set_cache_coherency(h->hws, I915_CACHE_LLC);
	vaddr = i915_gem_object_pin_map(h->hws, I915_MAP_WB);
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_obj;
	}
	h->seqno = memset(vaddr, 0xff, PAGE_SIZE);

	vaddr = i915_gem_object_pin_map(h->obj,
					i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		err = PTR_ERR(vaddr);
		goto err_unpin_hws;
	}
	h->batch = vaddr;

	return 0;

err_unpin_hws:
	i915_gem_object_unpin_map(h->hws);
err_obj:
	i915_gem_object_put(h->obj);
err_hws:
	i915_gem_object_put(h->hws);
err_ctx:
	kernel_context_close(h->ctx);
	return err;
}

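/*
 * Each fence context is assigned a u32 slot in the HWS page by wrapping
 * the context id into the page; hws_seqno() below reads the same slot
 * back on the CPU.
 */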
static u64 hws_address(const struct i915_vma *hws,
		       const struct i915_request *rq)
{
	return hws->node.start + offset_in_page(sizeof(u32)*rq->fence.context);
}

static int move_to_active(struct i915_vma *vma,
			  struct i915_request *rq,
			  unsigned int flags)
{
	int err;

	i915_vma_lock(vma);
	err = i915_request_await_object(rq, vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(vma, rq, flags);
	i915_vma_unlock(vma);

	return err;
}

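/*
 * Build a request whose batch writes the fence seqno into the HWS and
 * then spins forever by jumping back to the start of the batch with
 * MI_BATCH_BUFFER_START; the trailing MI_BATCH_BUFFER_END is never
 * reached. The loop is only broken by overwriting the first dword with
 * MI_BATCH_BUFFER_END, or by resetting the engine. A fresh backing
 * object is allocated for each request so that terminating a previous
 * spinner does not affect the next.
 */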
static struct i915_request *
hang_create_request(struct hang *h, struct intel_engine_cs *engine)
{
	struct intel_gt *gt = h->gt;
	struct i915_address_space *vm = i915_gem_context_get_vm_rcu(h->ctx);
	struct drm_i915_gem_object *obj;
	struct i915_request *rq = NULL;
	struct i915_vma *hws, *vma;
	unsigned int flags;
	void *vaddr;
	u32 *batch;
	int err;

	obj = i915_gem_object_create_internal(gt->i915, PAGE_SIZE);
	if (IS_ERR(obj)) {
		i915_vm_put(vm);
		return ERR_CAST(obj);
	}

	vaddr = i915_gem_object_pin_map(obj, i915_coherent_map_type(gt->i915));
	if (IS_ERR(vaddr)) {
		i915_gem_object_put(obj);
		i915_vm_put(vm);
		return ERR_CAST(vaddr);
	}

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	h->obj = obj;
	h->batch = vaddr;

	vma = i915_vma_instance(h->obj, vm, NULL);
	if (IS_ERR(vma)) {
		i915_vm_put(vm);
		return ERR_CAST(vma);
	}

	hws = i915_vma_instance(h->hws, vm, NULL);
	if (IS_ERR(hws)) {
		i915_vm_put(vm);
		return ERR_CAST(hws);
	}

	err = i915_vma_pin(vma, 0, 0, PIN_USER);
	if (err) {
		i915_vm_put(vm);
		return ERR_PTR(err);
	}

	err = i915_vma_pin(hws, 0, 0, PIN_USER);
	if (err)
		goto unpin_vma;

	rq = igt_request_alloc(h->ctx, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto unpin_hws;
	}

	err = move_to_active(vma, rq, 0);
	if (err)
		goto cancel_rq;

	err = move_to_active(hws, rq, 0);
	if (err)
		goto cancel_rq;

	batch = h->batch;
	if (INTEL_GEN(gt->i915) >= 8) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = upper_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
		*batch++ = lower_32_bits(vma->node.start);
		*batch++ = upper_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 6) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 1 << 8;
		*batch++ = lower_32_bits(vma->node.start);
	} else if (INTEL_GEN(gt->i915) >= 4) {
		*batch++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*batch++ = 0;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	} else {
		*batch++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*batch++ = lower_32_bits(hws_address(hws, rq));
		*batch++ = rq->fence.seqno;
		*batch++ = MI_NOOP;

		memset(batch, 0, 1024);
		batch += 1024 / sizeof(*batch);

		*batch++ = MI_NOOP;
		*batch++ = MI_BATCH_BUFFER_START | 2 << 6;
		*batch++ = lower_32_bits(vma->node.start);
	}
	*batch++ = MI_BATCH_BUFFER_END; /* not reached */
	intel_gt_chipset_flush(engine->gt);

	if (rq->engine->emit_init_breadcrumb) {
		err = rq->engine->emit_init_breadcrumb(rq);
		if (err)
			goto cancel_rq;
	}

	flags = 0;
	if (INTEL_GEN(gt->i915) <= 5)
		flags |= I915_DISPATCH_SECURE;

	err = rq->engine->emit_bb_start(rq, vma->node.start, PAGE_SIZE, flags);

cancel_rq:
	if (err) {
		i915_request_set_error_once(rq, err);
		i915_request_add(rq);
	}
unpin_hws:
	i915_vma_unpin(hws);
unpin_vma:
	i915_vma_unpin(vma);
	i915_vm_put(vm);
	return err ? ERR_PTR(err) : rq;
}

static u32 hws_seqno(const struct hang *h, const struct i915_request *rq)
{
	return READ_ONCE(h->seqno[rq->fence.context % (PAGE_SIZE/sizeof(u32))]);
}

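/*
 * Terminate any still-spinning batch by writing MI_BATCH_BUFFER_END
 * over its first dword before releasing the objects and context.
 */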
static void hang_fini(struct hang *h)
{
	*h->batch = MI_BATCH_BUFFER_END;
	intel_gt_chipset_flush(h->gt);

	i915_gem_object_unpin_map(h->obj);
	i915_gem_object_put(h->obj);

	i915_gem_object_unpin_map(h->hws);
	i915_gem_object_put(h->hws);

	kernel_context_close(h->ctx);

	igt_flush_test(h->gt->i915);
}

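/*
 * Poll quickly (a 10us busy-wait), then patiently (up to 1s), for the
 * spinner to report its seqno in the HWS, i.e. for the request to be
 * actually executing on the GPU.
 */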
static bool wait_until_running(struct hang *h, struct i915_request *rq)
{
	return !(wait_for_us(i915_seqno_passed(hws_seqno(h, rq),
					       rq->fence.seqno),
			     10) &&
		 wait_for(i915_seqno_passed(hws_seqno(h, rq),
					    rq->fence.seqno),
			  1000));
}

static int igt_hang_sanitycheck(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_request *rq;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Basic check that we can execute our hanging batch */

	err = hang_init(&h, gt);
	if (err)
		return err;

	for_each_engine(engine, gt, id) {
		struct intel_wedge_me w;
		long timeout;

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = hang_create_request(&h, engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			pr_err("Failed to create request for %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}

		i915_request_get(rq);

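		/*
		 * The sanitycheck does not want an actual hang: overwrite
		 * the start of the spinning batch with MI_BATCH_BUFFER_END
		 * before submission, so the request should retire promptly.
		 */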
		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_add(rq);

		timeout = 0;
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			timeout = i915_request_wait(rq, 0,
						    MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(gt))
			timeout = -EIO;

		i915_request_put(rq);

		if (timeout < 0) {
			err = timeout;
			pr_err("Wait for request failed on %s, err=%d\n",
			       engine->name, err);
			goto fini;
		}
	}

fini:
	hang_fini(&h);
	return err;
}

static bool wait_for_idle(struct intel_engine_cs *engine)
{
	return wait_for(intel_engine_is_idle(engine), IGT_IDLE_TIMEOUT) == 0;
}

static int igt_reset_nop(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	unsigned int reset_count, count;
	enum intel_engine_id id;
	IGT_TIMEOUT(end_time);
	int err = 0;

	/* Check that we can reset during non-user portions of requests */

	reset_count = i915_reset_count(global);
	count = 0;
	do {
		for_each_engine(engine, gt, id) {
			struct intel_context *ce;
			int i;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}

			intel_context_put(ce);
		}

		igt_global_reset_lock(gt);
		intel_gt_reset(gt, ALL_ENGINES, NULL);
		igt_global_reset_unlock(gt);

		if (intel_gt_is_wedged(gt)) {
			err = -EIO;
			break;
		}

		if (i915_reset_count(global) != reset_count + ++count) {
			pr_err("Full GPU reset not recorded!\n");
			err = -EINVAL;
			break;
		}

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	} while (time_before(jiffies, end_time));
	pr_info("%s: %d resets\n", __func__, count);

	if (igt_flush_test(gt->i915))
		err = -EIO;
	return err;
}

static int igt_reset_nop_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can engine-reset during non-user portions */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count, count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);
		count = 0;

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < 16; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);

					err = PTR_ERR(rq);
					break;
				}

				i915_request_add(rq);
			}
			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    reset_engine_count + ++count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);

		intel_context_put(ce);
		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

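/*
 * Fault injection for engine reset: crank the injected timeout
 * probability high enough to (effectively) always fire, with no limit
 * (-1) on the number of occurrences, so that intel_engine_reset()
 * fails with -ETIMEDOUT (timeouts are only generated on gen8+).
 */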
static void force_reset_timeout(struct intel_engine_cs *engine)
{
	engine->reset_timeout.probability = 999;
	atomic_set(&engine->reset_timeout.times, -1);
}

static void cancel_reset_timeout(struct intel_engine_cs *engine)
{
	memset(&engine->reset_timeout, 0, sizeof(engine->reset_timeout));
}

static int igt_reset_fail_engine(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	/* Check that we can recover from engine-reset failures */

	if (!intel_has_reset_engine(gt))
		return 0;

	for_each_engine(engine, gt, id) {
		unsigned int count;
		struct intel_context *ce;
		IGT_TIMEOUT(end_time);
		int err;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);

		force_reset_timeout(engine);
		err = intel_engine_reset(engine, NULL);
		cancel_reset_timeout(engine);
		if (err == 0) /* timeouts only generated on gen8+ */
			goto skip;

		count = 0;
		do {
			struct i915_request *last = NULL;
			int i;

			if (!wait_for_idle(engine)) {
				pr_err("%s failed to idle before reset\n",
				       engine->name);
				err = -EIO;
				break;
			}

			for (i = 0; i < count % 15; i++) {
				struct i915_request *rq;

				rq = intel_context_create_request(ce);
				if (IS_ERR(rq)) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);
					intel_engine_dump(engine, &p,
							  "%s(%s): failed to submit request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to submit request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					intel_gt_set_wedged(gt);
					if (last)
						i915_request_put(last);

					err = PTR_ERR(rq);
					goto out;
				}

				if (last)
					i915_request_put(last);
				last = i915_request_get(rq);
				i915_request_add(rq);
			}

			if (count & 1) {
				err = intel_engine_reset(engine, NULL);
				if (err) {
					GEM_TRACE_ERR("intel_engine_reset(%s) failed, err:%d\n",
						      engine->name, err);
					GEM_TRACE_DUMP();
					i915_request_put(last);
					break;
				}
			} else {
				force_reset_timeout(engine);
				err = intel_engine_reset(engine, NULL);
				cancel_reset_timeout(engine);
				if (err != -ETIMEDOUT) {
					pr_err("intel_engine_reset(%s) did not fail, err:%d\n",
					       engine->name, err);
					i915_request_put(last);
					break;
				}
			}

			err = 0;
			if (last) {
				if (i915_request_wait(last, 0, HZ / 2) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					intel_engine_dump(engine, &p,
							  "%s(%s): failed to complete request\n",
							  __func__,
							  engine->name);

					GEM_TRACE("%s(%s): failed to complete request\n",
						  __func__,
						  engine->name);
					GEM_TRACE_DUMP();

					err = -EIO;
				}
				i915_request_put(last);
			}
			count++;
		} while (err == 0 && time_before(jiffies, end_time));
out:
		pr_info("%s(%s): %d resets\n", __func__, engine->name, count);
skip:
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		intel_context_put(ce);

		if (igt_flush_test(gt->i915))
			err = -EIO;
		if (err)
			return err;
	}

	return 0;
}

static int __igt_reset_engine(struct intel_gt *gt, bool active)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err = 0;

	/* Check that we can issue an engine reset on an idle engine (no-op) */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (active) {
		err = hang_init(&h, gt);
		if (err)
			return err;
	}

	for_each_engine(engine, gt, id) {
		unsigned int reset_count, reset_engine_count;
		unsigned long count;
		IGT_TIMEOUT(end_time);

		if (active && !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("%s failed to idle before reset\n",
			       engine->name);
			err = -EIO;
			break;
		}

		reset_count = i915_reset_count(global);
		reset_engine_count = i915_reset_engine_count(global, engine);

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		count = 0;
		do {
			if (active) {
				struct i915_request *rq;

				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("intel_engine_reset(%s) failed, err:%d\n",
				       engine->name, err);
				break;
			}

			if (i915_reset_count(global) != reset_count) {
				pr_err("Full GPU reset recorded! (engine reset expected)\n");
				err = -EINVAL;
				break;
			}

			if (i915_reset_engine_count(global, engine) !=
			    ++reset_engine_count) {
				pr_err("%s engine reset not recorded!\n",
				       engine->name);
				err = -EINVAL;
				break;
			}

			count++;
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);
		pr_info("%s: Completed %lu %s resets\n",
			engine->name, count, active ? "active" : "idle");

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (active)
		hang_fini(&h);

	return err;
}

static int igt_reset_idle_engine(void *arg)
{
	return __igt_reset_engine(arg, false);
}

static int igt_reset_active_engine(void *arg)
{
	return __igt_reset_engine(arg, true);
}

struct active_engine {
	struct task_struct *task;
	struct intel_engine_cs *engine;
	unsigned long resets;
	unsigned int flags;
};

#define TEST_ACTIVE	BIT(0)
#define TEST_OTHERS	BIT(1)
#define TEST_SELF	BIT(2)
#define TEST_PRIORITY	BIT(3)

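/*
 * Reap a background request: wait up to 5s for it to complete, and if
 * it does not, declare the device wedged since the reset failed to
 * preserve or cancel it.
 */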
static int active_request_put(struct i915_request *rq)
{
	int err = 0;

	if (!rq)
		return 0;

	if (i915_request_wait(rq, 0, 5 * HZ) < 0) {
		GEM_TRACE("%s timed out waiting for completion of fence %llx:%lld\n",
			  rq->engine->name,
			  rq->fence.context,
			  rq->fence.seqno);
		GEM_TRACE_DUMP();

		intel_gt_set_wedged(rq->engine->gt);
		err = -EIO;
	}

	i915_request_put(rq);

	return err;
}

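/*
 * kthread body providing background load: keep a ring of 8 requests in
 * flight on its engine, each on its own context, optionally
 * reprioritising them at random, until asked to stop.
 */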
static int active_engine(void *data)
{
	I915_RND_STATE(prng);
	struct active_engine *arg = data;
	struct intel_engine_cs *engine = arg->engine;
	struct i915_request *rq[8] = {};
	struct intel_context *ce[ARRAY_SIZE(rq)];
	unsigned long count;
	int err = 0;

	for (count = 0; count < ARRAY_SIZE(ce); count++) {
		ce[count] = intel_context_create(engine);
		if (IS_ERR(ce[count])) {
			err = PTR_ERR(ce[count]);
			while (count--)
				intel_context_put(ce[count]);
			return err;
		}
	}

	count = 0;
	while (!kthread_should_stop()) {
		unsigned int idx = count++ & (ARRAY_SIZE(rq) - 1);
		struct i915_request *old = rq[idx];
		struct i915_request *new;

		new = intel_context_create_request(ce[idx]);
		if (IS_ERR(new)) {
			err = PTR_ERR(new);
			break;
		}

		rq[idx] = i915_request_get(new);
		i915_request_add(new);

		if (engine->schedule && arg->flags & TEST_PRIORITY) {
			struct i915_sched_attr attr = {
				.priority =
					i915_prandom_u32_max_state(512, &prng),
			};
			engine->schedule(rq[idx], &attr);
		}

		err = active_request_put(old);
		if (err)
			break;

		cond_resched();
	}

	for (count = 0; count < ARRAY_SIZE(rq); count++) {
		int err__ = active_request_put(rq[count]);

		/* Keep the first error */
		if (!err)
			err = err__;

		intel_context_put(ce[count]);
	}

	return err;
}

static int __igt_reset_engines(struct intel_gt *gt,
			       const char *test_name,
			       unsigned int flags)
{
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine, *other;
	enum intel_engine_id id, tmp;
	struct hang h;
	int err = 0;

	/* Check that issuing a reset on one engine does not interfere
	 * with any other engine.
	 */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (flags & TEST_ACTIVE) {
		err = hang_init(&h, gt);
		if (err)
			return err;

		if (flags & TEST_PRIORITY)
			h.ctx->sched.priority = 1024;
	}

	for_each_engine(engine, gt, id) {
		struct active_engine threads[I915_NUM_ENGINES] = {};
		unsigned long device = i915_reset_count(global);
		unsigned long count = 0, reported;
		IGT_TIMEOUT(end_time);

		if (flags & TEST_ACTIVE &&
		    !intel_engine_can_store_dword(engine))
			continue;

		if (!wait_for_idle(engine)) {
			pr_err("i915_reset_engine(%s:%s): failed to idle before reset\n",
			       engine->name, test_name);
			err = -EIO;
			break;
		}

		memset(threads, 0, sizeof(threads));
		for_each_engine(other, gt, tmp) {
			struct task_struct *tsk;

			threads[tmp].resets =
				i915_reset_engine_count(global, other);

			if (other == engine && !(flags & TEST_SELF))
				continue;

			if (other != engine && !(flags & TEST_OTHERS))
				continue;

			threads[tmp].engine = other;
			threads[tmp].flags = flags;

			tsk = kthread_run(active_engine, &threads[tmp],
					  "igt/%s", other->name);
			if (IS_ERR(tsk)) {
				err = PTR_ERR(tsk);
				goto unwind;
			}

			threads[tmp].task = tsk;
			get_task_struct(tsk);
		}

		yield(); /* start all threads before we begin */

		st_engine_heartbeat_disable(engine);
		set_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		do {
			struct i915_request *rq = NULL;

			if (flags & TEST_ACTIVE) {
				rq = hang_create_request(&h, engine);
				if (IS_ERR(rq)) {
					err = PTR_ERR(rq);
					break;
				}

				i915_request_get(rq);
				i915_request_add(rq);

				if (!wait_until_running(&h, rq)) {
					struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

					pr_err("%s: Failed to start request %llx, at %x\n",
					       __func__, rq->fence.seqno, hws_seqno(&h, rq));
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);

					i915_request_put(rq);
					err = -EIO;
					break;
				}
			}

			err = intel_engine_reset(engine, NULL);
			if (err) {
				pr_err("i915_reset_engine(%s:%s): failed, err=%d\n",
				       engine->name, test_name, err);
				break;
			}

			count++;

			if (rq) {
				if (rq->fence.error != -EIO) {
					pr_err("i915_reset_engine(%s:%s):"
					       " failed to reset request %llx:%lld\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				if (i915_request_wait(rq, 0, HZ / 5) < 0) {
					struct drm_printer p =
						drm_info_printer(gt->i915->drm.dev);

					pr_err("i915_reset_engine(%s:%s):"
					       " failed to complete request %llx:%lld after reset\n",
					       engine->name, test_name,
					       rq->fence.context,
					       rq->fence.seqno);
					intel_engine_dump(engine, &p,
							  "%s\n", engine->name);
					i915_request_put(rq);

					GEM_TRACE_DUMP();
					intel_gt_set_wedged(gt);
					err = -EIO;
					break;
				}

				i915_request_put(rq);
			}

			if (!(flags & TEST_SELF) && !wait_for_idle(engine)) {
				struct drm_printer p =
					drm_info_printer(gt->i915->drm.dev);

				pr_err("i915_reset_engine(%s:%s):"
				       " failed to idle after reset\n",
				       engine->name, test_name);
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				err = -EIO;
				break;
			}
		} while (time_before(jiffies, end_time));
		clear_bit(I915_RESET_ENGINE + id, &gt->reset.flags);
		st_engine_heartbeat_enable(engine);

		pr_info("i915_reset_engine(%s:%s): %lu resets\n",
			engine->name, test_name, count);

		reported = i915_reset_engine_count(global, engine);
		reported -= threads[engine->id].resets;
		if (reported != count) {
			pr_err("i915_reset_engine(%s:%s): reset %lu times, but reported %lu\n",
			       engine->name, test_name, count, reported);
			if (!err)
				err = -EINVAL;
		}

unwind:
		for_each_engine(other, gt, tmp) {
			int ret;

			if (!threads[tmp].task)
				continue;

			ret = kthread_stop(threads[tmp].task);
			if (ret) {
				pr_err("kthread for other engine %s failed, err=%d\n",
				       other->name, ret);
				if (!err)
					err = ret;
			}
			put_task_struct(threads[tmp].task);

			if (other->uabi_class != engine->uabi_class &&
			    threads[tmp].resets !=
			    i915_reset_engine_count(global, other)) {
				pr_err("Innocent engine %s was reset (count=%ld)\n",
				       other->name,
				       i915_reset_engine_count(global, other) -
				       threads[tmp].resets);
				if (!err)
					err = -EINVAL;
			}
		}

		if (device != i915_reset_count(global)) {
			pr_err("Global reset (count=%ld)!\n",
			       i915_reset_count(global) - device);
			if (!err)
				err = -EINVAL;
		}

		if (err)
			break;

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

	if (intel_gt_is_wedged(gt))
		err = -EIO;

	if (flags & TEST_ACTIVE)
		hang_fini(&h);

	return err;
}

static int igt_reset_engines(void *arg)
{
	static const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "idle", 0 },
		{ "active", TEST_ACTIVE },
		{ "others-idle", TEST_OTHERS },
		{ "others-active", TEST_OTHERS | TEST_ACTIVE },
		{
			"others-priority",
			TEST_OTHERS | TEST_ACTIVE | TEST_PRIORITY
		},
		{
			"self-priority",
			TEST_ACTIVE | TEST_PRIORITY | TEST_SELF,
		},
		{ }
	};
	struct intel_gt *gt = arg;
	typeof(*phases) *p;
	int err;

	for (p = phases; p->name; p++) {
		if (p->flags & TEST_PRIORITY) {
			if (!(gt->i915->caps.scheduler & I915_SCHEDULER_CAP_PRIORITY))
				continue;
		}

		err = __igt_reset_engines(arg, p->name, p->flags);
		if (err)
			return err;
	}

	return 0;
}

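/*
 * Pretend hangcheck fired: sample the global reset count and reset the
 * given engines directly, returning the prior count so callers can
 * verify that a new reset was recorded.
 */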
static u32 fake_hangcheck(struct intel_gt *gt, intel_engine_mask_t mask)
{
	u32 count = i915_reset_count(&gt->i915->gpu_error);

	intel_gt_reset(gt, mask, NULL);

	return count;
}

static int igt_reset_wait(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct i915_request *rq;
	unsigned int reset_count;
	struct hang h;
	long timeout;
	int err;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we detect a stuck waiter and issue a reset */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto out_rq;
	}

	reset_count = fake_hangcheck(gt, ALL_ENGINES);

	timeout = i915_request_wait(rq, 0, 10);
	if (timeout < 0) {
		pr_err("i915_request_wait failed on a stuck request: err=%ld\n",
		       timeout);
		err = timeout;
		goto out_rq;
	}

	if (i915_reset_count(global) == reset_count) {
		pr_err("No GPU reset recorded!\n");
		err = -EINVAL;
		goto out_rq;
	}

out_rq:
	i915_request_put(rq);
fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

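/*
 * Helpers run from a kthread: signal readiness via the completion, then
 * block on an operation (evicting the node, or updating a fence
 * register) that must wait upon the hanging request. The test then
 * checks that a GPU reset unblocks the waiter.
 */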
struct evict_vma {
	struct completion completion;
	struct i915_vma *vma;
};

static int evict_vma(void *data)
{
	struct evict_vma *arg = data;
	struct i915_address_space *vm = arg->vma->vm;
	struct drm_mm_node evict = arg->vma->node;
	int err;

	complete(&arg->completion);

	mutex_lock(&vm->mutex);
	err = i915_gem_evict_for_node(vm, &evict, 0);
	mutex_unlock(&vm->mutex);

	return err;
}

static int evict_fence(void *data)
{
	struct evict_vma *arg = data;
	int err;

	complete(&arg->completion);

	/* Mark the fence register as dirty to force the mmio update. */
	err = i915_gem_object_set_tiling(arg->vma->obj, I915_TILING_Y, 512);
	if (err) {
		pr_err("Invalid Y-tiling settings; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin(arg->vma, 0, 0, PIN_GLOBAL | PIN_MAPPABLE);
	if (err) {
		pr_err("Unable to pin vma for Y-tiled fence; err:%d\n", err);
		return err;
	}

	err = i915_vma_pin_fence(arg->vma);
	i915_vma_unpin(arg->vma);
	if (err) {
		pr_err("Unable to pin Y-tiled fence; err:%d\n", err);
		return err;
	}

	i915_vma_unpin_fence(arg->vma);

	return 0;
}

static int __igt_reset_evict_vma(struct intel_gt *gt,
				 struct i915_address_space *vm,
				 int (*fn)(void *),
				 unsigned int flags)
{
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct drm_i915_gem_object *obj;
	struct task_struct *tsk = NULL;
	struct i915_request *rq;
	struct evict_vma arg;
	struct hang h;
	unsigned int pin_flags;
	int err;

	if (!gt->ggtt->num_fences && flags & EXEC_OBJECT_NEEDS_FENCE)
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	/* Check that we can recover an unbind stuck on a hanging request */

	err = hang_init(&h, gt);
	if (err)
		return err;

	obj = i915_gem_object_create_internal(gt->i915, SZ_1M);
	if (IS_ERR(obj)) {
		err = PTR_ERR(obj);
		goto fini;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_gem_object_set_tiling(obj, I915_TILING_X, 512);
		if (err) {
			pr_err("Invalid X-tiling settings; err:%d\n", err);
			goto out_obj;
		}
	}

	arg.vma = i915_vma_instance(obj, vm, NULL);
	if (IS_ERR(arg.vma)) {
		err = PTR_ERR(arg.vma);
		goto out_obj;
	}

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out_obj;
	}

	pin_flags = i915_vma_is_ggtt(arg.vma) ? PIN_GLOBAL : PIN_USER;

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		pin_flags |= PIN_MAPPABLE;

	err = i915_vma_pin(arg.vma, 0, 0, pin_flags);
	if (err) {
		i915_request_add(rq);
		goto out_obj;
	}

	if (flags & EXEC_OBJECT_NEEDS_FENCE) {
		err = i915_vma_pin_fence(arg.vma);
		if (err) {
			pr_err("Unable to pin X-tiled fence; err:%d\n", err);
			i915_vma_unpin(arg.vma);
			i915_request_add(rq);
			goto out_obj;
		}
	}

	i915_vma_lock(arg.vma);
	err = i915_request_await_object(rq, arg.vma->obj,
					flags & EXEC_OBJECT_WRITE);
	if (err == 0)
		err = i915_vma_move_to_active(arg.vma, rq, flags);
	i915_vma_unlock(arg.vma);

	if (flags & EXEC_OBJECT_NEEDS_FENCE)
		i915_vma_unpin_fence(arg.vma);
	i915_vma_unpin(arg.vma);

	i915_request_get(rq);
	i915_request_add(rq);
	if (err)
		goto out_rq;

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

	init_completion(&arg.completion);

	tsk = kthread_run(fn, &arg, "igt/evict_vma");
	if (IS_ERR(tsk)) {
		err = PTR_ERR(tsk);
		tsk = NULL;
		goto out_reset;
	}
	get_task_struct(tsk);

	wait_for_completion(&arg.completion);

	if (wait_for(!list_empty(&rq->fence.cb_list), 10)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("igt/evict_vma kthread did not wait\n");
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);
		goto out_reset;
	}

out_reset:
	igt_global_reset_lock(gt);
	fake_hangcheck(gt, rq->engine->mask);
	igt_global_reset_unlock(gt);

	if (tsk) {
		struct intel_wedge_me w;

		/* The reset, even indirectly, should take less than 10ms. */
		intel_wedge_on_timeout(&w, gt, HZ / 10 /* 100ms */)
			err = kthread_stop(tsk);

		put_task_struct(tsk);
	}

out_rq:
	i915_request_put(rq);
out_obj:
	i915_gem_object_put(obj);
fini:
	hang_fini(&h);
	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_reset_evict_ggtt(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_vma, EXEC_OBJECT_WRITE);
}

static int igt_reset_evict_ppgtt(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_ppgtt *ppgtt;
	int err;

	/* aliasing == global gtt locking, covered above */
	if (INTEL_PPGTT(gt->i915) < INTEL_PPGTT_FULL)
		return 0;

	ppgtt = i915_ppgtt_create(gt);
	if (IS_ERR(ppgtt))
		return PTR_ERR(ppgtt);

	err = __igt_reset_evict_vma(gt, &ppgtt->vm,
				    evict_vma, EXEC_OBJECT_WRITE);
	i915_vm_put(&ppgtt->vm);

	return err;
}

static int igt_reset_evict_fence(void *arg)
{
	struct intel_gt *gt = arg;

	return __igt_reset_evict_vma(gt, &gt->ggtt->vm,
				     evict_fence, EXEC_OBJECT_NEEDS_FENCE);
}

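/*
 * Idle every engine except @exclude, so that a full device reset does
 * not catch some other engine mid-request (see the XXX comment in
 * igt_reset_queue()).
 */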
static int wait_for_others(struct intel_gt *gt,
			   struct intel_engine_cs *exclude)
{
	struct intel_engine_cs *engine;
	enum intel_engine_id id;

	for_each_engine(engine, gt, id) {
		if (engine == exclude)
			continue;

		if (!wait_for_idle(engine))
			return -EIO;
	}

	return 0;
}

static int igt_reset_queue(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	struct hang h;
	int err;

	/* Check that we replay pending requests following a hang */

	igt_global_reset_lock(gt);

	err = hang_init(&h, gt);
	if (err)
		goto unlock;

	for_each_engine(engine, gt, id) {
		struct i915_request *prev;
		IGT_TIMEOUT(end_time);
		unsigned int count;

		if (!intel_engine_can_store_dword(engine))
			continue;

		prev = hang_create_request(&h, engine);
		if (IS_ERR(prev)) {
			err = PTR_ERR(prev);
			goto fini;
		}

		i915_request_get(prev);
		i915_request_add(prev);

		count = 0;
		do {
			struct i915_request *rq;
			unsigned int reset_count;

			rq = hang_create_request(&h, engine);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto fini;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			/*
			 * XXX We don't handle resetting the kernel context
			 * very well. If we trigger a device reset twice in
			 * quick succession while the kernel context is
			 * executing, we may end up skipping the breadcrumb.
			 * This is really only a problem for the selftest as
			 * normally there is a large interlude between resets
			 * (hangcheck), or we focus on resetting just one
			 * engine and so avoid repeatedly resetting innocents.
			 */
			err = wait_for_others(gt, engine);
			if (err) {
				pr_err("%s(%s): Failed to idle other inactive engines after device reset\n",
				       __func__, engine->name);
				i915_request_put(rq);
				i915_request_put(prev);

				GEM_TRACE_DUMP();
				intel_gt_set_wedged(gt);
				goto fini;
			}

			if (!wait_until_running(&h, prev)) {
				struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

				pr_err("%s(%s): Failed to start request %llx, at %x\n",
				       __func__, engine->name,
				       prev->fence.seqno, hws_seqno(&h, prev));
				intel_engine_dump(engine, &p,
						  "%s\n", engine->name);

				i915_request_put(rq);
				i915_request_put(prev);

				intel_gt_set_wedged(gt);

				err = -EIO;
				goto fini;
			}

			reset_count = fake_hangcheck(gt, BIT(id));

			if (prev->fence.error != -EIO) {
				pr_err("GPU reset not recorded on hanging request [fence.error=%d]!\n",
				       prev->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (rq->fence.error) {
				pr_err("Fence error status not zero [%d] after unrelated reset\n",
				       rq->fence.error);
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			if (i915_reset_count(global) == reset_count) {
				pr_err("No GPU reset recorded!\n");
				i915_request_put(rq);
				i915_request_put(prev);
				err = -EINVAL;
				goto fini;
			}

			i915_request_put(prev);
			prev = rq;
			count++;
		} while (time_before(jiffies, end_time));
		pr_info("%s: Completed %d queued resets\n",
			engine->name, count);

		*h.batch = MI_BATCH_BUFFER_END;
		intel_gt_chipset_flush(engine->gt);

		i915_request_put(prev);

		err = igt_flush_test(gt->i915);
		if (err)
			break;
	}

fini:
	hang_fini(&h);
unlock:
	igt_global_reset_unlock(gt);

	if (intel_gt_is_wedged(gt))
		return -EIO;

	return err;
}

static int igt_handle_error(void *arg)
{
	struct intel_gt *gt = arg;
	struct i915_gpu_error *global = &gt->i915->gpu_error;
	struct intel_engine_cs *engine = gt->engine[RCS0];
	struct hang h;
	struct i915_request *rq;
	struct i915_gpu_coredump *error;
	int err;

	/* Check that we can issue a global GPU and engine reset */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (!engine || !intel_engine_can_store_dword(engine))
		return 0;

	err = hang_init(&h, gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto err_fini;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (!wait_until_running(&h, rq)) {
		struct drm_printer p = drm_info_printer(gt->i915->drm.dev);

		pr_err("%s: Failed to start request %llx, at %x\n",
		       __func__, rq->fence.seqno, hws_seqno(&h, rq));
		intel_engine_dump(rq->engine, &p, "%s\n", rq->engine->name);

		intel_gt_set_wedged(gt);

		err = -EIO;
		goto err_request;
	}

	/* Temporarily disable error capture */
	error = xchg(&global->first_error, (void *)-1);

	intel_gt_handle_error(gt, engine->mask, 0, NULL);

	xchg(&global->first_error, error);

	if (rq->fence.error != -EIO) {
		pr_err("Guilty request not identified!\n");
		err = -EINVAL;
		goto err_request;
	}

err_request:
	i915_request_put(rq);
err_fini:
	hang_fini(&h);
	return err;
}

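/*
 * Exercise the engine reset from the atomic context given by @p
 * (hardirq, softirq, preemption disabled, ...): the submission tasklet
 * is disabled around the reset, and bottom halves are only left
 * enabled for the softirq phase itself.
 */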
static int __igt_atomic_reset_engine(struct intel_engine_cs *engine,
				     const struct igt_atomic_section *p,
				     const char *mode)
{
	struct tasklet_struct * const t = &engine->execlists.tasklet;
	int err;

	GEM_TRACE("i915_reset_engine(%s:%s) under %s\n",
		  engine->name, mode, p->name);

	if (t->func)
		tasklet_disable(t);
	if (strcmp(p->name, "softirq"))
		local_bh_disable();
	p->critical_section_begin();

	err = __intel_engine_reset_bh(engine, NULL);

	p->critical_section_end();
	if (strcmp(p->name, "softirq"))
		local_bh_enable();
	if (t->func) {
		tasklet_enable(t);
		tasklet_hi_schedule(t);
	}

	if (err)
		pr_err("i915_reset_engine(%s:%s) failed under %s\n",
		       engine->name, mode, p->name);

	return err;
}

static int igt_atomic_reset_engine(struct intel_engine_cs *engine,
				   const struct igt_atomic_section *p)
{
	struct i915_request *rq;
	struct hang h;
	int err;

	err = __igt_atomic_reset_engine(engine, p, "idle");
	if (err)
		return err;

	err = hang_init(&h, engine->gt);
	if (err)
		return err;

	rq = hang_create_request(&h, engine);
	if (IS_ERR(rq)) {
		err = PTR_ERR(rq);
		goto out;
	}

	i915_request_get(rq);
	i915_request_add(rq);

	if (wait_until_running(&h, rq)) {
		err = __igt_atomic_reset_engine(engine, p, "active");
	} else {
		pr_err("%s(%s): Failed to start request %llx, at %x\n",
		       __func__, engine->name,
		       rq->fence.seqno, hws_seqno(&h, rq));
		intel_gt_set_wedged(engine->gt);
		err = -EIO;
	}

	if (err == 0) {
		struct intel_wedge_me w;

		intel_wedge_on_timeout(&w, engine->gt, HZ / 20 /* 50ms */)
			i915_request_wait(rq, 0, MAX_SCHEDULE_TIMEOUT);
		if (intel_gt_is_wedged(engine->gt))
			err = -EIO;
	}

	i915_request_put(rq);
out:
	hang_fini(&h);
	return err;
}

static int igt_reset_engines_atomic(void *arg)
{
	struct intel_gt *gt = arg;
	const typeof(*igt_atomic_phases) *p;
	int err = 0;

	/* Check that engine resets are usable from atomic context */

	if (!intel_has_reset_engine(gt))
		return 0;

	if (intel_uc_uses_guc_submission(&gt->uc))
		return 0;

	igt_global_reset_lock(gt);

	/* Flush any requests before we get started and check basics */
	if (!igt_force_reset(gt))
		goto unlock;

	for (p = igt_atomic_phases; p->name; p++) {
		struct intel_engine_cs *engine;
		enum intel_engine_id id;

		for_each_engine(engine, gt, id) {
			err = igt_atomic_reset_engine(engine, p);
			if (err)
				goto out;
		}
	}

out:
	/* As we poke around the guts, do a full reset before continuing. */
	igt_force_reset(gt);
unlock:
	igt_global_reset_unlock(gt);

	return err;
}

int intel_hangcheck_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(igt_hang_sanitycheck),
		SUBTEST(igt_reset_nop),
		SUBTEST(igt_reset_nop_engine),
		SUBTEST(igt_reset_idle_engine),
		SUBTEST(igt_reset_active_engine),
		SUBTEST(igt_reset_fail_engine),
		SUBTEST(igt_reset_engines),
		SUBTEST(igt_reset_engines_atomic),
		SUBTEST(igt_reset_queue),
		SUBTEST(igt_reset_wait),
		SUBTEST(igt_reset_evict_ggtt),
		SUBTEST(igt_reset_evict_ppgtt),
		SUBTEST(igt_reset_evict_fence),
		SUBTEST(igt_handle_error),
	};
	struct intel_gt *gt = &i915->gt;
	intel_wakeref_t wakeref;
	int err;

	if (!intel_has_gpu_reset(gt))
		return 0;

	if (intel_gt_is_wedged(gt))
		return -EIO; /* we're long past hope of a successful reset */

	wakeref = intel_runtime_pm_get(gt->uncore->rpm);

	err = intel_gt_live_subtests(tests, gt);

	intel_runtime_pm_put(gt->uncore->rpm, wakeref);

	return err;
}