1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2017-2018 Intel Corporation
4  */
5 
6 #include <linux/prime_numbers.h>
7 
8 #include "intel_context.h"
9 #include "intel_engine_heartbeat.h"
10 #include "intel_engine_pm.h"
11 #include "intel_gpu_commands.h"
12 #include "intel_gt.h"
13 #include "intel_gt_requests.h"
14 #include "intel_ring.h"
15 #include "selftest_engine_heartbeat.h"
16 
17 #include "../selftests/i915_random.h"
18 #include "../i915_selftest.h"
19 
20 #include "selftests/igt_flush_test.h"
21 #include "selftests/lib_sw_fence.h"
22 #include "selftests/mock_gem_device.h"
23 #include "selftests/mock_timeline.h"
24 
25 static struct page *hwsp_page(struct intel_timeline *tl)
26 {
27 	struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj;
28 
29 	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
30 	return sg_page(obj->mm.pages->sgl);
31 }
32 
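/*
 * Convert a timeline's HWSP slot into a global "cacheline" index: take the
 * CPU address of the backing page plus the slot offset and express it in
 * units of TIMELINE_SEQNO_BYTES. The index is only used as a radix-tree key
 * below to spot two timelines being handed the same seqno slot.
 */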
33 static unsigned long hwsp_cacheline(struct intel_timeline *tl)
34 {
35 	unsigned long address = (unsigned long)page_address(hwsp_page(tl));
36 
37 	return (address + offset_in_page(tl->hwsp_offset)) / TIMELINE_SEQNO_BYTES;
38 }
39 
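/*
 * Pin the timeline's HWSP for the duration of a test using the usual
 * ww-mutex dance: take the object lock, attempt the pin, and on -EDEADLK
 * back off and retry until the pin succeeds (or fails for another reason).
 */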
40 static int selftest_tl_pin(struct intel_timeline *tl)
41 {
42 	struct i915_gem_ww_ctx ww;
43 	int err;
44 
45 	i915_gem_ww_ctx_init(&ww, false);
46 retry:
47 	err = i915_gem_object_lock(tl->hwsp_ggtt->obj, &ww);
48 	if (!err)
49 		err = intel_timeline_pin(tl, &ww);
50 
51 	if (err == -EDEADLK) {
52 		err = i915_gem_ww_ctx_backoff(&ww);
53 		if (!err)
54 			goto retry;
55 	}
56 	i915_gem_ww_ctx_fini(&ww);
57 	return err;
58 }
59 
60 /* Only half of the seqnos are usable, see __intel_timeline_get_seqno() */
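/*
 * e.g. with 4KiB pages and 8-byte seqno slots (TIMELINE_SEQNO_BYTES is 8 at
 * the time of writing), this evaluates to 4096 / 8 / 2 == 256.
 */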
61 #define CACHELINES_PER_PAGE (PAGE_SIZE / TIMELINE_SEQNO_BYTES / 2)
62 
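/*
 * Bookkeeping for mock_hwsp_freelist(): @history is a small ring of the most
 * recently created (and still pinned) timelines, while @cachelines maps each
 * timeline's HWSP slot index back to its timeline so that duplicate
 * allocations are caught immediately.
 */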
63 struct mock_hwsp_freelist {
64 	struct intel_gt *gt;
65 	struct radix_tree_root cachelines;
66 	struct intel_timeline **history;
67 	unsigned long count, max;
68 	struct rnd_state prng;
69 };
70 
71 enum {
72 	SHUFFLE = BIT(0),
73 };
74 
75 static void __mock_hwsp_record(struct mock_hwsp_freelist *state,
76 			       unsigned int idx,
77 			       struct intel_timeline *tl)
78 {
79 	tl = xchg(&state->history[idx], tl);
80 	if (tl) {
81 		radix_tree_delete(&state->cachelines, hwsp_cacheline(tl));
82 		intel_timeline_unpin(tl);
83 		intel_timeline_put(tl);
84 	}
85 }
86 
87 static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
88 				unsigned int count,
89 				unsigned int flags)
90 {
91 	struct intel_timeline *tl;
92 	unsigned int idx;
93 
94 	while (count--) {
95 		unsigned long cacheline;
96 		int err;
97 
98 		tl = intel_timeline_create(state->gt);
99 		if (IS_ERR(tl))
100 			return PTR_ERR(tl);
101 
102 		err = selftest_tl_pin(tl);
103 		if (err) {
104 			intel_timeline_put(tl);
105 			return err;
106 		}
107 
108 		cacheline = hwsp_cacheline(tl);
109 		err = radix_tree_insert(&state->cachelines, cacheline, tl);
110 		if (err) {
111 			if (err == -EEXIST) {
112 				pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
113 				       cacheline);
114 			}
115 			intel_timeline_unpin(tl);
116 			intel_timeline_put(tl);
117 			return err;
118 		}
119 
120 		idx = state->count++ % state->max;
121 		__mock_hwsp_record(state, idx, tl);
122 	}
123 
124 	if (flags & SHUFFLE)
125 		i915_prandom_shuffle(state->history,
126 				     sizeof(*state->history),
127 				     min(state->count, state->max),
128 				     &state->prng);
129 
130 	count = i915_prandom_u32_max_state(min(state->count, state->max),
131 					   &state->prng);
132 	while (count--) {
133 		idx = --state->count % state->max;
134 		__mock_hwsp_record(state, idx, NULL);
135 	}
136 
137 	return 0;
138 }
139 
140 static int mock_hwsp_freelist(void *arg)
141 {
142 	struct mock_hwsp_freelist state;
143 	struct drm_i915_private *i915;
144 	const struct {
145 		const char *name;
146 		unsigned int flags;
147 	} phases[] = {
148 		{ "linear", 0 },
149 		{ "shuffled", SHUFFLE },
150 		{ },
151 	}, *p;
152 	unsigned int na;
153 	int err = 0;
154 
155 	i915 = mock_gem_device();
156 	if (!i915)
157 		return -ENOMEM;
158 
159 	INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
160 	state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);
161 
162 	state.gt = to_gt(i915);
163 
164 	/*
165 	 * Create a bunch of timelines and check that their HWSPs do not overlap.
166 	 * Free some, and try again.
167 	 */
168 
169 	state.max = PAGE_SIZE / sizeof(*state.history);
170 	state.count = 0;
171 	state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL);
172 	if (!state.history) {
173 		err = -ENOMEM;
174 		goto err_put;
175 	}
176 
177 	for (p = phases; p->name; p++) {
178 		pr_debug("%s(%s)\n", __func__, p->name);
179 		for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) {
180 			err = __mock_hwsp_timeline(&state, na, p->flags);
181 			if (err)
182 				goto out;
183 		}
184 	}
185 
186 out:
187 	for (na = 0; na < state.max; na++)
188 		__mock_hwsp_record(&state, na, NULL);
189 	kfree(state.history);
190 err_put:
191 	mock_destroy_device(i915);
192 	return err;
193 }
194 
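/*
 * One step of the igt_sync() tables below: ask whether @seqno is already
 * considered "later" for the context (the answer must match @expected), and
 * optionally record it (@set) so that subsequent steps observe the update.
 */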
195 struct __igt_sync {
196 	const char *name;
197 	u32 seqno;
198 	bool expected;
199 	bool set;
200 };
201 
202 static int __igt_sync(struct intel_timeline *tl,
203 		      u64 ctx,
204 		      const struct __igt_sync *p,
205 		      const char *name)
206 {
207 	int ret;
208 
209 	if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
210 		pr_err("%s: %s(ctx=%llu, seqno=%u) expected sync_is_later() to say %s\n",
211 		       name, p->name, ctx, p->seqno, yesno(p->expected));
212 		return -EINVAL;
213 	}
214 
215 	if (p->set) {
216 		ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
217 		if (ret)
218 			return ret;
219 	}
220 
221 	return 0;
222 }
223 
224 static int igt_sync(void *arg)
225 {
226 	const struct __igt_sync pass[] = {
227 		{ "unset", 0, false, false },
228 		{ "new", 0, false, true },
229 		{ "0a", 0, true, true },
230 		{ "1a", 1, false, true },
231 		{ "1b", 1, true, true },
232 		{ "0b", 0, true, false },
233 		{ "2a", 2, false, true },
234 		{ "4", 4, false, true },
235 		{ "INT_MAX", INT_MAX, false, true },
236 		{ "INT_MAX-1", INT_MAX-1, true, false },
237 		{ "INT_MAX+1", (u32)INT_MAX+1, false, true },
238 		{ "INT_MAX", INT_MAX, true, false },
239 		{ "UINT_MAX", UINT_MAX, false, true },
240 		{ "wrap", 0, false, true },
241 		{ "unwrap", UINT_MAX, true, false },
242 		{},
243 	}, *p;
244 	struct intel_timeline tl;
245 	int order, offset;
246 	int ret = -ENODEV;
247 
248 	mock_timeline_init(&tl, 0);
249 	for (p = pass; p->name; p++) {
250 		for (order = 1; order < 64; order++) {
251 			for (offset = -1; offset <= (order > 1); offset++) {
252 				u64 ctx = BIT_ULL(order) + offset;
253 
254 				ret = __igt_sync(&tl, ctx, p, "1");
255 				if (ret)
256 					goto out;
257 			}
258 		}
259 	}
260 	mock_timeline_fini(&tl);
261 
262 	mock_timeline_init(&tl, 0);
263 	for (order = 1; order < 64; order++) {
264 		for (offset = -1; offset <= (order > 1); offset++) {
265 			u64 ctx = BIT_ULL(order) + offset;
266 
267 			for (p = pass; p->name; p++) {
268 				ret = __igt_sync(&tl, ctx, p, "2");
269 				if (ret)
270 					goto out;
271 			}
272 		}
273 	}
274 
275 out:
276 	mock_timeline_fini(&tl);
277 	return ret;
278 }
279 
280 static unsigned int random_engine(struct rnd_state *rnd)
281 {
282 	return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd);
283 }
284 
285 static int bench_sync(void *arg)
286 {
287 	struct rnd_state prng;
288 	struct intel_timeline tl;
289 	unsigned long end_time, count;
290 	u64 prng32_1M;
291 	ktime_t kt;
292 	int order, last_order;
293 
294 	mock_timeline_init(&tl, 0);
295 
296 	/* Lookups from cache are very fast and so the random number generation
297 	 * and the loop itself become a significant factor in the per-iteration
298 	 * timings. We try to compensate by measuring the overhead of the prng
299 	 * and subtracting it from the reported results.
300 	 */
301 	prandom_seed_state(&prng, i915_selftest.random_seed);
302 	count = 0;
303 	kt = ktime_get();
304 	end_time = jiffies + HZ/10;
305 	do {
306 		u32 x;
307 
308 		/* Make sure the compiler doesn't optimise away the prng call */
309 		WRITE_ONCE(x, prandom_u32_state(&prng));
310 
311 		count++;
312 	} while (!time_after(jiffies, end_time));
313 	kt = ktime_sub(ktime_get(), kt);
314 	pr_debug("%s: %lu random evaluations, %lluns/prng\n",
315 		 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
316 	prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count);
317 
318 	/* Benchmark (only) setting random context ids */
319 	prandom_seed_state(&prng, i915_selftest.random_seed);
320 	count = 0;
321 	kt = ktime_get();
322 	end_time = jiffies + HZ/10;
323 	do {
324 		u64 id = i915_prandom_u64_state(&prng);
325 
326 		__intel_timeline_sync_set(&tl, id, 0);
327 		count++;
328 	} while (!time_after(jiffies, end_time));
329 	kt = ktime_sub(ktime_get(), kt);
330 	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
331 	pr_info("%s: %lu random insertions, %lluns/insert\n",
332 		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
333 
334 	/* Benchmark looking up the exact same context ids as we just set */
335 	prandom_seed_state(&prng, i915_selftest.random_seed);
336 	end_time = count;
337 	kt = ktime_get();
338 	while (end_time--) {
339 		u64 id = i915_prandom_u64_state(&prng);
340 
341 		if (!__intel_timeline_sync_is_later(&tl, id, 0)) {
342 			mock_timeline_fini(&tl);
343 			pr_err("Lookup of %llu failed\n", id);
344 			return -EINVAL;
345 		}
346 	}
347 	kt = ktime_sub(ktime_get(), kt);
348 	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
349 	pr_info("%s: %lu random lookups, %lluns/lookup\n",
350 		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
351 
352 	mock_timeline_fini(&tl);
353 	cond_resched();
354 
355 	mock_timeline_init(&tl, 0);
356 
357 	/* Benchmark setting the first N (in order) contexts */
358 	count = 0;
359 	kt = ktime_get();
360 	end_time = jiffies + HZ/10;
361 	do {
362 		__intel_timeline_sync_set(&tl, count++, 0);
363 	} while (!time_after(jiffies, end_time));
364 	kt = ktime_sub(ktime_get(), kt);
365 	pr_info("%s: %lu in-order insertions, %lluns/insert\n",
366 		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
367 
368 	/* Benchmark looking up the exact same context ids as we just set */
369 	end_time = count;
370 	kt = ktime_get();
371 	while (end_time--) {
372 		if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) {
373 			pr_err("Lookup of %lu failed\n", end_time);
374 			mock_timeline_fini(&tl);
375 			return -EINVAL;
376 		}
377 	}
378 	kt = ktime_sub(ktime_get(), kt);
379 	pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
380 		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
381 
382 	mock_timeline_fini(&tl);
383 	cond_resched();
384 
385 	mock_timeline_init(&tl, 0);
386 
387 	/* Benchmark searching for a random context id and maybe changing it */
388 	prandom_seed_state(&prng, i915_selftest.random_seed);
389 	count = 0;
390 	kt = ktime_get();
391 	end_time = jiffies + HZ/10;
392 	do {
393 		u32 id = random_engine(&prng);
394 		u32 seqno = prandom_u32_state(&prng);
395 
396 		if (!__intel_timeline_sync_is_later(&tl, id, seqno))
397 			__intel_timeline_sync_set(&tl, id, seqno);
398 
399 		count++;
400 	} while (!time_after(jiffies, end_time));
401 	kt = ktime_sub(ktime_get(), kt);
402 	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
403 	pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
404 		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
405 	mock_timeline_fini(&tl);
406 	cond_resched();
407 
408 	/* Benchmark searching for a known context id and changing the seqno */
409 	for (last_order = 1, order = 1; order < 32;
410 	     ({ int tmp = last_order; last_order = order; order += tmp; })) {
411 		unsigned int mask = BIT(order) - 1;
412 
413 		mock_timeline_init(&tl, 0);
414 
415 		count = 0;
416 		kt = ktime_get();
417 		end_time = jiffies + HZ/10;
418 		do {
419 			/* Without assuming too many details of the underlying
420 			 * implementation, try to identify its phase-changes
421 			 * (if any)!
422 			 */
423 			u64 id = (u64)(count & mask) << order;
424 
425 			__intel_timeline_sync_is_later(&tl, id, 0);
426 			__intel_timeline_sync_set(&tl, id, 0);
427 
428 			count++;
429 		} while (!time_after(jiffies, end_time));
430 		kt = ktime_sub(ktime_get(), kt);
431 		pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
432 			__func__, count, order,
433 			(long long)div64_ul(ktime_to_ns(kt), count));
434 		mock_timeline_fini(&tl);
435 		cond_resched();
436 	}
437 
438 	return 0;
439 }
440 
441 int intel_timeline_mock_selftests(void)
442 {
443 	static const struct i915_subtest tests[] = {
444 		SUBTEST(mock_hwsp_freelist),
445 		SUBTEST(igt_sync),
446 		SUBTEST(bench_sync),
447 	};
448 
449 	return i915_subtests(tests, NULL);
450 }
451 
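/*
 * Emit a single MI_STORE_DWORD_IMM of @value to a GGTT @addr. The command
 * layout differs per generation: gen8+ takes a 64-bit address (low dword
 * then high), gen4-gen7 take a zero pad followed by the 32-bit address, and
 * earlier generations use the legacy encoding padded with a MI_NOOP so that
 * we always emit exactly four dwords.
 */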
452 static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
453 {
454 	u32 *cs;
455 
456 	cs = intel_ring_begin(rq, 4);
457 	if (IS_ERR(cs))
458 		return PTR_ERR(cs);
459 
460 	if (GRAPHICS_VER(rq->engine->i915) >= 8) {
461 		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
462 		*cs++ = addr;
463 		*cs++ = 0;
464 		*cs++ = value;
465 	} else if (GRAPHICS_VER(rq->engine->i915) >= 4) {
466 		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
467 		*cs++ = 0;
468 		*cs++ = addr;
469 		*cs++ = value;
470 	} else {
471 		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
472 		*cs++ = addr;
473 		*cs++ = value;
474 		*cs++ = MI_NOOP;
475 	}
476 
477 	intel_ring_advance(rq, cs);
478 
479 	return 0;
480 }
481 
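/*
 * Pin @tl, sanity check that its breadcrumb slot still reads back the
 * expected seqno, then emit a kernel request on @engine that stores @value
 * into the timeline's HWSP. The caller is responsible for putting the
 * returned request.
 */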
482 static struct i915_request *
483 checked_tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
484 {
485 	struct i915_request *rq;
486 	int err;
487 
488 	err = selftest_tl_pin(tl);
489 	if (err) {
490 		rq = ERR_PTR(err);
491 		goto out;
492 	}
493 
494 	if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) {
495 		pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n",
496 		       *tl->hwsp_seqno, tl->seqno);
497 		intel_timeline_unpin(tl);
498 		return ERR_PTR(-EINVAL);
499 	}
500 
501 	rq = intel_engine_create_kernel_request(engine);
502 	if (IS_ERR(rq))
503 		goto out_unpin;
504 
505 	i915_request_get(rq);
506 
507 	err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
508 	i915_request_add(rq);
509 	if (err) {
510 		i915_request_put(rq);
511 		rq = ERR_PTR(err);
512 	}
513 
514 out_unpin:
515 	intel_timeline_unpin(tl);
516 out:
517 	if (IS_ERR(rq))
518 		pr_err("Failed to write to timeline!\n");
519 	return rq;
520 }
521 
522 static int live_hwsp_engine(void *arg)
523 {
524 #define NUM_TIMELINES 4096
525 	struct intel_gt *gt = arg;
526 	struct intel_timeline **timelines;
527 	struct intel_engine_cs *engine;
528 	enum intel_engine_id id;
529 	unsigned long count, n;
530 	int err = 0;
531 
532 	/*
533 	 * Create a bunch of timelines and check we can write
534 	 * independently to each of their breadcrumb slots.
535 	 */
536 
537 	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
538 				   sizeof(*timelines),
539 				   GFP_KERNEL);
540 	if (!timelines)
541 		return -ENOMEM;
542 
543 	count = 0;
544 	for_each_engine(engine, gt, id) {
545 		if (!intel_engine_can_store_dword(engine))
546 			continue;
547 
548 		intel_engine_pm_get(engine);
549 
550 		for (n = 0; n < NUM_TIMELINES; n++) {
551 			struct intel_timeline *tl;
552 			struct i915_request *rq;
553 
554 			tl = intel_timeline_create(gt);
555 			if (IS_ERR(tl)) {
556 				err = PTR_ERR(tl);
557 				break;
558 			}
559 
560 			rq = checked_tl_write(tl, engine, count);
561 			if (IS_ERR(rq)) {
562 				intel_timeline_put(tl);
563 				err = PTR_ERR(rq);
564 				break;
565 			}
566 
567 			timelines[count++] = tl;
568 			i915_request_put(rq);
569 		}
570 
571 		intel_engine_pm_put(engine);
572 		if (err)
573 			break;
574 	}
575 
576 	if (igt_flush_test(gt->i915))
577 		err = -EIO;
578 
579 	for (n = 0; n < count; n++) {
580 		struct intel_timeline *tl = timelines[n];
581 
582 		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
583 			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
584 				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
585 			GEM_TRACE_DUMP();
586 			err = -EINVAL;
587 		}
588 		intel_timeline_put(tl);
589 	}
590 
591 	kvfree(timelines);
592 	return err;
593 #undef NUM_TIMELINES
594 }
595 
596 static int live_hwsp_alternate(void *arg)
597 {
598 #define NUM_TIMELINES 4096
599 	struct intel_gt *gt = arg;
600 	struct intel_timeline **timelines;
601 	struct intel_engine_cs *engine;
602 	enum intel_engine_id id;
603 	unsigned long count, n;
604 	int err = 0;
605 
606 	/*
607 	 * Create a bunch of timelines and check we can write
608 	 * independently to each of their breadcrumb slots, alternating
609 	 * between engines for successive timelines.
610 	 */
611 
612 	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
613 				   sizeof(*timelines),
614 				   GFP_KERNEL);
615 	if (!timelines)
616 		return -ENOMEM;
617 
618 	count = 0;
619 	for (n = 0; n < NUM_TIMELINES; n++) {
620 		for_each_engine(engine, gt, id) {
621 			struct intel_timeline *tl;
622 			struct i915_request *rq;
623 
624 			if (!intel_engine_can_store_dword(engine))
625 				continue;
626 
627 			tl = intel_timeline_create(gt);
628 			if (IS_ERR(tl)) {
629 				err = PTR_ERR(tl);
630 				goto out;
631 			}
632 
633 			intel_engine_pm_get(engine);
634 			rq = checked_tl_write(tl, engine, count);
635 			intel_engine_pm_put(engine);
636 			if (IS_ERR(rq)) {
637 				intel_timeline_put(tl);
638 				err = PTR_ERR(rq);
639 				goto out;
640 			}
641 
642 			timelines[count++] = tl;
643 			i915_request_put(rq);
644 		}
645 	}
646 
647 out:
648 	if (igt_flush_test(gt->i915))
649 		err = -EIO;
650 
651 	for (n = 0; n < count; n++) {
652 		struct intel_timeline *tl = timelines[n];
653 
654 		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
655 			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
656 				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
657 			GEM_TRACE_DUMP();
658 			err = -EINVAL;
659 		}
660 		intel_timeline_put(tl);
661 	}
662 
663 	kvfree(timelines);
664 	return err;
665 #undef NUM_TIMELINES
666 }
667 
668 static int live_hwsp_wrap(void *arg)
669 {
670 	struct intel_gt *gt = arg;
671 	struct intel_engine_cs *engine;
672 	struct intel_timeline *tl;
673 	enum intel_engine_id id;
674 	int err = 0;
675 
676 	/*
677 	 * Across a seqno wrap, we need to keep the old cacheline alive for
678 	 * foreign GPU references.
679 	 */
680 
681 	tl = intel_timeline_create(gt);
682 	if (IS_ERR(tl))
683 		return PTR_ERR(tl);
684 
685 	if (!tl->has_initial_breadcrumb)
686 		goto out_free;
687 
688 	err = selftest_tl_pin(tl);
689 	if (err)
690 		goto out_free;
691 
692 	for_each_engine(engine, gt, id) {
693 		const u32 *hwsp_seqno[2];
694 		struct i915_request *rq;
695 		u32 seqno[2];
696 
697 		if (!intel_engine_can_store_dword(engine))
698 			continue;
699 
700 		rq = intel_engine_create_kernel_request(engine);
701 		if (IS_ERR(rq)) {
702 			err = PTR_ERR(rq);
703 			goto out;
704 		}
705 
706 		tl->seqno = -4u;
707 
708 		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
709 		err = intel_timeline_get_seqno(tl, rq, &seqno[0]);
710 		mutex_unlock(&tl->mutex);
711 		if (err) {
712 			i915_request_add(rq);
713 			goto out;
714 		}
715 		pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
716 			 seqno[0], tl->hwsp_offset);
717 
718 		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
719 		if (err) {
720 			i915_request_add(rq);
721 			goto out;
722 		}
723 		hwsp_seqno[0] = tl->hwsp_seqno;
724 
725 		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
726 		err = intel_timeline_get_seqno(tl, rq, &seqno[1]);
727 		mutex_unlock(&tl->mutex);
728 		if (err) {
729 			i915_request_add(rq);
730 			goto out;
731 		}
732 		pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
733 			 seqno[1], tl->hwsp_offset);
734 
735 		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
736 		if (err) {
737 			i915_request_add(rq);
738 			goto out;
739 		}
740 		hwsp_seqno[1] = tl->hwsp_seqno;
741 
742 		/* With the wrap should come a new hwsp slot */
743 		GEM_BUG_ON(seqno[1] >= seqno[0]);
744 		GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);
745 
746 		i915_request_add(rq);
747 
748 		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
749 			pr_err("Wait for timeline writes timed out!\n");
750 			err = -EIO;
751 			goto out;
752 		}
753 
754 		if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] ||
755 		    READ_ONCE(*hwsp_seqno[1]) != seqno[1]) {
756 			pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
757 			       *hwsp_seqno[0], *hwsp_seqno[1],
758 			       seqno[0], seqno[1]);
759 			err = -EINVAL;
760 			goto out;
761 		}
762 
763 		intel_gt_retire_requests(gt); /* recycle HWSP */
764 	}
765 
766 out:
767 	if (igt_flush_test(gt->i915))
768 		err = -EIO;
769 
770 	intel_timeline_unpin(tl);
771 out_free:
772 	intel_timeline_put(tl);
773 	return err;
774 }
775 
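/*
 * Record a (seqno, HWSP) pair into the watcher buffer: store the request's
 * @seqno immediately, then use LRM/SRM via a scratch GPR to copy whatever
 * the HWSP contains when the watcher request finally executes into the
 * adjacent dword. check_watcher() later walks and validates these pairs.
 */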
776 static int emit_read_hwsp(struct i915_request *rq,
777 			  u32 seqno, u32 hwsp,
778 			  u32 *addr)
779 {
780 	const u32 gpr = i915_mmio_reg_offset(GEN8_RING_CS_GPR(rq->engine->mmio_base, 0));
781 	u32 *cs;
782 
783 	cs = intel_ring_begin(rq, 12);
784 	if (IS_ERR(cs))
785 		return PTR_ERR(cs);
786 
787 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
788 	*cs++ = *addr;
789 	*cs++ = 0;
790 	*cs++ = seqno;
791 	*addr += 4;
792 
793 	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
794 	*cs++ = gpr;
795 	*cs++ = hwsp;
796 	*cs++ = 0;
797 
798 	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
799 	*cs++ = gpr;
800 	*cs++ = *addr;
801 	*cs++ = 0;
802 	*addr += 4;
803 
804 	intel_ring_advance(rq, cs);
805 
806 	return 0;
807 }
808 
809 struct hwsp_watcher {
810 	struct i915_vma *vma;
811 	struct i915_request *rq;
812 	u32 addr;
813 	u32 *map;
814 };
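
/*
 * A watcher is a long-lived request whose ring is prefilled with HWSP reads
 * (see emit_read_hwsp()) before it is submitted: setup_watcher() provides
 * the result buffer, create_watcher() builds the request, and
 * check_watcher() finally queues it and validates the pairs it recorded.
 */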
815 
816 static bool cmp_lt(u32 a, u32 b)
817 {
818 	return a < b;
819 }
820 
821 static bool cmp_gte(u32 a, u32 b)
822 {
823 	return a >= b;
824 }
825 
826 static int setup_watcher(struct hwsp_watcher *w, struct intel_gt *gt)
827 {
828 	struct drm_i915_gem_object *obj;
829 	struct i915_vma *vma;
830 
831 	obj = i915_gem_object_create_internal(gt->i915, SZ_2M);
832 	if (IS_ERR(obj))
833 		return PTR_ERR(obj);
834 
835 	w->map = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
836 	if (IS_ERR(w->map)) {
837 		i915_gem_object_put(obj);
838 		return PTR_ERR(w->map);
839 	}
840 
841 	vma = i915_gem_object_ggtt_pin(obj, NULL, 0, 0, 0);
842 	if (IS_ERR(vma)) {
843 		i915_gem_object_put(obj);
844 		return PTR_ERR(vma);
845 	}
846 
847 	w->vma = vma;
848 	w->addr = i915_ggtt_offset(vma);
849 	return 0;
850 }
851 
852 static void switch_tl_lock(struct i915_request *from, struct i915_request *to)
853 {
854 	/* some light mutex juggling required; think co-routines */
855 
856 	if (from) {
857 		lockdep_unpin_lock(&from->context->timeline->mutex, from->cookie);
858 		mutex_unlock(&from->context->timeline->mutex);
859 	}
860 
861 	if (to) {
862 		mutex_lock(&to->context->timeline->mutex);
863 		to->cookie = lockdep_pin_lock(&to->context->timeline->mutex);
864 	}
865 }
866 
867 static int create_watcher(struct hwsp_watcher *w,
868 			  struct intel_engine_cs *engine,
869 			  int ringsz)
870 {
871 	struct intel_context *ce;
872 
873 	ce = intel_context_create(engine);
874 	if (IS_ERR(ce))
875 		return PTR_ERR(ce);
876 
877 	ce->ring_size = ringsz;
878 	w->rq = intel_context_create_request(ce);
879 	intel_context_put(ce);
880 	if (IS_ERR(w->rq))
881 		return PTR_ERR(w->rq);
882 
883 	w->addr = i915_ggtt_offset(w->vma);
884 
885 	switch_tl_lock(w->rq, NULL);
886 
887 	return 0;
888 }
889 
890 static int check_watcher(struct hwsp_watcher *w, const char *name,
891 			 bool (*op)(u32 hwsp, u32 seqno))
892 {
893 	struct i915_request *rq = fetch_and_zero(&w->rq);
894 	u32 offset, end;
895 	int err;
896 
897 	GEM_BUG_ON(w->addr - i915_ggtt_offset(w->vma) > w->vma->size);
898 
899 	i915_request_get(rq);
900 	switch_tl_lock(NULL, rq);
901 	i915_request_add(rq);
902 
903 	if (i915_request_wait(rq, 0, HZ) < 0) {
904 		err = -ETIME;
905 		goto out;
906 	}
907 
908 	err = 0;
909 	offset = 0;
910 	end = (w->addr - i915_ggtt_offset(w->vma)) / sizeof(*w->map);
911 	while (offset < end) {
912 		if (!op(w->map[offset + 1], w->map[offset])) {
913 			pr_err("Watcher '%s' found HWSP value %x for seqno %x\n",
914 			       name, w->map[offset + 1], w->map[offset]);
915 			err = -EINVAL;
916 		}
917 
918 		offset += 2;
919 	}
920 
921 out:
922 	i915_request_put(rq);
923 	return err;
924 }
925 
926 static void cleanup_watcher(struct hwsp_watcher *w)
927 {
928 	if (w->rq) {
929 		switch_tl_lock(NULL, w->rq);
930 
931 		i915_request_add(w->rq);
932 	}
933 
934 	i915_vma_unpin_and_release(&w->vma, I915_VMA_RELEASE_MAP);
935 }
936 
937 static bool retire_requests(struct intel_timeline *tl)
938 {
939 	struct i915_request *rq, *rn;
940 
941 	mutex_lock(&tl->mutex);
942 	list_for_each_entry_safe(rq, rn, &tl->requests, link)
943 		if (!i915_request_retire(rq))
944 			break;
945 	mutex_unlock(&tl->mutex);
946 
947 	return !i915_active_fence_isset(&tl->last_request);
948 }
949 
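/*
 * Keep issuing requests on @rq's context until the timeline seqno wraps past
 * the original fence seqno, then return a fresh post-wrap request (with a
 * reference held) for the caller to track.
 */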
950 static struct i915_request *wrap_timeline(struct i915_request *rq)
951 {
952 	struct intel_context *ce = rq->context;
953 	struct intel_timeline *tl = ce->timeline;
954 	u32 seqno = rq->fence.seqno;
955 
956 	while (tl->seqno >= seqno) { /* Cause a wrap */
957 		i915_request_put(rq);
958 		rq = intel_context_create_request(ce);
959 		if (IS_ERR(rq))
960 			return rq;
961 
962 		i915_request_get(rq);
963 		i915_request_add(rq);
964 	}
965 
966 	i915_request_put(rq);
967 	rq = i915_request_create(ce);
968 	if (IS_ERR(rq))
969 		return rq;
970 
971 	i915_request_get(rq);
972 	i915_request_add(rq);
973 
974 	return rq;
975 }
976 
977 static int live_hwsp_read(void *arg)
978 {
979 	struct intel_gt *gt = arg;
980 	struct hwsp_watcher watcher[2] = {};
981 	struct intel_engine_cs *engine;
982 	struct intel_timeline *tl;
983 	enum intel_engine_id id;
984 	int err = 0;
985 	int i;
986 
987 	/*
988 	 * If we take a reference to the HWSP for reading on the GPU, that
989 	 * read may be arbitrarily delayed (either by foreign fence or
990 	 * priority saturation) and a wrap can happen within 30 minutes.
991 	 * When the GPU read is finally submitted it should be correct,
992 	 * even across multiple wraps.
993 	 */
994 
995 	if (GRAPHICS_VER(gt->i915) < 8) /* CS convenience [SRM/LRM] */
996 		return 0;
997 
998 	tl = intel_timeline_create(gt);
999 	if (IS_ERR(tl))
1000 		return PTR_ERR(tl);
1001 
1002 	if (!tl->has_initial_breadcrumb)
1003 		goto out_free;
1004 
1005 	for (i = 0; i < ARRAY_SIZE(watcher); i++) {
1006 		err = setup_watcher(&watcher[i], gt);
1007 		if (err)
1008 			goto out;
1009 	}
1010 
1011 	for_each_engine(engine, gt, id) {
1012 		struct intel_context *ce;
1013 		unsigned long count = 0;
1014 		IGT_TIMEOUT(end_time);
1015 
1016 		/* Create a request we can use for remote reading of the HWSP */
1017 		err = create_watcher(&watcher[1], engine, SZ_512K);
1018 		if (err)
1019 			goto out;
1020 
1021 		do {
1022 			struct i915_sw_fence *submit;
1023 			struct i915_request *rq;
1024 			u32 hwsp, dummy;
1025 
1026 			submit = heap_fence_create(GFP_KERNEL);
1027 			if (!submit) {
1028 				err = -ENOMEM;
1029 				goto out;
1030 			}
1031 
1032 			err = create_watcher(&watcher[0], engine, SZ_4K);
1033 			if (err)
1034 				goto out;
1035 
1036 			ce = intel_context_create(engine);
1037 			if (IS_ERR(ce)) {
1038 				err = PTR_ERR(ce);
1039 				goto out;
1040 			}
1041 
1042 			ce->timeline = intel_timeline_get(tl);
1043 
1044 			/* Ensure timeline is mapped, done during first pin */
1045 			err = intel_context_pin(ce);
1046 			if (err) {
1047 				intel_context_put(ce);
1048 				goto out;
1049 			}
1050 
1051 			/*
1052 			 * Start at a new wrap, and set seqno right before another wrap,
1053 			 * saving 30 minutes of nops
1054 			 */
1055 			tl->seqno = -12u + 2 * (count & 3);
1056 			__intel_timeline_get_seqno(tl, &dummy);
1057 
1058 			rq = i915_request_create(ce);
1059 			if (IS_ERR(rq)) {
1060 				err = PTR_ERR(rq);
1061 				intel_context_unpin(ce);
1062 				intel_context_put(ce);
1063 				goto out;
1064 			}
1065 
1066 			err = i915_sw_fence_await_dma_fence(&rq->submit,
1067 							    &watcher[0].rq->fence, 0,
1068 							    GFP_KERNEL);
1069 			if (err < 0) {
1070 				i915_request_add(rq);
1071 				intel_context_unpin(ce);
1072 				intel_context_put(ce);
1073 				goto out;
1074 			}
1075 
1076 			switch_tl_lock(rq, watcher[0].rq);
1077 			err = intel_timeline_read_hwsp(rq, watcher[0].rq, &hwsp);
1078 			if (err == 0)
1079 				err = emit_read_hwsp(watcher[0].rq, /* before */
1080 						     rq->fence.seqno, hwsp,
1081 						     &watcher[0].addr);
1082 			switch_tl_lock(watcher[0].rq, rq);
1083 			if (err) {
1084 				i915_request_add(rq);
1085 				intel_context_unpin(ce);
1086 				intel_context_put(ce);
1087 				goto out;
1088 			}
1089 
1090 			switch_tl_lock(rq, watcher[1].rq);
1091 			err = intel_timeline_read_hwsp(rq, watcher[1].rq, &hwsp);
1092 			if (err == 0)
1093 				err = emit_read_hwsp(watcher[1].rq, /* after */
1094 						     rq->fence.seqno, hwsp,
1095 						     &watcher[1].addr);
1096 			switch_tl_lock(watcher[1].rq, rq);
1097 			if (err) {
1098 				i915_request_add(rq);
1099 				intel_context_unpin(ce);
1100 				intel_context_put(ce);
1101 				goto out;
1102 			}
1103 
1104 			i915_request_get(rq);
1105 			i915_request_add(rq);
1106 
1107 			rq = wrap_timeline(rq);
1108 			intel_context_unpin(ce);
1109 			intel_context_put(ce);
1110 			if (IS_ERR(rq)) {
1111 				err = PTR_ERR(rq);
1112 				goto out;
1113 			}
1114 
1115 			err = i915_sw_fence_await_dma_fence(&watcher[1].rq->submit,
1116 							    &rq->fence, 0,
1117 							    GFP_KERNEL);
1118 			if (err < 0) {
1119 				i915_request_put(rq);
1120 				goto out;
1121 			}
1122 
1123 			err = check_watcher(&watcher[0], "before", cmp_lt);
1124 			i915_sw_fence_commit(submit);
1125 			heap_fence_put(submit);
1126 			if (err) {
1127 				i915_request_put(rq);
1128 				goto out;
1129 			}
1130 			count++;
1131 
1132 			/* Flush the timeline before manually wrapping again */
1133 			if (i915_request_wait(rq,
1134 					      I915_WAIT_INTERRUPTIBLE,
1135 					      HZ) < 0) {
1136 				err = -ETIME;
1137 				i915_request_put(rq);
1138 				goto out;
1139 			}
1140 			retire_requests(tl);
1141 			i915_request_put(rq);
1142 
1143 			/* Single requests are limited to half a ring at most */
1144 			if (8 * watcher[1].rq->ring->emit >
1145 			    3 * watcher[1].rq->ring->size)
1146 				break;
1147 
1148 		} while (!__igt_timeout(end_time, NULL) &&
1149 			 count < (PAGE_SIZE / TIMELINE_SEQNO_BYTES - 1) / 2);
1150 
1151 		pr_info("%s: simulated %lu wraps\n", engine->name, count);
1152 		err = check_watcher(&watcher[1], "after", cmp_gte);
1153 		if (err)
1154 			goto out;
1155 	}
1156 
1157 out:
1158 	for (i = 0; i < ARRAY_SIZE(watcher); i++)
1159 		cleanup_watcher(&watcher[i]);
1160 
1161 	if (igt_flush_test(gt->i915))
1162 		err = -EIO;
1163 
1164 out_free:
1165 	intel_timeline_put(tl);
1166 	return err;
1167 }
1168 
1169 static int live_hwsp_rollover_kernel(void *arg)
1170 {
1171 	struct intel_gt *gt = arg;
1172 	struct intel_engine_cs *engine;
1173 	enum intel_engine_id id;
1174 	int err = 0;
1175 
1176 	/*
1177 	 * Run the host for long enough, and even the kernel context will
1178 	 * see a seqno rollover.
1179 	 */
1180 
1181 	for_each_engine(engine, gt, id) {
1182 		struct intel_context *ce = engine->kernel_context;
1183 		struct intel_timeline *tl = ce->timeline;
1184 		struct i915_request *rq[3] = {};
1185 		int i;
1186 
1187 		st_engine_heartbeat_disable(engine);
1188 		if (intel_gt_wait_for_idle(gt, HZ / 2)) {
1189 			err = -EIO;
1190 			goto out;
1191 		}
1192 
1193 		GEM_BUG_ON(i915_active_fence_isset(&tl->last_request));
1194 		tl->seqno = -2u;
1195 		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
1196 
1197 		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1198 			struct i915_request *this;
1199 
1200 			this = i915_request_create(ce);
1201 			if (IS_ERR(this)) {
1202 				err = PTR_ERR(this);
1203 				goto out;
1204 			}
1205 
1206 			pr_debug("%s: create fence.seqno:%d\n",
1207 				 engine->name,
1208 				 lower_32_bits(this->fence.seqno));
1209 
1210 			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);
1211 
1212 			rq[i] = i915_request_get(this);
1213 			i915_request_add(this);
1214 		}
1215 
1216 		/* We expect the seqno to have wrapped! */
1217 		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);
1218 
1219 		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
1220 			pr_err("Wait for timeline wrap timed out!\n");
1221 			err = -EIO;
1222 			goto out;
1223 		}
1224 
1225 		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1226 			if (!i915_request_completed(rq[i])) {
1227 				pr_err("Pre-wrap request not completed!\n");
1228 				err = -EINVAL;
1229 				goto out;
1230 			}
1231 		}
1232 
1233 out:
1234 		for (i = 0; i < ARRAY_SIZE(rq); i++)
1235 			i915_request_put(rq[i]);
1236 		st_engine_heartbeat_enable(engine);
1237 		if (err)
1238 			break;
1239 	}
1240 
1241 	if (igt_flush_test(gt->i915))
1242 		err = -EIO;
1243 
1244 	return err;
1245 }
1246 
1247 static int live_hwsp_rollover_user(void *arg)
1248 {
1249 	struct intel_gt *gt = arg;
1250 	struct intel_engine_cs *engine;
1251 	enum intel_engine_id id;
1252 	int err = 0;
1253 
1254 	/*
1255 	 * Simulate a long-running user context, and force the seqno wrap
1256 	 * on the user's timeline.
1257 	 */
1258 
1259 	for_each_engine(engine, gt, id) {
1260 		struct i915_request *rq[3] = {};
1261 		struct intel_timeline *tl;
1262 		struct intel_context *ce;
1263 		int i;
1264 
1265 		ce = intel_context_create(engine);
1266 		if (IS_ERR(ce))
1267 			return PTR_ERR(ce);
1268 
1269 		err = intel_context_alloc_state(ce);
1270 		if (err)
1271 			goto out;
1272 
1273 		tl = ce->timeline;
1274 		if (!tl->has_initial_breadcrumb)
1275 			goto out;
1276 
1277 		err = intel_context_pin(ce);
1278 		if (err)
1279 			goto out;
1280 
1281 		tl->seqno = -4u;
1282 		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
1283 
1284 		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1285 			struct i915_request *this;
1286 
1287 			this = intel_context_create_request(ce);
1288 			if (IS_ERR(this)) {
1289 				err = PTR_ERR(this);
1290 				goto out_unpin;
1291 			}
1292 
1293 			pr_debug("%s: create fence.seqno:%d\n",
1294 				 engine->name,
1295 				 lower_32_bits(this->fence.seqno));
1296 
1297 			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);
1298 
1299 			rq[i] = i915_request_get(this);
1300 			i915_request_add(this);
1301 		}
1302 
1303 		/* We expect the seqno to have wrapped! */
1304 		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);
1305 
1306 		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
1307 			pr_err("Wait for timeline wrap timed out!\n");
1308 			err = -EIO;
1309 			goto out_unpin;
1310 		}
1311 
1312 		for (i = 0; i < ARRAY_SIZE(rq); i++) {
1313 			if (!i915_request_completed(rq[i])) {
1314 				pr_err("Pre-wrap request not completed!\n");
1315 				err = -EINVAL;
1316 				goto out_unpin;
1317 			}
1318 		}
1319 out_unpin:
1320 		intel_context_unpin(ce);
1321 out:
1322 		for (i = 0; i < ARRAY_SIZE(rq); i++)
1323 			i915_request_put(rq[i]);
1324 		intel_context_put(ce);
1325 		if (err)
1326 			break;
1327 	}
1328 
1329 	if (igt_flush_test(gt->i915))
1330 		err = -EIO;
1331 
1332 	return err;
1333 }
1334 
1335 static int live_hwsp_recycle(void *arg)
1336 {
1337 	struct intel_gt *gt = arg;
1338 	struct intel_engine_cs *engine;
1339 	enum intel_engine_id id;
1340 	unsigned long count;
1341 	int err = 0;
1342 
1343 	/*
1344 	 * Check seqno writes into one timeline at a time. We expect to
1345 	 * recycle the breadcrumb slot between iterations and want to
1346 	 * confuse neither ourselves nor the GPU.
1347 	 */
1348 
1349 	count = 0;
1350 	for_each_engine(engine, gt, id) {
1351 		IGT_TIMEOUT(end_time);
1352 
1353 		if (!intel_engine_can_store_dword(engine))
1354 			continue;
1355 
1356 		intel_engine_pm_get(engine);
1357 
1358 		do {
1359 			struct intel_timeline *tl;
1360 			struct i915_request *rq;
1361 
1362 			tl = intel_timeline_create(gt);
1363 			if (IS_ERR(tl)) {
1364 				err = PTR_ERR(tl);
1365 				break;
1366 			}
1367 
1368 			rq = checked_tl_write(tl, engine, count);
1369 			if (IS_ERR(rq)) {
1370 				intel_timeline_put(tl);
1371 				err = PTR_ERR(rq);
1372 				break;
1373 			}
1374 
1375 			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
1376 				pr_err("Wait for timeline writes timed out!\n");
1377 				i915_request_put(rq);
1378 				intel_timeline_put(tl);
1379 				err = -EIO;
1380 				break;
1381 			}
1382 
1383 			if (READ_ONCE(*tl->hwsp_seqno) != count) {
1384 				GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x found 0x%x\n",
1385 					      count, tl->fence_context,
1386 					      tl->hwsp_offset, *tl->hwsp_seqno);
1387 				GEM_TRACE_DUMP();
1388 				err = -EINVAL;
1389 			}
1390 
1391 			i915_request_put(rq);
1392 			intel_timeline_put(tl);
1393 			count++;
1394 
1395 			if (err)
1396 				break;
1397 		} while (!__igt_timeout(end_time, NULL));
1398 
1399 		intel_engine_pm_put(engine);
1400 		if (err)
1401 			break;
1402 	}
1403 
1404 	return err;
1405 }
1406 
1407 int intel_timeline_live_selftests(struct drm_i915_private *i915)
1408 {
1409 	static const struct i915_subtest tests[] = {
1410 		SUBTEST(live_hwsp_recycle),
1411 		SUBTEST(live_hwsp_engine),
1412 		SUBTEST(live_hwsp_alternate),
1413 		SUBTEST(live_hwsp_wrap),
1414 		SUBTEST(live_hwsp_read),
1415 		SUBTEST(live_hwsp_rollover_kernel),
1416 		SUBTEST(live_hwsp_rollover_user),
1417 	};
1418 
1419 	if (intel_gt_is_wedged(to_gt(i915)))
1420 		return 0;
1421 
1422 	return intel_gt_live_subtests(tests, to_gt(i915));
1423 }
1424