/*
 * SPDX-License-Identifier: MIT
 *
 * Copyright © 2017-2018 Intel Corporation
 */

#include <linux/prime_numbers.h>

#include "intel_context.h"
#include "intel_engine_heartbeat.h"
#include "intel_engine_pm.h"
#include "intel_gpu_commands.h"
#include "intel_gt.h"
#include "intel_gt_requests.h"
#include "intel_ring.h"
#include "selftest_engine_heartbeat.h"

#include "../selftests/i915_random.h"
#include "../i915_selftest.h"

#include "selftests/igt_flush_test.h"
#include "selftests/lib_sw_fence.h"
#include "selftests/mock_gem_device.h"
#include "selftests/mock_timeline.h"

static struct page *hwsp_page(struct intel_timeline *tl)
{
	struct drm_i915_gem_object *obj = tl->hwsp_ggtt->obj;

	GEM_BUG_ON(!i915_gem_object_has_pinned_pages(obj));
	return sg_page(obj->mm.pages->sgl);
}

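/*
 * Identify the cacheline backing a timeline's HWSP slot: the kernel address
 * of the HWSP page plus the slot offset, divided by the cacheline size,
 * gives a key that is unique for as long as the slot remains allocated.
 */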
static unsigned long hwsp_cacheline(struct intel_timeline *tl)
{
	unsigned long address = (unsigned long)page_address(hwsp_page(tl));

	return (address + tl->hwsp_offset) / CACHELINE_BYTES;
}

#define CACHELINES_PER_PAGE (PAGE_SIZE / CACHELINE_BYTES)

struct mock_hwsp_freelist {
	struct intel_gt *gt;
	struct radix_tree_root cachelines;
	struct intel_timeline **history;
	unsigned long count, max;
	struct rnd_state prng;
};

enum {
	SHUFFLE = BIT(0),
};

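/*
 * Store @tl in the history slot @idx. If the slot was already occupied, the
 * previous timeline is evicted: its cacheline is removed from the tracking
 * radix tree and its reference dropped. Passing tl == NULL simply releases
 * whatever the slot currently holds.
 */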
static void __mock_hwsp_record(struct mock_hwsp_freelist *state,
			       unsigned int idx,
			       struct intel_timeline *tl)
{
	tl = xchg(&state->history[idx], tl);
	if (tl) {
		radix_tree_delete(&state->cachelines, hwsp_cacheline(tl));
		intel_timeline_put(tl);
	}
}

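/*
 * Create @count timelines, recording the HWSP cacheline of each in the radix
 * tree. An -EEXIST on insertion means two live timelines were handed the same
 * cacheline, i.e. a duplicate HWSP allocation. Afterwards, optionally shuffle
 * the history and release a random number of timelines so that their slots
 * return to the freelist for the next iteration.
 */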
static int __mock_hwsp_timeline(struct mock_hwsp_freelist *state,
				unsigned int count,
				unsigned int flags)
{
	struct intel_timeline *tl;
	unsigned int idx;

	while (count--) {
		unsigned long cacheline;
		int err;

		tl = intel_timeline_create(state->gt);
		if (IS_ERR(tl))
			return PTR_ERR(tl);

		cacheline = hwsp_cacheline(tl);
		err = radix_tree_insert(&state->cachelines, cacheline, tl);
		if (err) {
			if (err == -EEXIST) {
				pr_err("HWSP cacheline %lu already used; duplicate allocation!\n",
				       cacheline);
			}
			intel_timeline_put(tl);
			return err;
		}

		idx = state->count++ % state->max;
		__mock_hwsp_record(state, idx, tl);
	}

	if (flags & SHUFFLE)
		i915_prandom_shuffle(state->history,
				     sizeof(*state->history),
				     min(state->count, state->max),
				     &state->prng);

	count = i915_prandom_u32_max_state(min(state->count, state->max),
					   &state->prng);
	while (count--) {
		idx = --state->count % state->max;
		__mock_hwsp_record(state, idx, NULL);
	}

	return 0;
}

static int mock_hwsp_freelist(void *arg)
{
	struct mock_hwsp_freelist state;
	struct drm_i915_private *i915;
	const struct {
		const char *name;
		unsigned int flags;
	} phases[] = {
		{ "linear", 0 },
		{ "shuffled", SHUFFLE },
		{ },
	}, *p;
	unsigned int na;
	int err = 0;

	i915 = mock_gem_device();
	if (!i915)
		return -ENOMEM;

	INIT_RADIX_TREE(&state.cachelines, GFP_KERNEL);
	state.prng = I915_RND_STATE_INITIALIZER(i915_selftest.random_seed);

	state.gt = &i915->gt;

	/*
	 * Create a bunch of timelines and check that their HWSPs do not
	 * overlap. Free some, and try again.
	 */

	state.max = PAGE_SIZE / sizeof(*state.history);
	state.count = 0;
	state.history = kcalloc(state.max, sizeof(*state.history), GFP_KERNEL);
	if (!state.history) {
		err = -ENOMEM;
		goto err_put;
	}

	for (p = phases; p->name; p++) {
		pr_debug("%s(%s)\n", __func__, p->name);
		for_each_prime_number_from(na, 1, 2 * CACHELINES_PER_PAGE) {
			err = __mock_hwsp_timeline(&state, na, p->flags);
			if (err)
				goto out;
		}
	}

out:
	for (na = 0; na < state.max; na++)
		__mock_hwsp_record(&state, na, NULL);
	kfree(state.history);
err_put:
	mock_destroy_device(i915);
	return err;
}

struct __igt_sync {
	const char *name;
	u32 seqno;
	bool expected;
	bool set;
};

static int __igt_sync(struct intel_timeline *tl,
		      u64 ctx,
		      const struct __igt_sync *p,
		      const char *name)
{
	int ret;

	if (__intel_timeline_sync_is_later(tl, ctx, p->seqno) != p->expected) {
		pr_err("%s: %s(ctx=%llu, seqno=%u) expected passed %s but failed\n",
		       name, p->name, ctx, p->seqno, yesno(p->expected));
		return -EINVAL;
	}

	if (p->set) {
		ret = __intel_timeline_sync_set(tl, ctx, p->seqno);
		if (ret)
			return ret;
	}

	return 0;
}

static int igt_sync(void *arg)
{
	const struct __igt_sync pass[] = {
		{ "unset", 0, false, false },
		{ "new", 0, false, true },
		{ "0a", 0, true, true },
		{ "1a", 1, false, true },
		{ "1b", 1, true, true },
		{ "0b", 0, true, false },
		{ "2a", 2, false, true },
		{ "4", 4, false, true },
		{ "INT_MAX", INT_MAX, false, true },
		{ "INT_MAX-1", INT_MAX-1, true, false },
		{ "INT_MAX+1", (u32)INT_MAX+1, false, true },
		{ "INT_MAX", INT_MAX, true, false },
		{ "UINT_MAX", UINT_MAX, false, true },
		{ "wrap", 0, false, true },
		{ "unwrap", UINT_MAX, true, false },
		{},
	}, *p;
	struct intel_timeline tl;
	int order, offset;
	int ret = -ENODEV;

	mock_timeline_init(&tl, 0);
	for (p = pass; p->name; p++) {
		for (order = 1; order < 64; order++) {
			for (offset = -1; offset <= (order > 1); offset++) {
				u64 ctx = BIT_ULL(order) + offset;

				ret = __igt_sync(&tl, ctx, p, "1");
				if (ret)
					goto out;
			}
		}
	}
	mock_timeline_fini(&tl);

	mock_timeline_init(&tl, 0);
	for (order = 1; order < 64; order++) {
		for (offset = -1; offset <= (order > 1); offset++) {
			u64 ctx = BIT_ULL(order) + offset;

			for (p = pass; p->name; p++) {
				ret = __igt_sync(&tl, ctx, p, "2");
				if (ret)
					goto out;
			}
		}
	}

out:
	mock_timeline_fini(&tl);
	return ret;
}

static unsigned int random_engine(struct rnd_state *rnd)
{
	return i915_prandom_u32_max_state(I915_NUM_ENGINES, rnd);
}

static int bench_sync(void *arg)
{
	struct rnd_state prng;
	struct intel_timeline tl;
	unsigned long end_time, count;
	u64 prng32_1M;
	ktime_t kt;
	int order, last_order;

	mock_timeline_init(&tl, 0);

	/* Lookups from cache are very fast and so the random number generation
	 * and the loop itself become a significant factor in the per-iteration
	 * timings. We try to compensate by measuring the overhead of the prng
	 * and subtracting it from the reported results.
	 */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 x;

		/* Make sure the compiler doesn't optimise away the prng call */
		WRITE_ONCE(x, prandom_u32_state(&prng));

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_debug("%s: %lu random evaluations, %lluns/prng\n",
		 __func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	prng32_1M = div64_ul(ktime_to_ns(kt) << 20, count);

	/* Benchmark (only) setting random context ids */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u64 id = i915_prandom_u64_state(&prng);

		__intel_timeline_sync_set(&tl, id, 0);
		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		u64 id = i915_prandom_u64_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, 0)) {
			mock_timeline_fini(&tl);
			pr_err("Lookup of %llu failed\n", id);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu random lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark setting the first N (in order) contexts */
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		__intel_timeline_sync_set(&tl, count++, 0);
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order insertions, %lluns/insert\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	/* Benchmark looking up the exact same context ids as we just set */
	end_time = count;
	kt = ktime_get();
	while (end_time--) {
		if (!__intel_timeline_sync_is_later(&tl, end_time, 0)) {
			pr_err("Lookup of %lu failed\n", end_time);
			mock_timeline_fini(&tl);
			return -EINVAL;
		}
	}
	kt = ktime_sub(ktime_get(), kt);
	pr_info("%s: %lu in-order lookups, %lluns/lookup\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));

	mock_timeline_fini(&tl);
	cond_resched();

	mock_timeline_init(&tl, 0);

	/* Benchmark searching for a random context id and maybe changing it */
	prandom_seed_state(&prng, i915_selftest.random_seed);
	count = 0;
	kt = ktime_get();
	end_time = jiffies + HZ/10;
	do {
		u32 id = random_engine(&prng);
		u32 seqno = prandom_u32_state(&prng);

		if (!__intel_timeline_sync_is_later(&tl, id, seqno))
			__intel_timeline_sync_set(&tl, id, seqno);

		count++;
	} while (!time_after(jiffies, end_time));
	kt = ktime_sub(ktime_get(), kt);
	kt = ktime_sub_ns(kt, (count * prng32_1M * 2) >> 20);
	pr_info("%s: %lu repeated insert/lookups, %lluns/op\n",
		__func__, count, (long long)div64_ul(ktime_to_ns(kt), count));
	mock_timeline_fini(&tl);
	cond_resched();

	/* Benchmark searching for a known context id and changing the seqno */
	for (last_order = 1, order = 1; order < 32;
	     ({ int tmp = last_order; last_order = order; order += tmp; })) {
		unsigned int mask = BIT(order) - 1;

		mock_timeline_init(&tl, 0);

		count = 0;
		kt = ktime_get();
		end_time = jiffies + HZ/10;
		do {
			/* Without assuming too many details of the underlying
			 * implementation, try to identify its phase-changes
			 * (if any)!
			 */
			u64 id = (u64)(count & mask) << order;

			__intel_timeline_sync_is_later(&tl, id, 0);
			__intel_timeline_sync_set(&tl, id, 0);

			count++;
		} while (!time_after(jiffies, end_time));
		kt = ktime_sub(ktime_get(), kt);
		pr_info("%s: %lu cyclic/%d insert/lookups, %lluns/op\n",
			__func__, count, order,
			(long long)div64_ul(ktime_to_ns(kt), count));
		mock_timeline_fini(&tl);
		cond_resched();
	}

	return 0;
}

int intel_timeline_mock_selftests(void)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(mock_hwsp_freelist),
		SUBTEST(igt_sync),
		SUBTEST(bench_sync),
	};

	return i915_subtests(tests, NULL);
}

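/*
 * Emit the gen-specific MI_STORE_DWORD_IMM to write @value into the global
 * GTT at @addr: gen8+ takes a 64-bit address (low then high dword), gen4-7
 * take a reserved zero dword followed by the address, and earlier gens use
 * the legacy MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL encoding.
 */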
static int emit_ggtt_store_dw(struct i915_request *rq, u32 addr, u32 value)
{
	u32 *cs;

	cs = intel_ring_begin(rq, 4);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	if (INTEL_GEN(rq->engine->i915) >= 8) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = addr;
		*cs++ = 0;
		*cs++ = value;
	} else if (INTEL_GEN(rq->engine->i915) >= 4) {
		*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
		*cs++ = 0;
		*cs++ = addr;
		*cs++ = value;
	} else {
		*cs++ = MI_STORE_DWORD_IMM | MI_MEM_VIRTUAL;
		*cs++ = addr;
		*cs++ = value;
		*cs++ = MI_NOOP;
	}

	intel_ring_advance(rq, cs);

	return 0;
}

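/*
 * Pin the timeline's HWSP into the GGTT and submit a kernel request on
 * @engine that writes @value into the timeline's breadcrumb slot. Returns
 * the request with a reference held, or an ERR_PTR on failure.
 */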
static struct i915_request *
tl_write(struct intel_timeline *tl, struct intel_engine_cs *engine, u32 value)
{
	struct i915_request *rq;
	int err;

	err = intel_timeline_pin(tl, NULL);
	if (err) {
		rq = ERR_PTR(err);
		goto out;
	}

	rq = intel_engine_create_kernel_request(engine);
	if (IS_ERR(rq))
		goto out_unpin;

	i915_request_get(rq);

	err = emit_ggtt_store_dw(rq, tl->hwsp_offset, value);
	i915_request_add(rq);
	if (err) {
		i915_request_put(rq);
		rq = ERR_PTR(err);
	}

out_unpin:
	intel_timeline_unpin(tl);
out:
	if (IS_ERR(rq))
		pr_err("Failed to write to timeline!\n");
	return rq;
}

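/*
 * Create a timeline and verify that the breadcrumb stored in its HWSP
 * matches the timeline's initial seqno before handing it to the caller.
 */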
static struct intel_timeline *
checked_intel_timeline_create(struct intel_gt *gt)
{
	struct intel_timeline *tl;

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return tl;

	if (READ_ONCE(*tl->hwsp_seqno) != tl->seqno) {
		pr_err("Timeline created with incorrect breadcrumb, found %x, expected %x\n",
		       *tl->hwsp_seqno, tl->seqno);
		intel_timeline_put(tl);
		return ERR_PTR(-EINVAL);
	}

	return tl;
}

static int live_hwsp_engine(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for_each_engine(engine, gt, id) {
		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		for (n = 0; n < NUM_TIMELINES; n++) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
			GEM_TRACE_DUMP();
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}

static int live_hwsp_alternate(void *arg)
{
#define NUM_TIMELINES 4096
	struct intel_gt *gt = arg;
	struct intel_timeline **timelines;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count, n;
	int err = 0;

	/*
	 * Create a bunch of timelines and check we can write
	 * independently to each of their breadcrumb slots with adjacent
	 * engines.
	 */

	timelines = kvmalloc_array(NUM_TIMELINES * I915_NUM_ENGINES,
				   sizeof(*timelines),
				   GFP_KERNEL);
	if (!timelines)
		return -ENOMEM;

	count = 0;
	for (n = 0; n < NUM_TIMELINES; n++) {
		for_each_engine(engine, gt, id) {
			struct intel_timeline *tl;
			struct i915_request *rq;

			if (!intel_engine_can_store_dword(engine))
				continue;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				goto out;
			}

			intel_engine_pm_get(engine);
			rq = tl_write(tl, engine, count);
			intel_engine_pm_put(engine);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				goto out;
			}

			timelines[count++] = tl;
			i915_request_put(rq);
		}
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	for (n = 0; n < count; n++) {
		struct intel_timeline *tl = timelines[n];

		if (!err && READ_ONCE(*tl->hwsp_seqno) != n) {
			GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x, found 0x%x\n",
				      n, tl->fence_context, tl->hwsp_offset, *tl->hwsp_seqno);
			GEM_TRACE_DUMP();
			err = -EINVAL;
		}
		intel_timeline_put(tl);
	}

	kvfree(timelines);
	return err;
#undef NUM_TIMELINES
}

static int live_hwsp_wrap(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Across a seqno wrap, we need to keep the old cacheline alive for
	 * foreign GPU references.
	 */

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline)
		goto out_free;

	err = intel_timeline_pin(tl, NULL);
	if (err)
		goto out_free;

	for_each_engine(engine, gt, id) {
		const u32 *hwsp_seqno[2];
		struct i915_request *rq;
		u32 seqno[2];

		if (!intel_engine_can_store_dword(engine))
			continue;

		rq = intel_engine_create_kernel_request(engine);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto out;
		}

		tl->seqno = -4u;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[0]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[0]:%08x, hwsp_offset:%08x\n",
			 seqno[0], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[0]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[0] = tl->hwsp_seqno;

		mutex_lock_nested(&tl->mutex, SINGLE_DEPTH_NESTING);
		err = intel_timeline_get_seqno(tl, rq, &seqno[1]);
		mutex_unlock(&tl->mutex);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		pr_debug("seqno[1]:%08x, hwsp_offset:%08x\n",
			 seqno[1], tl->hwsp_offset);

		err = emit_ggtt_store_dw(rq, tl->hwsp_offset, seqno[1]);
		if (err) {
			i915_request_add(rq);
			goto out;
		}
		hwsp_seqno[1] = tl->hwsp_seqno;

		/* With wrap should come a new hwsp */
		GEM_BUG_ON(seqno[1] >= seqno[0]);
		GEM_BUG_ON(hwsp_seqno[0] == hwsp_seqno[1]);

		i915_request_add(rq);

		if (i915_request_wait(rq, 0, HZ / 5) < 0) {
			pr_err("Wait for timeline writes timed out!\n");
			err = -EIO;
			goto out;
		}

		if (READ_ONCE(*hwsp_seqno[0]) != seqno[0] ||
		    READ_ONCE(*hwsp_seqno[1]) != seqno[1]) {
			pr_err("Bad timeline values: found (%x, %x), expected (%x, %x)\n",
			       *hwsp_seqno[0], *hwsp_seqno[1],
			       seqno[0], seqno[1]);
			err = -EINVAL;
			goto out;
		}

		intel_gt_retire_requests(gt); /* recycle HWSP */
	}

out:
	if (igt_flush_test(gt->i915))
		err = -EIO;

	intel_timeline_unpin(tl);
out_free:
	intel_timeline_put(tl);
	return err;
}

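/*
 * Record a (seqno, hwsp) pair into the results buffer at *@addr: first store
 * @seqno, then use LRM/SRM via CS GPR0 to copy the current value at @hwsp
 * alongside it. *@addr is advanced past the pair so that successive calls
 * append to the buffer, which check_watcher() later walks.
 */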
static int emit_read_hwsp(struct i915_request *rq,
			  u32 seqno, u32 hwsp,
			  u32 *addr)
{
	const u32 gpr = i915_mmio_reg_offset(GEN8_RING_CS_GPR(rq->engine->mmio_base, 0));
	u32 *cs;

	cs = intel_ring_begin(rq, 12);
	if (IS_ERR(cs))
		return PTR_ERR(cs);

	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
	*cs++ = *addr;
	*cs++ = 0;
	*cs++ = seqno;
	*addr += 4;

	*cs++ = MI_LOAD_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = gpr;
	*cs++ = hwsp;
	*cs++ = 0;

	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
	*cs++ = gpr;
	*cs++ = *addr;
	*cs++ = 0;
	*addr += 4;

	intel_ring_advance(rq, cs);

	return 0;
}

struct hwsp_watcher {
	struct i915_vma *vma;
	struct i915_request *rq;
	u32 addr;
	u32 *map;
};

static bool cmp_lt(u32 a, u32 b)
{
	return a < b;
}

static bool cmp_gte(u32 a, u32 b)
{
	return a >= b;
}

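/*
 * Allocate the results buffer for a watcher: a CPU-mappable internal object,
 * pinned into the GGTT so the GPU can store (seqno, hwsp) pairs into it.
 */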
static int setup_watcher(struct hwsp_watcher *w, struct intel_gt *gt)
{
	struct drm_i915_gem_object *obj;
	struct i915_vma *vma;

	obj = i915_gem_object_create_internal(gt->i915, SZ_2M);
	if (IS_ERR(obj))
		return PTR_ERR(obj);

	w->map = i915_gem_object_pin_map(obj, I915_MAP_WB);
	if (IS_ERR(w->map)) {
		i915_gem_object_put(obj);
		return PTR_ERR(w->map);
	}

	vma = i915_gem_object_ggtt_pin_ww(obj, NULL, NULL, 0, 0, 0);
	if (IS_ERR(vma)) {
		i915_gem_object_put(obj);
		return PTR_ERR(vma);
	}

	w->vma = vma;
	w->addr = i915_ggtt_offset(vma);
	return 0;
}

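/*
 * Start a watcher request on its own context (with a caller-chosen ring
 * size), then drop the timeline lock so the test can keep appending
 * emit_read_hwsp() packets to it from the main loop. The lock is reacquired,
 * and the request finally submitted, in check_watcher().
 */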
static int create_watcher(struct hwsp_watcher *w,
			  struct intel_engine_cs *engine,
			  int ringsz)
{
	struct intel_context *ce;
	struct intel_timeline *tl;

	ce = intel_context_create(engine);
	if (IS_ERR(ce))
		return PTR_ERR(ce);

	ce->ring = __intel_context_ring_size(ringsz);
	w->rq = intel_context_create_request(ce);
	intel_context_put(ce);
	if (IS_ERR(w->rq))
		return PTR_ERR(w->rq);

	w->addr = i915_ggtt_offset(w->vma);
	tl = w->rq->context->timeline;

	/* some light mutex juggling required; think co-routines */
	lockdep_unpin_lock(&tl->mutex, w->rq->cookie);
	mutex_unlock(&tl->mutex);

	return 0;
}

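/*
 * Submit the watcher request and, once it has completed, walk the recorded
 * (seqno, hwsp) pairs, checking each with @op; e.g. the "before" watcher
 * must have sampled the HWSP while it was still below the request's seqno.
 */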
static int check_watcher(struct hwsp_watcher *w, const char *name,
			 bool (*op)(u32 hwsp, u32 seqno))
{
	struct i915_request *rq = fetch_and_zero(&w->rq);
	struct intel_timeline *tl = rq->context->timeline;
	u32 offset, end;
	int err;

	GEM_BUG_ON(w->addr - i915_ggtt_offset(w->vma) > w->vma->size);

	i915_request_get(rq);
	mutex_lock(&tl->mutex);
	rq->cookie = lockdep_pin_lock(&tl->mutex);
	i915_request_add(rq);

	if (i915_request_wait(rq, 0, HZ) < 0) {
		err = -ETIME;
		goto out;
	}

	err = 0;
	offset = 0;
	end = (w->addr - i915_ggtt_offset(w->vma)) / sizeof(*w->map);
	while (offset < end) {
		if (!op(w->map[offset + 1], w->map[offset])) {
			pr_err("Watcher '%s' found HWSP value %x for seqno %x\n",
			       name, w->map[offset + 1], w->map[offset]);
			err = -EINVAL;
		}

		offset += 2;
	}

out:
	i915_request_put(rq);
	return err;
}

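/*
 * Flush any watcher request that was never handed to check_watcher() (the
 * error paths), then release the results buffer.
 */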
static void cleanup_watcher(struct hwsp_watcher *w)
{
	if (w->rq) {
		struct intel_timeline *tl = w->rq->context->timeline;

		mutex_lock(&tl->mutex);
		w->rq->cookie = lockdep_pin_lock(&tl->mutex);

		i915_request_add(w->rq);
	}

	i915_vma_unpin_and_release(&w->vma, I915_VMA_RELEASE_MAP);
}

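/*
 * Retire completed requests on the timeline, oldest first, and report
 * whether the timeline is now idle.
 */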
static bool retire_requests(struct intel_timeline *tl)
{
	struct i915_request *rq, *rn;

	mutex_lock(&tl->mutex);
	list_for_each_entry_safe(rq, rn, &tl->requests, link)
		if (!i915_request_retire(rq))
			break;
	mutex_unlock(&tl->mutex);

	return !i915_active_fence_isset(&tl->last_request);
}

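/*
 * Keep submitting empty requests on @rq's context until the timeline's
 * seqno wraps around below the original request's seqno, then return a
 * fresh post-wrap request (with a reference) for the caller to wait on.
 */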
static struct i915_request *wrap_timeline(struct i915_request *rq)
{
	struct intel_context *ce = rq->context;
	struct intel_timeline *tl = ce->timeline;
	u32 seqno = rq->fence.seqno;

	while (tl->seqno >= seqno) { /* Cause a wrap */
		i915_request_put(rq);
		rq = intel_context_create_request(ce);
		if (IS_ERR(rq))
			return rq;

		i915_request_get(rq);
		i915_request_add(rq);
	}

	i915_request_put(rq);
	rq = intel_context_create_request(ce);
	if (IS_ERR(rq))
		return rq;

	i915_request_get(rq);
	i915_request_add(rq);

	return rq;
}

static int live_hwsp_read(void *arg)
{
	struct intel_gt *gt = arg;
	struct hwsp_watcher watcher[2] = {};
	struct intel_engine_cs *engine;
	struct intel_timeline *tl;
	enum intel_engine_id id;
	int err = 0;
	int i;

	/*
	 * If we take a reference to the HWSP for reading on the GPU, that
	 * read may be arbitrarily delayed (either by a foreign fence or
	 * priority saturation) and a wrap can happen within 30 minutes.
	 * When the GPU read is finally submitted it should be correct,
	 * even across multiple wraps.
	 */

	if (INTEL_GEN(gt->i915) < 8) /* CS convenience [SRM/LRM] */
		return 0;

	tl = intel_timeline_create(gt);
	if (IS_ERR(tl))
		return PTR_ERR(tl);

	if (!tl->hwsp_cacheline)
		goto out_free;

	for (i = 0; i < ARRAY_SIZE(watcher); i++) {
		err = setup_watcher(&watcher[i], gt);
		if (err)
			goto out;
	}

	for_each_engine(engine, gt, id) {
		struct intel_context *ce;
		unsigned long count = 0;
		IGT_TIMEOUT(end_time);

		/* Create a request we can use for remote reading of the HWSP */
		err = create_watcher(&watcher[1], engine, SZ_512K);
		if (err)
			goto out;

		do {
			struct i915_sw_fence *submit;
			struct i915_request *rq;
			u32 hwsp;

			submit = heap_fence_create(GFP_KERNEL);
			if (!submit) {
				err = -ENOMEM;
				goto out;
			}

			err = create_watcher(&watcher[0], engine, SZ_4K);
			if (err)
				goto out;

			ce = intel_context_create(engine);
			if (IS_ERR(ce)) {
				err = PTR_ERR(ce);
				goto out;
			}

			/* Skip to the end, saving 30 minutes of nops */
			tl->seqno = -10u + 2 * (count & 3);
			WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);
			ce->timeline = intel_timeline_get(tl);

			rq = intel_context_create_request(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				intel_context_put(ce);
				goto out;
			}

			err = i915_sw_fence_await_dma_fence(&rq->submit,
							    &watcher[0].rq->fence, 0,
							    GFP_KERNEL);
			if (err < 0) {
				i915_request_add(rq);
				intel_context_put(ce);
				goto out;
			}

			mutex_lock(&watcher[0].rq->context->timeline->mutex);
			err = intel_timeline_read_hwsp(rq, watcher[0].rq, &hwsp);
			if (err == 0)
				err = emit_read_hwsp(watcher[0].rq, /* before */
						     rq->fence.seqno, hwsp,
						     &watcher[0].addr);
			mutex_unlock(&watcher[0].rq->context->timeline->mutex);
			if (err) {
				i915_request_add(rq);
				intel_context_put(ce);
				goto out;
			}

			mutex_lock(&watcher[1].rq->context->timeline->mutex);
			err = intel_timeline_read_hwsp(rq, watcher[1].rq, &hwsp);
			if (err == 0)
				err = emit_read_hwsp(watcher[1].rq, /* after */
						     rq->fence.seqno, hwsp,
						     &watcher[1].addr);
			mutex_unlock(&watcher[1].rq->context->timeline->mutex);
			if (err) {
				i915_request_add(rq);
				intel_context_put(ce);
				goto out;
			}

			i915_request_get(rq);
			i915_request_add(rq);

			rq = wrap_timeline(rq);
			intel_context_put(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				goto out;
			}

			err = i915_sw_fence_await_dma_fence(&watcher[1].rq->submit,
							    &rq->fence, 0,
							    GFP_KERNEL);
			if (err < 0) {
				i915_request_put(rq);
				goto out;
			}

			err = check_watcher(&watcher[0], "before", cmp_lt);
			i915_sw_fence_commit(submit);
			heap_fence_put(submit);
			if (err) {
				i915_request_put(rq);
				goto out;
			}
			count++;

			/* Flush the timeline before manually wrapping again */
			if (i915_request_wait(rq,
					      I915_WAIT_INTERRUPTIBLE,
					      HZ) < 0) {
				err = -ETIME;
				i915_request_put(rq);
				goto out;
			}
			retire_requests(tl);
			i915_request_put(rq);

			/* Single requests are limited to half a ring at most */
			if (8 * watcher[1].rq->ring->emit >
			    3 * watcher[1].rq->ring->size)
				break;

		} while (!__igt_timeout(end_time, NULL));
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, 0xdeadbeef);

		pr_info("%s: simulated %lu wraps\n", engine->name, count);
		err = check_watcher(&watcher[1], "after", cmp_gte);
		if (err)
			goto out;
	}

out:
	for (i = 0; i < ARRAY_SIZE(watcher); i++)
		cleanup_watcher(&watcher[i]);

	if (igt_flush_test(gt->i915))
		err = -EIO;

out_free:
	intel_timeline_put(tl);
	return err;
}

static int live_hwsp_rollover_kernel(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Run the host for long enough, and even the kernel context will
	 * see a seqno rollover.
	 */

	for_each_engine(engine, gt, id) {
		struct intel_context *ce = engine->kernel_context;
		struct intel_timeline *tl = ce->timeline;
		struct i915_request *rq[3] = {};
		int i;

		st_engine_heartbeat_disable(engine);
		if (intel_gt_wait_for_idle(gt, HZ / 2)) {
			err = -EIO;
			goto out;
		}

		GEM_BUG_ON(i915_active_fence_isset(&tl->last_request));
		tl->seqno = 0;
		timeline_rollback(tl);
		timeline_rollback(tl);
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			struct i915_request *this;

			this = i915_request_create(ce);
			if (IS_ERR(this)) {
				err = PTR_ERR(this);
				goto out;
			}

			pr_debug("%s: create fence.seqno:%d\n",
				 engine->name,
				 lower_32_bits(this->fence.seqno));

			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);

			rq[i] = i915_request_get(this);
			i915_request_add(this);
		}

		/* We expected a wrap! */
		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);

		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
			pr_err("Wait for timeline wrap timed out!\n");
			err = -EIO;
			goto out;
		}

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			if (!i915_request_completed(rq[i])) {
				pr_err("Pre-wrap request not completed!\n");
				err = -EINVAL;
				goto out;
			}
		}

out:
		for (i = 0; i < ARRAY_SIZE(rq); i++)
			i915_request_put(rq[i]);
		st_engine_heartbeat_enable(engine);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	return err;
}

static int live_hwsp_rollover_user(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	int err = 0;

	/*
	 * Simulate a long running user context, and force the seqno wrap
	 * on the user's timeline.
	 */

	for_each_engine(engine, gt, id) {
		struct i915_request *rq[3] = {};
		struct intel_timeline *tl;
		struct intel_context *ce;
		int i;

		ce = intel_context_create(engine);
		if (IS_ERR(ce))
			return PTR_ERR(ce);

		err = intel_context_alloc_state(ce);
		if (err)
			goto out;

		tl = ce->timeline;
		if (!tl->has_initial_breadcrumb || !tl->hwsp_cacheline)
			goto out;

		timeline_rollback(tl);
		timeline_rollback(tl);
		WRITE_ONCE(*(u32 *)tl->hwsp_seqno, tl->seqno);

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			struct i915_request *this;

			this = intel_context_create_request(ce);
			if (IS_ERR(this)) {
				err = PTR_ERR(this);
				goto out;
			}

			pr_debug("%s: create fence.seqno:%d\n",
				 engine->name,
				 lower_32_bits(this->fence.seqno));

			GEM_BUG_ON(rcu_access_pointer(this->timeline) != tl);

			rq[i] = i915_request_get(this);
			i915_request_add(this);
		}

		/* We expected a wrap! */
		GEM_BUG_ON(rq[2]->fence.seqno > rq[0]->fence.seqno);

		if (i915_request_wait(rq[2], 0, HZ / 5) < 0) {
			pr_err("Wait for timeline wrap timed out!\n");
			err = -EIO;
			goto out;
		}

		for (i = 0; i < ARRAY_SIZE(rq); i++) {
			if (!i915_request_completed(rq[i])) {
				pr_err("Pre-wrap request not completed!\n");
				err = -EINVAL;
				goto out;
			}
		}

out:
		for (i = 0; i < ARRAY_SIZE(rq); i++)
			i915_request_put(rq[i]);
		intel_context_put(ce);
		if (err)
			break;
	}

	if (igt_flush_test(gt->i915))
		err = -EIO;

	return err;
}

static int live_hwsp_recycle(void *arg)
{
	struct intel_gt *gt = arg;
	struct intel_engine_cs *engine;
	enum intel_engine_id id;
	unsigned long count;
	int err = 0;

	/*
	 * Check seqno writes into one timeline at a time. We expect to
	 * recycle the breadcrumb slot between iterations and want to
	 * confuse neither ourselves nor the GPU.
	 */

	count = 0;
	for_each_engine(engine, gt, id) {
		IGT_TIMEOUT(end_time);

		if (!intel_engine_can_store_dword(engine))
			continue;

		intel_engine_pm_get(engine);

		do {
			struct intel_timeline *tl;
			struct i915_request *rq;

			tl = checked_intel_timeline_create(gt);
			if (IS_ERR(tl)) {
				err = PTR_ERR(tl);
				break;
			}

			rq = tl_write(tl, engine, count);
			if (IS_ERR(rq)) {
				intel_timeline_put(tl);
				err = PTR_ERR(rq);
				break;
			}

			if (i915_request_wait(rq, 0, HZ / 5) < 0) {
				pr_err("Wait for timeline writes timed out!\n");
				i915_request_put(rq);
				intel_timeline_put(tl);
				err = -EIO;
				break;
			}

			if (READ_ONCE(*tl->hwsp_seqno) != count) {
				GEM_TRACE_ERR("Invalid seqno:%lu stored in timeline %llu @ %x found 0x%x\n",
					      count, tl->fence_context,
					      tl->hwsp_offset, *tl->hwsp_seqno);
				GEM_TRACE_DUMP();
				err = -EINVAL;
			}

			i915_request_put(rq);
			intel_timeline_put(tl);
			count++;

			if (err)
				break;
		} while (!__igt_timeout(end_time, NULL));

		intel_engine_pm_put(engine);
		if (err)
			break;
	}

	return err;
}

int intel_timeline_live_selftests(struct drm_i915_private *i915)
{
	static const struct i915_subtest tests[] = {
		SUBTEST(live_hwsp_recycle),
		SUBTEST(live_hwsp_engine),
		SUBTEST(live_hwsp_alternate),
		SUBTEST(live_hwsp_wrap),
		SUBTEST(live_hwsp_read),
		SUBTEST(live_hwsp_rollover_kernel),
		SUBTEST(live_hwsp_rollover_user),
	};

	if (intel_gt_is_wedged(&i915->gt))
		return 0;

	return intel_gt_live_subtests(tests, &i915->gt);
}