xref: /openbmc/linux/drivers/gpu/drm/i915/selftests/i915_request.c (revision f8a11425075ff11b4b5784f077cb84f3d2dfb3f0)
1 /*
2  * Copyright © 2016 Intel Corporation
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice (including the next
12  * paragraph) shall be included in all copies or substantial portions of the
13  * Software.
14  *
15  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
16  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
18  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
19  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
20  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
21  * IN THE SOFTWARE.
22  *
23  */
24 
25 #include <linux/prime_numbers.h>
26 #include <linux/pm_qos.h>
27 #include <linux/sort.h>
28 
29 #include "gem/i915_gem_pm.h"
30 #include "gem/selftests/mock_context.h"
31 
32 #include "gt/intel_engine_heartbeat.h"
33 #include "gt/intel_engine_pm.h"
34 #include "gt/intel_engine_user.h"
35 #include "gt/intel_gt.h"
36 #include "gt/intel_gt_clock_utils.h"
37 #include "gt/intel_gt_requests.h"
38 #include "gt/selftest_engine_heartbeat.h"
39 
40 #include "i915_random.h"
41 #include "i915_selftest.h"
42 #include "igt_flush_test.h"
43 #include "igt_live_test.h"
44 #include "igt_spinner.h"
45 #include "lib_sw_fence.h"
46 
47 #include "mock_drm.h"
48 #include "mock_gem_device.h"
49 
50 static unsigned int num_uabi_engines(struct drm_i915_private *i915)
51 {
52 	struct intel_engine_cs *engine;
53 	unsigned int count;
54 
55 	count = 0;
56 	for_each_uabi_engine(engine, i915)
57 		count++;
58 
59 	return count;
60 }
61 
62 static struct intel_engine_cs *rcs0(struct drm_i915_private *i915)
63 {
64 	return intel_engine_lookup_user(i915, I915_ENGINE_CLASS_RENDER, 0);
65 }
66 
67 static int igt_add_request(void *arg)
68 {
69 	struct drm_i915_private *i915 = arg;
70 	struct i915_request *request;
71 
72 	/* Basic preliminary test to create a request and let it loose! */
73 
74 	request = mock_request(rcs0(i915)->kernel_context, HZ / 10);
75 	if (!request)
76 		return -ENOMEM;
77 
78 	i915_request_add(request);
79 
80 	return 0;
81 }
82 
83 static int igt_wait_request(void *arg)
84 {
85 	const long T = HZ / 4;
86 	struct drm_i915_private *i915 = arg;
87 	struct i915_request *request;
88 	int err = -EINVAL;
89 
90 	/* Submit a request, then wait upon it */
91 
92 	request = mock_request(rcs0(i915)->kernel_context, T);
93 	if (!request)
94 		return -ENOMEM;
95 
96 	i915_request_get(request);
97 
98 	if (i915_request_wait(request, 0, 0) != -ETIME) {
99 		pr_err("request wait (busy query) succeeded (expected timeout before submit!)\n");
100 		goto out_request;
101 	}
102 
103 	if (i915_request_wait(request, 0, T) != -ETIME) {
104 		pr_err("request wait succeeded (expected timeout before submit!)\n");
105 		goto out_request;
106 	}
107 
108 	if (i915_request_completed(request)) {
109 		pr_err("request completed before submit!!\n");
110 		goto out_request;
111 	}
112 
113 	i915_request_add(request);
114 
115 	if (i915_request_wait(request, 0, 0) != -ETIME) {
116 		pr_err("request wait (busy query) succeeded (expected timeout after submit!)\n");
117 		goto out_request;
118 	}
119 
120 	if (i915_request_completed(request)) {
121 		pr_err("request completed immediately!\n");
122 		goto out_request;
123 	}
124 
125 	if (i915_request_wait(request, 0, T / 2) != -ETIME) {
126 		pr_err("request wait succeeded (expected timeout!)\n");
127 		goto out_request;
128 	}
129 
130 	if (i915_request_wait(request, 0, T) == -ETIME) {
131 		pr_err("request wait timed out!\n");
132 		goto out_request;
133 	}
134 
135 	if (!i915_request_completed(request)) {
136 		pr_err("request not complete after waiting!\n");
137 		goto out_request;
138 	}
139 
140 	if (i915_request_wait(request, 0, T) == -ETIME) {
141 		pr_err("request wait timed out when already complete!\n");
142 		goto out_request;
143 	}
144 
145 	err = 0;
146 out_request:
147 	i915_request_put(request);
148 	mock_device_flush(i915);
149 	return err;
150 }
151 
152 static int igt_fence_wait(void *arg)
153 {
154 	const long T = HZ / 4;
155 	struct drm_i915_private *i915 = arg;
156 	struct i915_request *request;
157 	int err = -EINVAL;
158 
159 	/* Submit a request, treat it as a fence and wait upon it */
160 
161 	request = mock_request(rcs0(i915)->kernel_context, T);
162 	if (!request)
163 		return -ENOMEM;
164 
165 	if (dma_fence_wait_timeout(&request->fence, false, T) != -ETIME) {
166 		pr_err("fence wait success before submit (expected timeout)!\n");
167 		goto out;
168 	}
169 
170 	i915_request_add(request);
171 
172 	if (dma_fence_is_signaled(&request->fence)) {
173 		pr_err("fence signaled immediately!\n");
174 		goto out;
175 	}
176 
177 	if (dma_fence_wait_timeout(&request->fence, false, T / 2) != -ETIME) {
178 		pr_err("fence wait success after submit (expected timeout)!\n");
179 		goto out;
180 	}
181 
182 	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
183 		pr_err("fence wait timed out (expected success)!\n");
184 		goto out;
185 	}
186 
187 	if (!dma_fence_is_signaled(&request->fence)) {
188 		pr_err("fence unsignaled after waiting!\n");
189 		goto out;
190 	}
191 
192 	if (dma_fence_wait_timeout(&request->fence, false, T) <= 0) {
193 		pr_err("fence wait timed out when complete (expected success)!\n");
194 		goto out;
195 	}
196 
197 	err = 0;
198 out:
199 	mock_device_flush(i915);
200 	return err;
201 }
202 
203 static int igt_request_rewind(void *arg)
204 {
205 	struct drm_i915_private *i915 = arg;
206 	struct i915_request *request, *vip;
207 	struct i915_gem_context *ctx[2];
208 	struct intel_context *ce;
209 	int err = -EINVAL;
210 
211 	ctx[0] = mock_context(i915, "A");
212 
213 	ce = i915_gem_context_get_engine(ctx[0], RCS0);
214 	GEM_BUG_ON(IS_ERR(ce));
215 	request = mock_request(ce, 2 * HZ);
216 	intel_context_put(ce);
217 	if (!request) {
218 		err = -ENOMEM;
219 		goto err_context_0;
220 	}
221 
222 	i915_request_get(request);
223 	i915_request_add(request);
224 
225 	ctx[1] = mock_context(i915, "B");
226 
227 	ce = i915_gem_context_get_engine(ctx[1], RCS0);
228 	GEM_BUG_ON(IS_ERR(ce));
229 	vip = mock_request(ce, 0);
230 	intel_context_put(ce);
231 	if (!vip) {
232 		err = -ENOMEM;
233 		goto err_context_1;
234 	}
235 
236 	/* Simulate preemption by manual reordering */
237 	if (!mock_cancel_request(request)) {
238 		pr_err("failed to cancel request (already executed)!\n");
239 		i915_request_add(vip);
240 		goto err_context_1;
241 	}
242 	i915_request_get(vip);
243 	i915_request_add(vip);
244 	rcu_read_lock();
245 	request->engine->submit_request(request);
246 	rcu_read_unlock();
247 
248 
249 	if (i915_request_wait(vip, 0, HZ) == -ETIME) {
250 		pr_err("timed out waiting for high priority request\n");
251 		goto err;
252 	}
253 
254 	if (i915_request_completed(request)) {
255 		pr_err("low priority request already completed\n");
256 		goto err;
257 	}
258 
259 	err = 0;
260 err:
261 	i915_request_put(vip);
262 err_context_1:
263 	mock_context_close(ctx[1]);
264 	i915_request_put(request);
265 err_context_0:
266 	mock_context_close(ctx[0]);
267 	mock_device_flush(i915);
268 	return err;
269 }
270 
/*
 * Shared state for the breadcrumbs smoketest; one instance is handed to every
 * worker thread running __igt_breadcrumbs_smoketest().
 */
struct smoketest {
	/* Engine whose legacy_idx selects the intel_context in each GEM context */
	struct intel_engine_cs *engine;
	/* Array of ncontexts GEM contexts the workers pick from */
	struct i915_gem_context **contexts;
	/* Totals accumulated by each worker just before it exits */
	atomic_long_t num_waits, num_fences;
	/* Number of entries in contexts[], and the cap on requests per pass */
	int ncontexts, max_batch;
	/* Request constructor; callers test its result with IS_ERR() */
	struct i915_request *(*request_alloc)(struct intel_context *ce);
};
278 
279 static struct i915_request *
280 __mock_request_alloc(struct intel_context *ce)
281 {
282 	return mock_request(ce, 0);
283 }
284 
/*
 * smoketest.request_alloc backend for real hardware: hand back whatever
 * intel_context_create_request() produces for @ce (request or error pointer).
 */
static struct i915_request *
__live_request_alloc(struct intel_context *ce)
{
	struct i915_request *rq = intel_context_create_request(ce);

	return rq;
}
290 
/*
 * Worker body for the breadcrumbs smoketest, run as a kthread with a
 * struct smoketest as @arg.
 *
 * Until kthread_should_stop(), each pass builds a batch of requests gated on
 * a shared "submit" fence, collects their dma-fences into a "wait" fence,
 * releases the batch and checks that every request ends up signaled.
 *
 * Returns 0 on success or a negative error code; the counts of completed
 * passes and fences are added to t->num_waits / t->num_fences on exit
 * (skipped when the initial allocations fail).
 */
static int __igt_breadcrumbs_smoketest(void *arg)
{
	struct smoketest *t = arg;
	const unsigned int max_batch = min(t->ncontexts, t->max_batch) - 1;
	const unsigned int total = 4 * t->ncontexts + 1;
	unsigned int num_waits = 0, num_fences = 0;
	struct i915_request **requests;
	I915_RND_STATE(prng);
	unsigned int *order;
	int err = 0;

	/*
	 * A very simple test to catch the most egregious of list handling bugs.
	 *
	 * At its heart, we simply create oodles of requests running across
	 * multiple kthreads and enable signaling on them, for the sole purpose
	 * of stressing our breadcrumb handling. The only inspection we do is
	 * that the fences were marked as signaled.
	 */

	requests = kcalloc(total, sizeof(*requests), GFP_KERNEL);
	if (!requests)
		return -ENOMEM;

	order = i915_random_order(total, &prng);
	if (!order) {
		err = -ENOMEM;
		goto out_requests;
	}

	while (!kthread_should_stop()) {
		struct i915_sw_fence *submit, *wait;
		unsigned int n, count;

		/* Every request in this pass awaits 'submit' before it may run */
		submit = heap_fence_create(GFP_KERNEL);
		if (!submit) {
			err = -ENOMEM;
			break;
		}

		/* 'wait' collects the dma-fence of every request in this pass */
		wait = heap_fence_create(GFP_KERNEL);
		if (!wait) {
			i915_sw_fence_commit(submit);
			heap_fence_put(submit);
			err = -ENOMEM;
			break;
		}

		/* Reshuffle the context order and pick this pass's batch size */
		i915_random_reorder(order, total, &prng);
		count = 1 + i915_prandom_u32_max_state(max_batch, &prng);

		for (n = 0; n < count; n++) {
			struct i915_gem_context *ctx =
				t->contexts[order[n] % t->ncontexts];
			struct i915_request *rq;
			struct intel_context *ce;

			ce = i915_gem_context_get_engine(ctx, t->engine->legacy_idx);
			GEM_BUG_ON(IS_ERR(ce));
			rq = t->request_alloc(ce);
			intel_context_put(ce);
			if (IS_ERR(rq)) {
				err = PTR_ERR(rq);
				/* Only the first n requests exist */
				count = n;
				break;
			}

			err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
							       submit,
							       GFP_KERNEL);

			/* The request is added even if the await failed */
			requests[n] = i915_request_get(rq);
			i915_request_add(rq);

			if (err >= 0)
				err = i915_sw_fence_await_dma_fence(wait,
								    &rq->fence,
								    0,
								    GFP_KERNEL);

			if (err < 0) {
				/* Drop our reference and exclude rq from the checks below */
				i915_request_put(rq);
				count = n;
				break;
			}
		}

		/* Release the batch, then arm the collective wait */
		i915_sw_fence_commit(submit);
		i915_sw_fence_commit(wait);

		if (!wait_event_timeout(wait->wait,
					i915_sw_fence_done(wait),
					5 * HZ)) {
			struct i915_request *rq = requests[count - 1];

			pr_err("waiting for %d/%d fences (last %llx:%lld) on %s timed out!\n",
			       atomic_read(&wait->pending), count,
			       rq->fence.context, rq->fence.seqno,
			       t->engine->name);
			GEM_TRACE_DUMP();

			/* Wedge the GT and wait for the collective fence before cleaning up */
			intel_gt_set_wedged(t->engine->gt);
			GEM_BUG_ON(!i915_request_completed(rq));
			i915_sw_fence_wait(wait);
			err = -EIO;
		}

		/* Each tracked request must now be signaled; drop our references */
		for (n = 0; n < count; n++) {
			struct i915_request *rq = requests[n];

			if (!test_bit(DMA_FENCE_FLAG_SIGNALED_BIT,
				      &rq->fence.flags)) {
				pr_err("%llu:%llu was not signaled!\n",
				       rq->fence.context, rq->fence.seqno);
				err = -EINVAL;
			}

			i915_request_put(rq);
		}

		heap_fence_put(wait);
		heap_fence_put(submit);

		if (err < 0)
			break;

		/* Only fully successful passes count towards the statistics */
		num_fences += count;
		num_waits++;

		cond_resched();
	}

	atomic_long_add(num_fences, &t->num_fences);
	atomic_long_add(num_waits, &t->num_waits);

	kfree(order);
out_requests:
	kfree(requests);
	return err;
}
431 
432 static int mock_breadcrumbs_smoketest(void *arg)
433 {
434 	struct drm_i915_private *i915 = arg;
435 	struct smoketest t = {
436 		.engine = rcs0(i915),
437 		.ncontexts = 1024,
438 		.max_batch = 1024,
439 		.request_alloc = __mock_request_alloc
440 	};
441 	unsigned int ncpus = num_online_cpus();
442 	struct task_struct **threads;
443 	unsigned int n;
444 	int ret = 0;
445 
446 	/*
447 	 * Smoketest our breadcrumb/signal handling for requests across multiple
448 	 * threads. A very simple test to only catch the most egregious of bugs.
449 	 * See __igt_breadcrumbs_smoketest();
450 	 */
451 
452 	threads = kcalloc(ncpus, sizeof(*threads), GFP_KERNEL);
453 	if (!threads)
454 		return -ENOMEM;
455 
456 	t.contexts = kcalloc(t.ncontexts, sizeof(*t.contexts), GFP_KERNEL);
457 	if (!t.contexts) {
458 		ret = -ENOMEM;
459 		goto out_threads;
460 	}
461 
462 	for (n = 0; n < t.ncontexts; n++) {
463 		t.contexts[n] = mock_context(t.engine->i915, "mock");
464 		if (!t.contexts[n]) {
465 			ret = -ENOMEM;
466 			goto out_contexts;
467 		}
468 	}
469 
470 	for (n = 0; n < ncpus; n++) {
471 		threads[n] = kthread_run(__igt_breadcrumbs_smoketest,
472 					 &t, "igt/%d", n);
473 		if (IS_ERR(threads[n])) {
474 			ret = PTR_ERR(threads[n]);
475 			ncpus = n;
476 			break;
477 		}
478 
479 		get_task_struct(threads[n]);
480 	}
481 
482 	yield(); /* start all threads before we begin */
483 	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
484 
485 	for (n = 0; n < ncpus; n++) {
486 		int err;
487 
488 		err = kthread_stop(threads[n]);
489 		if (err < 0 && !ret)
490 			ret = err;
491 
492 		put_task_struct(threads[n]);
493 	}
494 	pr_info("Completed %lu waits for %lu fence across %d cpus\n",
495 		atomic_long_read(&t.num_waits),
496 		atomic_long_read(&t.num_fences),
497 		ncpus);
498 
499 out_contexts:
500 	for (n = 0; n < t.ncontexts; n++) {
501 		if (!t.contexts[n])
502 			break;
503 		mock_context_close(t.contexts[n]);
504 	}
505 	kfree(t.contexts);
506 out_threads:
507 	kfree(threads);
508 	return ret;
509 }
510 
511 int i915_request_mock_selftests(void)
512 {
513 	static const struct i915_subtest tests[] = {
514 		SUBTEST(igt_add_request),
515 		SUBTEST(igt_wait_request),
516 		SUBTEST(igt_fence_wait),
517 		SUBTEST(igt_request_rewind),
518 		SUBTEST(mock_breadcrumbs_smoketest),
519 	};
520 	struct drm_i915_private *i915;
521 	intel_wakeref_t wakeref;
522 	int err = 0;
523 
524 	i915 = mock_gem_device();
525 	if (!i915)
526 		return -ENOMEM;
527 
528 	with_intel_runtime_pm(&i915->runtime_pm, wakeref)
529 		err = i915_subtests(tests, i915);
530 
531 	mock_destroy_device(i915);
532 
533 	return err;
534 }
535 
536 static int live_nop_request(void *arg)
537 {
538 	struct drm_i915_private *i915 = arg;
539 	struct intel_engine_cs *engine;
540 	struct igt_live_test t;
541 	int err = -ENODEV;
542 
543 	/*
544 	 * Submit various sized batches of empty requests, to each engine
545 	 * (individually), and wait for the batch to complete. We can check
546 	 * the overhead of submitting requests to the hardware.
547 	 */
548 
549 	for_each_uabi_engine(engine, i915) {
550 		unsigned long n, prime;
551 		IGT_TIMEOUT(end_time);
552 		ktime_t times[2] = {};
553 
554 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
555 		if (err)
556 			return err;
557 
558 		intel_engine_pm_get(engine);
559 		for_each_prime_number_from(prime, 1, 8192) {
560 			struct i915_request *request = NULL;
561 
562 			times[1] = ktime_get_raw();
563 
564 			for (n = 0; n < prime; n++) {
565 				i915_request_put(request);
566 				request = i915_request_create(engine->kernel_context);
567 				if (IS_ERR(request))
568 					return PTR_ERR(request);
569 
570 				/*
571 				 * This space is left intentionally blank.
572 				 *
573 				 * We do not actually want to perform any
574 				 * action with this request, we just want
575 				 * to measure the latency in allocation
576 				 * and submission of our breadcrumbs -
577 				 * ensuring that the bare request is sufficient
578 				 * for the system to work (i.e. proper HEAD
579 				 * tracking of the rings, interrupt handling,
580 				 * etc). It also gives us the lowest bounds
581 				 * for latency.
582 				 */
583 
584 				i915_request_get(request);
585 				i915_request_add(request);
586 			}
587 			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
588 			i915_request_put(request);
589 
590 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
591 			if (prime == 1)
592 				times[0] = times[1];
593 
594 			if (__igt_timeout(end_time, NULL))
595 				break;
596 		}
597 		intel_engine_pm_put(engine);
598 
599 		err = igt_live_test_end(&t);
600 		if (err)
601 			return err;
602 
603 		pr_info("Request latencies on %s: 1 = %lluns, %lu = %lluns\n",
604 			engine->name,
605 			ktime_to_ns(times[0]),
606 			prime, div64_u64(ktime_to_ns(times[1]), prime));
607 	}
608 
609 	return err;
610 }
611 
612 static int __cancel_inactive(struct intel_engine_cs *engine)
613 {
614 	struct intel_context *ce;
615 	struct igt_spinner spin;
616 	struct i915_request *rq;
617 	int err = 0;
618 
619 	if (igt_spinner_init(&spin, engine->gt))
620 		return -ENOMEM;
621 
622 	ce = intel_context_create(engine);
623 	if (IS_ERR(ce)) {
624 		err = PTR_ERR(ce);
625 		goto out_spin;
626 	}
627 
628 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
629 	if (IS_ERR(rq)) {
630 		err = PTR_ERR(rq);
631 		goto out_ce;
632 	}
633 
634 	pr_debug("%s: Cancelling inactive request\n", engine->name);
635 	i915_request_cancel(rq, -EINTR);
636 	i915_request_get(rq);
637 	i915_request_add(rq);
638 
639 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
640 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
641 
642 		pr_err("%s: Failed to cancel inactive request\n", engine->name);
643 		intel_engine_dump(engine, &p, "%s\n", engine->name);
644 		err = -ETIME;
645 		goto out_rq;
646 	}
647 
648 	if (rq->fence.error != -EINTR) {
649 		pr_err("%s: fence not cancelled (%u)\n",
650 		       engine->name, rq->fence.error);
651 		err = -EINVAL;
652 	}
653 
654 out_rq:
655 	i915_request_put(rq);
656 out_ce:
657 	intel_context_put(ce);
658 out_spin:
659 	igt_spinner_fini(&spin);
660 	if (err)
661 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
662 	return err;
663 }
664 
665 static int __cancel_active(struct intel_engine_cs *engine)
666 {
667 	struct intel_context *ce;
668 	struct igt_spinner spin;
669 	struct i915_request *rq;
670 	int err = 0;
671 
672 	if (igt_spinner_init(&spin, engine->gt))
673 		return -ENOMEM;
674 
675 	ce = intel_context_create(engine);
676 	if (IS_ERR(ce)) {
677 		err = PTR_ERR(ce);
678 		goto out_spin;
679 	}
680 
681 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
682 	if (IS_ERR(rq)) {
683 		err = PTR_ERR(rq);
684 		goto out_ce;
685 	}
686 
687 	pr_debug("%s: Cancelling active request\n", engine->name);
688 	i915_request_get(rq);
689 	i915_request_add(rq);
690 	if (!igt_wait_for_spinner(&spin, rq)) {
691 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
692 
693 		pr_err("Failed to start spinner on %s\n", engine->name);
694 		intel_engine_dump(engine, &p, "%s\n", engine->name);
695 		err = -ETIME;
696 		goto out_rq;
697 	}
698 	i915_request_cancel(rq, -EINTR);
699 
700 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
701 		struct drm_printer p = drm_info_printer(engine->i915->drm.dev);
702 
703 		pr_err("%s: Failed to cancel active request\n", engine->name);
704 		intel_engine_dump(engine, &p, "%s\n", engine->name);
705 		err = -ETIME;
706 		goto out_rq;
707 	}
708 
709 	if (rq->fence.error != -EINTR) {
710 		pr_err("%s: fence not cancelled (%u)\n",
711 		       engine->name, rq->fence.error);
712 		err = -EINVAL;
713 	}
714 
715 out_rq:
716 	i915_request_put(rq);
717 out_ce:
718 	intel_context_put(ce);
719 out_spin:
720 	igt_spinner_fini(&spin);
721 	if (err)
722 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
723 	return err;
724 }
725 
726 static int __cancel_completed(struct intel_engine_cs *engine)
727 {
728 	struct intel_context *ce;
729 	struct igt_spinner spin;
730 	struct i915_request *rq;
731 	int err = 0;
732 
733 	if (igt_spinner_init(&spin, engine->gt))
734 		return -ENOMEM;
735 
736 	ce = intel_context_create(engine);
737 	if (IS_ERR(ce)) {
738 		err = PTR_ERR(ce);
739 		goto out_spin;
740 	}
741 
742 	rq = igt_spinner_create_request(&spin, ce, MI_ARB_CHECK);
743 	if (IS_ERR(rq)) {
744 		err = PTR_ERR(rq);
745 		goto out_ce;
746 	}
747 	igt_spinner_end(&spin);
748 	i915_request_get(rq);
749 	i915_request_add(rq);
750 
751 	if (i915_request_wait(rq, 0, HZ / 5) < 0) {
752 		err = -ETIME;
753 		goto out_rq;
754 	}
755 
756 	pr_debug("%s: Cancelling completed request\n", engine->name);
757 	i915_request_cancel(rq, -EINTR);
758 	if (rq->fence.error) {
759 		pr_err("%s: fence not cancelled (%u)\n",
760 		       engine->name, rq->fence.error);
761 		err = -EINVAL;
762 	}
763 
764 out_rq:
765 	i915_request_put(rq);
766 out_ce:
767 	intel_context_put(ce);
768 out_spin:
769 	igt_spinner_fini(&spin);
770 	if (err)
771 		pr_err("%s: %s error %d\n", __func__, engine->name, err);
772 	return err;
773 }
774 
775 static int live_cancel_request(void *arg)
776 {
777 	struct drm_i915_private *i915 = arg;
778 	struct intel_engine_cs *engine;
779 
780 	/*
781 	 * Check cancellation of requests. We expect to be able to immediately
782 	 * cancel active requests, even if they are currently on the GPU.
783 	 */
784 
785 	for_each_uabi_engine(engine, i915) {
786 		struct igt_live_test t;
787 		int err, err2;
788 
789 		if (!intel_engine_has_preemption(engine))
790 			continue;
791 
792 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
793 		if (err)
794 			return err;
795 
796 		err = __cancel_inactive(engine);
797 		if (err == 0)
798 			err = __cancel_active(engine);
799 		if (err == 0)
800 			err = __cancel_completed(engine);
801 
802 		err2 = igt_live_test_end(&t);
803 		if (err)
804 			return err;
805 		if (err2)
806 			return err2;
807 	}
808 
809 	return 0;
810 }
811 
812 static struct i915_vma *empty_batch(struct drm_i915_private *i915)
813 {
814 	struct drm_i915_gem_object *obj;
815 	struct i915_vma *vma;
816 	u32 *cmd;
817 	int err;
818 
819 	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
820 	if (IS_ERR(obj))
821 		return ERR_CAST(obj);
822 
823 	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WB);
824 	if (IS_ERR(cmd)) {
825 		err = PTR_ERR(cmd);
826 		goto err;
827 	}
828 
829 	*cmd = MI_BATCH_BUFFER_END;
830 
831 	__i915_gem_object_flush_map(obj, 0, 64);
832 	i915_gem_object_unpin_map(obj);
833 
834 	intel_gt_chipset_flush(&i915->gt);
835 
836 	vma = i915_vma_instance(obj, &i915->ggtt.vm, NULL);
837 	if (IS_ERR(vma)) {
838 		err = PTR_ERR(vma);
839 		goto err;
840 	}
841 
842 	err = i915_vma_pin(vma, 0, 0, PIN_USER | PIN_GLOBAL);
843 	if (err)
844 		goto err;
845 
846 	/* Force the wait wait now to avoid including it in the benchmark */
847 	err = i915_vma_sync(vma);
848 	if (err)
849 		goto err_pin;
850 
851 	return vma;
852 
853 err_pin:
854 	i915_vma_unpin(vma);
855 err:
856 	i915_gem_object_put(obj);
857 	return ERR_PTR(err);
858 }
859 
860 static struct i915_request *
861 empty_request(struct intel_engine_cs *engine,
862 	      struct i915_vma *batch)
863 {
864 	struct i915_request *request;
865 	int err;
866 
867 	request = i915_request_create(engine->kernel_context);
868 	if (IS_ERR(request))
869 		return request;
870 
871 	err = engine->emit_bb_start(request,
872 				    batch->node.start,
873 				    batch->node.size,
874 				    I915_DISPATCH_SECURE);
875 	if (err)
876 		goto out_request;
877 
878 	i915_request_get(request);
879 out_request:
880 	i915_request_add(request);
881 	return err ? ERR_PTR(err) : request;
882 }
883 
884 static int live_empty_request(void *arg)
885 {
886 	struct drm_i915_private *i915 = arg;
887 	struct intel_engine_cs *engine;
888 	struct igt_live_test t;
889 	struct i915_vma *batch;
890 	int err = 0;
891 
892 	/*
893 	 * Submit various sized batches of empty requests, to each engine
894 	 * (individually), and wait for the batch to complete. We can check
895 	 * the overhead of submitting requests to the hardware.
896 	 */
897 
898 	batch = empty_batch(i915);
899 	if (IS_ERR(batch))
900 		return PTR_ERR(batch);
901 
902 	for_each_uabi_engine(engine, i915) {
903 		IGT_TIMEOUT(end_time);
904 		struct i915_request *request;
905 		unsigned long n, prime;
906 		ktime_t times[2] = {};
907 
908 		err = igt_live_test_begin(&t, i915, __func__, engine->name);
909 		if (err)
910 			goto out_batch;
911 
912 		intel_engine_pm_get(engine);
913 
914 		/* Warmup / preload */
915 		request = empty_request(engine, batch);
916 		if (IS_ERR(request)) {
917 			err = PTR_ERR(request);
918 			intel_engine_pm_put(engine);
919 			goto out_batch;
920 		}
921 		i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
922 
923 		for_each_prime_number_from(prime, 1, 8192) {
924 			times[1] = ktime_get_raw();
925 
926 			for (n = 0; n < prime; n++) {
927 				i915_request_put(request);
928 				request = empty_request(engine, batch);
929 				if (IS_ERR(request)) {
930 					err = PTR_ERR(request);
931 					intel_engine_pm_put(engine);
932 					goto out_batch;
933 				}
934 			}
935 			i915_request_wait(request, 0, MAX_SCHEDULE_TIMEOUT);
936 
937 			times[1] = ktime_sub(ktime_get_raw(), times[1]);
938 			if (prime == 1)
939 				times[0] = times[1];
940 
941 			if (__igt_timeout(end_time, NULL))
942 				break;
943 		}
944 		i915_request_put(request);
945 		intel_engine_pm_put(engine);
946 
947 		err = igt_live_test_end(&t);
948 		if (err)
949 			goto out_batch;
950 
951 		pr_info("Batch latencies on %s: 1 = %lluns, %lu = %lluns\n",
952 			engine->name,
953 			ktime_to_ns(times[0]),
954 			prime, div64_u64(ktime_to_ns(times[1]), prime));
955 	}
956 
957 out_batch:
958 	i915_vma_unpin(batch);
959 	i915_vma_put(batch);
960 	return err;
961 }
962 
963 static struct i915_vma *recursive_batch(struct drm_i915_private *i915)
964 {
965 	struct drm_i915_gem_object *obj;
966 	const int gen = INTEL_GEN(i915);
967 	struct i915_vma *vma;
968 	u32 *cmd;
969 	int err;
970 
971 	obj = i915_gem_object_create_internal(i915, PAGE_SIZE);
972 	if (IS_ERR(obj))
973 		return ERR_CAST(obj);
974 
975 	vma = i915_vma_instance(obj, i915->gt.vm, NULL);
976 	if (IS_ERR(vma)) {
977 		err = PTR_ERR(vma);
978 		goto err;
979 	}
980 
981 	err = i915_vma_pin(vma, 0, 0, PIN_USER);
982 	if (err)
983 		goto err;
984 
985 	cmd = i915_gem_object_pin_map_unlocked(obj, I915_MAP_WC);
986 	if (IS_ERR(cmd)) {
987 		err = PTR_ERR(cmd);
988 		goto err;
989 	}
990 
991 	if (gen >= 8) {
992 		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8 | 1;
993 		*cmd++ = lower_32_bits(vma->node.start);
994 		*cmd++ = upper_32_bits(vma->node.start);
995 	} else if (gen >= 6) {
996 		*cmd++ = MI_BATCH_BUFFER_START | 1 << 8;
997 		*cmd++ = lower_32_bits(vma->node.start);
998 	} else {
999 		*cmd++ = MI_BATCH_BUFFER_START | MI_BATCH_GTT;
1000 		*cmd++ = lower_32_bits(vma->node.start);
1001 	}
1002 	*cmd++ = MI_BATCH_BUFFER_END; /* terminate early in case of error */
1003 
1004 	__i915_gem_object_flush_map(obj, 0, 64);
1005 	i915_gem_object_unpin_map(obj);
1006 
1007 	intel_gt_chipset_flush(&i915->gt);
1008 
1009 	return vma;
1010 
1011 err:
1012 	i915_gem_object_put(obj);
1013 	return ERR_PTR(err);
1014 }
1015 
1016 static int recursive_batch_resolve(struct i915_vma *batch)
1017 {
1018 	u32 *cmd;
1019 
1020 	cmd = i915_gem_object_pin_map_unlocked(batch->obj, I915_MAP_WC);
1021 	if (IS_ERR(cmd))
1022 		return PTR_ERR(cmd);
1023 
1024 	*cmd = MI_BATCH_BUFFER_END;
1025 
1026 	__i915_gem_object_flush_map(batch->obj, 0, sizeof(*cmd));
1027 	i915_gem_object_unpin_map(batch->obj);
1028 
1029 	intel_gt_chipset_flush(batch->vm->gt);
1030 
1031 	return 0;
1032 }
1033 
1034 static int live_all_engines(void *arg)
1035 {
1036 	struct drm_i915_private *i915 = arg;
1037 	const unsigned int nengines = num_uabi_engines(i915);
1038 	struct intel_engine_cs *engine;
1039 	struct i915_request **request;
1040 	struct igt_live_test t;
1041 	struct i915_vma *batch;
1042 	unsigned int idx;
1043 	int err;
1044 
1045 	/*
1046 	 * Check we can submit requests to all engines simultaneously. We
1047 	 * send a recursive batch to each engine - checking that we don't
1048 	 * block doing so, and that they don't complete too soon.
1049 	 */
1050 
1051 	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1052 	if (!request)
1053 		return -ENOMEM;
1054 
1055 	err = igt_live_test_begin(&t, i915, __func__, "");
1056 	if (err)
1057 		goto out_free;
1058 
1059 	batch = recursive_batch(i915);
1060 	if (IS_ERR(batch)) {
1061 		err = PTR_ERR(batch);
1062 		pr_err("%s: Unable to create batch, err=%d\n", __func__, err);
1063 		goto out_free;
1064 	}
1065 
1066 	i915_vma_lock(batch);
1067 
1068 	idx = 0;
1069 	for_each_uabi_engine(engine, i915) {
1070 		request[idx] = intel_engine_create_kernel_request(engine);
1071 		if (IS_ERR(request[idx])) {
1072 			err = PTR_ERR(request[idx]);
1073 			pr_err("%s: Request allocation failed with err=%d\n",
1074 			       __func__, err);
1075 			goto out_request;
1076 		}
1077 
1078 		err = i915_request_await_object(request[idx], batch->obj, 0);
1079 		if (err == 0)
1080 			err = i915_vma_move_to_active(batch, request[idx], 0);
1081 		GEM_BUG_ON(err);
1082 
1083 		err = engine->emit_bb_start(request[idx],
1084 					    batch->node.start,
1085 					    batch->node.size,
1086 					    0);
1087 		GEM_BUG_ON(err);
1088 		request[idx]->batch = batch;
1089 
1090 		i915_request_get(request[idx]);
1091 		i915_request_add(request[idx]);
1092 		idx++;
1093 	}
1094 
1095 	i915_vma_unlock(batch);
1096 
1097 	idx = 0;
1098 	for_each_uabi_engine(engine, i915) {
1099 		if (i915_request_completed(request[idx])) {
1100 			pr_err("%s(%s): request completed too early!\n",
1101 			       __func__, engine->name);
1102 			err = -EINVAL;
1103 			goto out_request;
1104 		}
1105 		idx++;
1106 	}
1107 
1108 	err = recursive_batch_resolve(batch);
1109 	if (err) {
1110 		pr_err("%s: failed to resolve batch, err=%d\n", __func__, err);
1111 		goto out_request;
1112 	}
1113 
1114 	idx = 0;
1115 	for_each_uabi_engine(engine, i915) {
1116 		long timeout;
1117 
1118 		timeout = i915_request_wait(request[idx], 0,
1119 					    MAX_SCHEDULE_TIMEOUT);
1120 		if (timeout < 0) {
1121 			err = timeout;
1122 			pr_err("%s: error waiting for request on %s, err=%d\n",
1123 			       __func__, engine->name, err);
1124 			goto out_request;
1125 		}
1126 
1127 		GEM_BUG_ON(!i915_request_completed(request[idx]));
1128 		i915_request_put(request[idx]);
1129 		request[idx] = NULL;
1130 		idx++;
1131 	}
1132 
1133 	err = igt_live_test_end(&t);
1134 
1135 out_request:
1136 	idx = 0;
1137 	for_each_uabi_engine(engine, i915) {
1138 		if (request[idx])
1139 			i915_request_put(request[idx]);
1140 		idx++;
1141 	}
1142 	i915_vma_unpin(batch);
1143 	i915_vma_put(batch);
1144 out_free:
1145 	kfree(request);
1146 	return err;
1147 }
1148 
1149 static int live_sequential_engines(void *arg)
1150 {
1151 	struct drm_i915_private *i915 = arg;
1152 	const unsigned int nengines = num_uabi_engines(i915);
1153 	struct i915_request **request;
1154 	struct i915_request *prev = NULL;
1155 	struct intel_engine_cs *engine;
1156 	struct igt_live_test t;
1157 	unsigned int idx;
1158 	int err;
1159 
1160 	/*
1161 	 * Check we can submit requests to all engines sequentially, such
1162 	 * that each successive request waits for the earlier ones. This
1163 	 * tests that we don't execute requests out of order, even though
1164 	 * they are running on independent engines.
1165 	 */
1166 
1167 	request = kcalloc(nengines, sizeof(*request), GFP_KERNEL);
1168 	if (!request)
1169 		return -ENOMEM;
1170 
1171 	err = igt_live_test_begin(&t, i915, __func__, "");
1172 	if (err)
1173 		goto out_free;
1174 
1175 	idx = 0;
1176 	for_each_uabi_engine(engine, i915) {
1177 		struct i915_vma *batch;
1178 
1179 		batch = recursive_batch(i915);
1180 		if (IS_ERR(batch)) {
1181 			err = PTR_ERR(batch);
1182 			pr_err("%s: Unable to create batch for %s, err=%d\n",
1183 			       __func__, engine->name, err);
1184 			goto out_free;
1185 		}
1186 
1187 		i915_vma_lock(batch);
1188 		request[idx] = intel_engine_create_kernel_request(engine);
1189 		if (IS_ERR(request[idx])) {
1190 			err = PTR_ERR(request[idx]);
1191 			pr_err("%s: Request allocation failed for %s with err=%d\n",
1192 			       __func__, engine->name, err);
1193 			goto out_unlock;
1194 		}
1195 
1196 		if (prev) {
1197 			err = i915_request_await_dma_fence(request[idx],
1198 							   &prev->fence);
1199 			if (err) {
1200 				i915_request_add(request[idx]);
1201 				pr_err("%s: Request await failed for %s with err=%d\n",
1202 				       __func__, engine->name, err);
1203 				goto out_unlock;
1204 			}
1205 		}
1206 
1207 		err = i915_request_await_object(request[idx],
1208 						batch->obj, false);
1209 		if (err == 0)
1210 			err = i915_vma_move_to_active(batch, request[idx], 0);
1211 		GEM_BUG_ON(err);
1212 
1213 		err = engine->emit_bb_start(request[idx],
1214 					    batch->node.start,
1215 					    batch->node.size,
1216 					    0);
1217 		GEM_BUG_ON(err);
1218 		request[idx]->batch = batch;
1219 
1220 		i915_request_get(request[idx]);
1221 		i915_request_add(request[idx]);
1222 
1223 		prev = request[idx];
1224 		idx++;
1225 
1226 out_unlock:
1227 		i915_vma_unlock(batch);
1228 		if (err)
1229 			goto out_request;
1230 	}
1231 
1232 	idx = 0;
1233 	for_each_uabi_engine(engine, i915) {
1234 		long timeout;
1235 
1236 		if (i915_request_completed(request[idx])) {
1237 			pr_err("%s(%s): request completed too early!\n",
1238 			       __func__, engine->name);
1239 			err = -EINVAL;
1240 			goto out_request;
1241 		}
1242 
1243 		err = recursive_batch_resolve(request[idx]->batch);
1244 		if (err) {
1245 			pr_err("%s: failed to resolve batch, err=%d\n",
1246 			       __func__, err);
1247 			goto out_request;
1248 		}
1249 
1250 		timeout = i915_request_wait(request[idx], 0,
1251 					    MAX_SCHEDULE_TIMEOUT);
1252 		if (timeout < 0) {
1253 			err = timeout;
1254 			pr_err("%s: error waiting for request on %s, err=%d\n",
1255 			       __func__, engine->name, err);
1256 			goto out_request;
1257 		}
1258 
1259 		GEM_BUG_ON(!i915_request_completed(request[idx]));
1260 		idx++;
1261 	}
1262 
1263 	err = igt_live_test_end(&t);
1264 
1265 out_request:
1266 	idx = 0;
1267 	for_each_uabi_engine(engine, i915) {
1268 		u32 *cmd;
1269 
1270 		if (!request[idx])
1271 			break;
1272 
1273 		cmd = i915_gem_object_pin_map_unlocked(request[idx]->batch->obj,
1274 						       I915_MAP_WC);
1275 		if (!IS_ERR(cmd)) {
1276 			*cmd = MI_BATCH_BUFFER_END;
1277 
1278 			__i915_gem_object_flush_map(request[idx]->batch->obj,
1279 						    0, sizeof(*cmd));
1280 			i915_gem_object_unpin_map(request[idx]->batch->obj);
1281 
1282 			intel_gt_chipset_flush(engine->gt);
1283 		}
1284 
1285 		i915_vma_put(request[idx]->batch);
1286 		i915_request_put(request[idx]);
1287 		idx++;
1288 	}
1289 out_free:
1290 	kfree(request);
1291 	return err;
1292 }
1293 
1294 static int __live_parallel_engine1(void *arg)
1295 {
1296 	struct intel_engine_cs *engine = arg;
1297 	IGT_TIMEOUT(end_time);
1298 	unsigned long count;
1299 	int err = 0;
1300 
1301 	count = 0;
1302 	intel_engine_pm_get(engine);
1303 	do {
1304 		struct i915_request *rq;
1305 
1306 		rq = i915_request_create(engine->kernel_context);
1307 		if (IS_ERR(rq)) {
1308 			err = PTR_ERR(rq);
1309 			break;
1310 		}
1311 
1312 		i915_request_get(rq);
1313 		i915_request_add(rq);
1314 
1315 		err = 0;
1316 		if (i915_request_wait(rq, 0, HZ / 5) < 0)
1317 			err = -ETIME;
1318 		i915_request_put(rq);
1319 		if (err)
1320 			break;
1321 
1322 		count++;
1323 	} while (!__igt_timeout(end_time, NULL));
1324 	intel_engine_pm_put(engine);
1325 
1326 	pr_info("%s: %lu request + sync\n", engine->name, count);
1327 	return err;
1328 }
1329 
1330 static int __live_parallel_engineN(void *arg)
1331 {
1332 	struct intel_engine_cs *engine = arg;
1333 	IGT_TIMEOUT(end_time);
1334 	unsigned long count;
1335 	int err = 0;
1336 
1337 	count = 0;
1338 	intel_engine_pm_get(engine);
1339 	do {
1340 		struct i915_request *rq;
1341 
1342 		rq = i915_request_create(engine->kernel_context);
1343 		if (IS_ERR(rq)) {
1344 			err = PTR_ERR(rq);
1345 			break;
1346 		}
1347 
1348 		i915_request_add(rq);
1349 		count++;
1350 	} while (!__igt_timeout(end_time, NULL));
1351 	intel_engine_pm_put(engine);
1352 
1353 	pr_info("%s: %lu requests\n", engine->name, count);
1354 	return err;
1355 }
1356 
1357 static bool wake_all(struct drm_i915_private *i915)
1358 {
1359 	if (atomic_dec_and_test(&i915->selftest.counter)) {
1360 		wake_up_var(&i915->selftest.counter);
1361 		return true;
1362 	}
1363 
1364 	return false;
1365 }
1366 
1367 static int wait_for_all(struct drm_i915_private *i915)
1368 {
1369 	if (wake_all(i915))
1370 		return 0;
1371 
1372 	if (wait_var_event_timeout(&i915->selftest.counter,
1373 				   !atomic_read(&i915->selftest.counter),
1374 				   i915_selftest.timeout_jiffies))
1375 		return 0;
1376 
1377 	return -ETIME;
1378 }
1379 
1380 static int __live_parallel_spin(void *arg)
1381 {
1382 	struct intel_engine_cs *engine = arg;
1383 	struct igt_spinner spin;
1384 	struct i915_request *rq;
1385 	int err = 0;
1386 
1387 	/*
1388 	 * Create a spinner running for eternity on each engine. If a second
1389 	 * spinner is incorrectly placed on the same engine, it will not be
1390 	 * able to start in time.
1391 	 */
1392 
1393 	if (igt_spinner_init(&spin, engine->gt)) {
1394 		wake_all(engine->i915);
1395 		return -ENOMEM;
1396 	}
1397 
1398 	intel_engine_pm_get(engine);
1399 	rq = igt_spinner_create_request(&spin,
1400 					engine->kernel_context,
1401 					MI_NOOP); /* no preemption */
1402 	intel_engine_pm_put(engine);
1403 	if (IS_ERR(rq)) {
1404 		err = PTR_ERR(rq);
1405 		if (err == -ENODEV)
1406 			err = 0;
1407 		wake_all(engine->i915);
1408 		goto out_spin;
1409 	}
1410 
1411 	i915_request_get(rq);
1412 	i915_request_add(rq);
1413 	if (igt_wait_for_spinner(&spin, rq)) {
1414 		/* Occupy this engine for the whole test */
1415 		err = wait_for_all(engine->i915);
1416 	} else {
1417 		pr_err("Failed to start spinner on %s\n", engine->name);
1418 		err = -EINVAL;
1419 	}
1420 	igt_spinner_end(&spin);
1421 
1422 	if (err == 0 && i915_request_wait(rq, 0, HZ / 5) < 0)
1423 		err = -EIO;
1424 	i915_request_put(rq);
1425 
1426 out_spin:
1427 	igt_spinner_fini(&spin);
1428 	return err;
1429 }
1430 
1431 static int live_parallel_engines(void *arg)
1432 {
1433 	struct drm_i915_private *i915 = arg;
1434 	static int (* const func[])(void *arg) = {
1435 		__live_parallel_engine1,
1436 		__live_parallel_engineN,
1437 		__live_parallel_spin,
1438 		NULL,
1439 	};
1440 	const unsigned int nengines = num_uabi_engines(i915);
1441 	struct intel_engine_cs *engine;
1442 	int (* const *fn)(void *arg);
1443 	struct task_struct **tsk;
1444 	int err = 0;
1445 
1446 	/*
1447 	 * Check we can submit requests to all engines concurrently. This
1448 	 * tests that we load up the system maximally.
1449 	 */
1450 
1451 	tsk = kcalloc(nengines, sizeof(*tsk), GFP_KERNEL);
1452 	if (!tsk)
1453 		return -ENOMEM;
1454 
1455 	for (fn = func; !err && *fn; fn++) {
1456 		char name[KSYM_NAME_LEN];
1457 		struct igt_live_test t;
1458 		unsigned int idx;
1459 
1460 		snprintf(name, sizeof(name), "%ps", *fn);
1461 		err = igt_live_test_begin(&t, i915, __func__, name);
1462 		if (err)
1463 			break;
1464 
1465 		atomic_set(&i915->selftest.counter, nengines);
1466 
1467 		idx = 0;
1468 		for_each_uabi_engine(engine, i915) {
1469 			tsk[idx] = kthread_run(*fn, engine,
1470 					       "igt/parallel:%s",
1471 					       engine->name);
1472 			if (IS_ERR(tsk[idx])) {
1473 				err = PTR_ERR(tsk[idx]);
1474 				break;
1475 			}
1476 			get_task_struct(tsk[idx++]);
1477 		}
1478 
1479 		yield(); /* start all threads before we kthread_stop() */
1480 
1481 		idx = 0;
1482 		for_each_uabi_engine(engine, i915) {
1483 			int status;
1484 
1485 			if (IS_ERR(tsk[idx]))
1486 				break;
1487 
1488 			status = kthread_stop(tsk[idx]);
1489 			if (status && !err)
1490 				err = status;
1491 
1492 			put_task_struct(tsk[idx++]);
1493 		}
1494 
1495 		if (igt_live_test_end(&t))
1496 			err = -EIO;
1497 	}
1498 
1499 	kfree(tsk);
1500 	return err;
1501 }
1502 
1503 static int
1504 max_batches(struct i915_gem_context *ctx, struct intel_engine_cs *engine)
1505 {
1506 	struct i915_request *rq;
1507 	int ret;
1508 
1509 	/*
1510 	 * Before execlists, all contexts share the same ringbuffer. With
1511 	 * execlists, each context/engine has a separate ringbuffer and
1512 	 * for the purposes of this test, inexhaustible.
1513 	 *
1514 	 * For the global ringbuffer though, we have to be very careful
1515 	 * that we do not wrap while preventing the execution of requests
1516 	 * with a unsignaled fence.
1517 	 */
1518 	if (HAS_EXECLISTS(ctx->i915))
1519 		return INT_MAX;
1520 
1521 	rq = igt_request_alloc(ctx, engine);
1522 	if (IS_ERR(rq)) {
1523 		ret = PTR_ERR(rq);
1524 	} else {
1525 		int sz;
1526 
1527 		ret = rq->ring->size - rq->reserved_space;
1528 		i915_request_add(rq);
1529 
1530 		sz = rq->ring->emit - rq->head;
1531 		if (sz < 0)
1532 			sz += rq->ring->size;
1533 		ret /= sz;
1534 		ret /= 2; /* leave half spare, in case of emergency! */
1535 	}
1536 
1537 	return ret;
1538 }
1539 
1540 static int live_breadcrumbs_smoketest(void *arg)
1541 {
1542 	struct drm_i915_private *i915 = arg;
1543 	const unsigned int nengines = num_uabi_engines(i915);
1544 	const unsigned int ncpus = num_online_cpus();
1545 	unsigned long num_waits, num_fences;
1546 	struct intel_engine_cs *engine;
1547 	struct task_struct **threads;
1548 	struct igt_live_test live;
1549 	intel_wakeref_t wakeref;
1550 	struct smoketest *smoke;
1551 	unsigned int n, idx;
1552 	struct file *file;
1553 	int ret = 0;
1554 
1555 	/*
1556 	 * Smoketest our breadcrumb/signal handling for requests across multiple
1557 	 * threads. A very simple test to only catch the most egregious of bugs.
1558 	 * See __igt_breadcrumbs_smoketest();
1559 	 *
1560 	 * On real hardware this time.
1561 	 */
1562 
1563 	wakeref = intel_runtime_pm_get(&i915->runtime_pm);
1564 
1565 	file = mock_file(i915);
1566 	if (IS_ERR(file)) {
1567 		ret = PTR_ERR(file);
1568 		goto out_rpm;
1569 	}
1570 
1571 	smoke = kcalloc(nengines, sizeof(*smoke), GFP_KERNEL);
1572 	if (!smoke) {
1573 		ret = -ENOMEM;
1574 		goto out_file;
1575 	}
1576 
1577 	threads = kcalloc(ncpus * nengines, sizeof(*threads), GFP_KERNEL);
1578 	if (!threads) {
1579 		ret = -ENOMEM;
1580 		goto out_smoke;
1581 	}
1582 
1583 	smoke[0].request_alloc = __live_request_alloc;
1584 	smoke[0].ncontexts = 64;
1585 	smoke[0].contexts = kcalloc(smoke[0].ncontexts,
1586 				    sizeof(*smoke[0].contexts),
1587 				    GFP_KERNEL);
1588 	if (!smoke[0].contexts) {
1589 		ret = -ENOMEM;
1590 		goto out_threads;
1591 	}
1592 
1593 	for (n = 0; n < smoke[0].ncontexts; n++) {
1594 		smoke[0].contexts[n] = live_context(i915, file);
1595 		if (!smoke[0].contexts[n]) {
1596 			ret = -ENOMEM;
1597 			goto out_contexts;
1598 		}
1599 	}
1600 
1601 	ret = igt_live_test_begin(&live, i915, __func__, "");
1602 	if (ret)
1603 		goto out_contexts;
1604 
1605 	idx = 0;
1606 	for_each_uabi_engine(engine, i915) {
1607 		smoke[idx] = smoke[0];
1608 		smoke[idx].engine = engine;
1609 		smoke[idx].max_batch =
1610 			max_batches(smoke[0].contexts[0], engine);
1611 		if (smoke[idx].max_batch < 0) {
1612 			ret = smoke[idx].max_batch;
1613 			goto out_flush;
1614 		}
1615 		/* One ring interleaved between requests from all cpus */
1616 		smoke[idx].max_batch /= num_online_cpus() + 1;
1617 		pr_debug("Limiting batches to %d requests on %s\n",
1618 			 smoke[idx].max_batch, engine->name);
1619 
1620 		for (n = 0; n < ncpus; n++) {
1621 			struct task_struct *tsk;
1622 
1623 			tsk = kthread_run(__igt_breadcrumbs_smoketest,
1624 					  &smoke[idx], "igt/%d.%d", idx, n);
1625 			if (IS_ERR(tsk)) {
1626 				ret = PTR_ERR(tsk);
1627 				goto out_flush;
1628 			}
1629 
1630 			get_task_struct(tsk);
1631 			threads[idx * ncpus + n] = tsk;
1632 		}
1633 
1634 		idx++;
1635 	}
1636 
1637 	yield(); /* start all threads before we begin */
1638 	msleep(jiffies_to_msecs(i915_selftest.timeout_jiffies));
1639 
1640 out_flush:
1641 	idx = 0;
1642 	num_waits = 0;
1643 	num_fences = 0;
1644 	for_each_uabi_engine(engine, i915) {
1645 		for (n = 0; n < ncpus; n++) {
1646 			struct task_struct *tsk = threads[idx * ncpus + n];
1647 			int err;
1648 
1649 			if (!tsk)
1650 				continue;
1651 
1652 			err = kthread_stop(tsk);
1653 			if (err < 0 && !ret)
1654 				ret = err;
1655 
1656 			put_task_struct(tsk);
1657 		}
1658 
1659 		num_waits += atomic_long_read(&smoke[idx].num_waits);
1660 		num_fences += atomic_long_read(&smoke[idx].num_fences);
1661 		idx++;
1662 	}
1663 	pr_info("Completed %lu waits for %lu fences across %d engines and %d cpus\n",
1664 		num_waits, num_fences, idx, ncpus);
1665 
1666 	ret = igt_live_test_end(&live) ?: ret;
1667 out_contexts:
1668 	kfree(smoke[0].contexts);
1669 out_threads:
1670 	kfree(threads);
1671 out_smoke:
1672 	kfree(smoke);
1673 out_file:
1674 	fput(file);
1675 out_rpm:
1676 	intel_runtime_pm_put(&i915->runtime_pm, wakeref);
1677 
1678 	return ret;
1679 }
1680 
1681 int i915_request_live_selftests(struct drm_i915_private *i915)
1682 {
1683 	static const struct i915_subtest tests[] = {
1684 		SUBTEST(live_nop_request),
1685 		SUBTEST(live_all_engines),
1686 		SUBTEST(live_sequential_engines),
1687 		SUBTEST(live_parallel_engines),
1688 		SUBTEST(live_empty_request),
1689 		SUBTEST(live_cancel_request),
1690 		SUBTEST(live_breadcrumbs_smoketest),
1691 	};
1692 
1693 	if (intel_gt_is_wedged(&i915->gt))
1694 		return 0;
1695 
1696 	return i915_subtests(tests, i915);
1697 }
1698 
1699 static int switch_to_kernel_sync(struct intel_context *ce, int err)
1700 {
1701 	struct i915_request *rq;
1702 	struct dma_fence *fence;
1703 
1704 	rq = intel_engine_create_kernel_request(ce->engine);
1705 	if (IS_ERR(rq))
1706 		return PTR_ERR(rq);
1707 
1708 	fence = i915_active_fence_get(&ce->timeline->last_request);
1709 	if (fence) {
1710 		i915_request_await_dma_fence(rq, fence);
1711 		dma_fence_put(fence);
1712 	}
1713 
1714 	rq = i915_request_get(rq);
1715 	i915_request_add(rq);
1716 	if (i915_request_wait(rq, 0, HZ / 2) < 0 && !err)
1717 		err = -ETIME;
1718 	i915_request_put(rq);
1719 
1720 	while (!err && !intel_engine_is_idle(ce->engine))
1721 		intel_engine_flush_submission(ce->engine);
1722 
1723 	return err;
1724 }
1725 
1726 struct perf_stats {
1727 	struct intel_engine_cs *engine;
1728 	unsigned long count;
1729 	ktime_t time;
1730 	ktime_t busy;
1731 	u64 runtime;
1732 };
1733 
1734 struct perf_series {
1735 	struct drm_i915_private *i915;
1736 	unsigned int nengines;
1737 	struct intel_context *ce[];
1738 };
1739 
/*
 * sort() comparator for u32 values, ascending.
 *
 * Returns a negative, zero or positive value when *A is respectively less
 * than, equal to or greater than *B. The result is built from comparisons
 * rather than a subtraction: a u32 difference truncated to int has the wrong
 * sign whenever the operands are more than INT_MAX apart.
 */
static int cmp_u32(const void *A, const void *B)
{
	const u32 *a = A, *b = B;

	return (*a > *b) - (*a < *b);
}
1746 
1747 static u32 trifilter(u32 *a)
1748 {
1749 	u64 sum;
1750 
1751 #define TF_COUNT 5
1752 	sort(a, TF_COUNT, sizeof(*a), cmp_u32, NULL);
1753 
1754 	sum = mul_u32_u32(a[2], 2);
1755 	sum += a[1];
1756 	sum += a[3];
1757 
1758 	GEM_BUG_ON(sum > U32_MAX);
1759 	return sum;
1760 #define TF_BIAS 2
1761 }
1762 
1763 static u64 cycles_to_ns(struct intel_engine_cs *engine, u32 cycles)
1764 {
1765 	u64 ns = intel_gt_clock_interval_to_ns(engine->gt, cycles);
1766 
1767 	return DIV_ROUND_CLOSEST(ns, 1 << TF_BIAS);
1768 }
1769 
1770 static u32 *emit_timestamp_store(u32 *cs, struct intel_context *ce, u32 offset)
1771 {
1772 	*cs++ = MI_STORE_REGISTER_MEM_GEN8 | MI_USE_GGTT;
1773 	*cs++ = i915_mmio_reg_offset(RING_TIMESTAMP((ce->engine->mmio_base)));
1774 	*cs++ = offset;
1775 	*cs++ = 0;
1776 
1777 	return cs;
1778 }
1779 
1780 static u32 *emit_store_dw(u32 *cs, u32 offset, u32 value)
1781 {
1782 	*cs++ = MI_STORE_DWORD_IMM_GEN4 | MI_USE_GGTT;
1783 	*cs++ = offset;
1784 	*cs++ = 0;
1785 	*cs++ = value;
1786 
1787 	return cs;
1788 }
1789 
1790 static u32 *emit_semaphore_poll(u32 *cs, u32 mode, u32 value, u32 offset)
1791 {
1792 	*cs++ = MI_SEMAPHORE_WAIT |
1793 		MI_SEMAPHORE_GLOBAL_GTT |
1794 		MI_SEMAPHORE_POLL |
1795 		mode;
1796 	*cs++ = value;
1797 	*cs++ = offset;
1798 	*cs++ = 0;
1799 
1800 	return cs;
1801 }
1802 
1803 static u32 *emit_semaphore_poll_until(u32 *cs, u32 offset, u32 value)
1804 {
1805 	return emit_semaphore_poll(cs, MI_SEMAPHORE_SAD_EQ_SDD, value, offset);
1806 }
1807 
1808 static void semaphore_set(u32 *sema, u32 value)
1809 {
1810 	WRITE_ONCE(*sema, value);
1811 	wmb(); /* flush the update to the cache, and beyond */
1812 }
1813 
1814 static u32 *hwsp_scratch(const struct intel_context *ce)
1815 {
1816 	return memset32(ce->engine->status_page.addr + 1000, 0, 21);
1817 }
1818 
1819 static u32 hwsp_offset(const struct intel_context *ce, u32 *dw)
1820 {
1821 	return (i915_ggtt_offset(ce->engine->status_page.vma) +
1822 		offset_in_page(dw));
1823 }
1824 
1825 static int measure_semaphore_response(struct intel_context *ce)
1826 {
1827 	u32 *sema = hwsp_scratch(ce);
1828 	const u32 offset = hwsp_offset(ce, sema);
1829 	u32 elapsed[TF_COUNT], cycles;
1830 	struct i915_request *rq;
1831 	u32 *cs;
1832 	int err;
1833 	int i;
1834 
1835 	/*
1836 	 * Measure how many cycles it takes for the HW to detect the change
1837 	 * in a semaphore value.
1838 	 *
1839 	 *    A: read CS_TIMESTAMP from CPU
1840 	 *    poke semaphore
1841 	 *    B: read CS_TIMESTAMP on GPU
1842 	 *
1843 	 * Semaphore latency: B - A
1844 	 */
1845 
1846 	semaphore_set(sema, -1);
1847 
1848 	rq = i915_request_create(ce);
1849 	if (IS_ERR(rq))
1850 		return PTR_ERR(rq);
1851 
1852 	cs = intel_ring_begin(rq, 4 + 12 * ARRAY_SIZE(elapsed));
1853 	if (IS_ERR(cs)) {
1854 		i915_request_add(rq);
1855 		err = PTR_ERR(cs);
1856 		goto err;
1857 	}
1858 
1859 	cs = emit_store_dw(cs, offset, 0);
1860 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1861 		cs = emit_semaphore_poll_until(cs, offset, i);
1862 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1863 		cs = emit_store_dw(cs, offset, 0);
1864 	}
1865 
1866 	intel_ring_advance(rq, cs);
1867 	i915_request_add(rq);
1868 
1869 	if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1870 		err = -EIO;
1871 		goto err;
1872 	}
1873 
1874 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1875 		preempt_disable();
1876 		cycles = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1877 		semaphore_set(sema, i);
1878 		preempt_enable();
1879 
1880 		if (wait_for(READ_ONCE(*sema) == 0, 50)) {
1881 			err = -EIO;
1882 			goto err;
1883 		}
1884 
1885 		elapsed[i - 1] = sema[i] - cycles;
1886 	}
1887 
1888 	cycles = trifilter(elapsed);
1889 	pr_info("%s: semaphore response %d cycles, %lluns\n",
1890 		ce->engine->name, cycles >> TF_BIAS,
1891 		cycles_to_ns(ce->engine, cycles));
1892 
1893 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1894 
1895 err:
1896 	intel_gt_set_wedged(ce->engine->gt);
1897 	return err;
1898 }
1899 
1900 static int measure_idle_dispatch(struct intel_context *ce)
1901 {
1902 	u32 *sema = hwsp_scratch(ce);
1903 	const u32 offset = hwsp_offset(ce, sema);
1904 	u32 elapsed[TF_COUNT], cycles;
1905 	u32 *cs;
1906 	int err;
1907 	int i;
1908 
1909 	/*
1910 	 * Measure how long it takes for us to submit a request while the
1911 	 * engine is idle, but is resting in our context.
1912 	 *
1913 	 *    A: read CS_TIMESTAMP from CPU
1914 	 *    submit request
1915 	 *    B: read CS_TIMESTAMP on GPU
1916 	 *
1917 	 * Submission latency: B - A
1918 	 */
1919 
1920 	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
1921 		struct i915_request *rq;
1922 
1923 		err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1924 		if (err)
1925 			return err;
1926 
1927 		rq = i915_request_create(ce);
1928 		if (IS_ERR(rq)) {
1929 			err = PTR_ERR(rq);
1930 			goto err;
1931 		}
1932 
1933 		cs = intel_ring_begin(rq, 4);
1934 		if (IS_ERR(cs)) {
1935 			i915_request_add(rq);
1936 			err = PTR_ERR(cs);
1937 			goto err;
1938 		}
1939 
1940 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
1941 
1942 		intel_ring_advance(rq, cs);
1943 
1944 		preempt_disable();
1945 		local_bh_disable();
1946 		elapsed[i] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
1947 		i915_request_add(rq);
1948 		local_bh_enable();
1949 		preempt_enable();
1950 	}
1951 
1952 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
1953 	if (err)
1954 		goto err;
1955 
1956 	for (i = 0; i < ARRAY_SIZE(elapsed); i++)
1957 		elapsed[i] = sema[i] - elapsed[i];
1958 
1959 	cycles = trifilter(elapsed);
1960 	pr_info("%s: idle dispatch latency %d cycles, %lluns\n",
1961 		ce->engine->name, cycles >> TF_BIAS,
1962 		cycles_to_ns(ce->engine, cycles));
1963 
1964 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
1965 
1966 err:
1967 	intel_gt_set_wedged(ce->engine->gt);
1968 	return err;
1969 }
1970 
1971 static int measure_busy_dispatch(struct intel_context *ce)
1972 {
1973 	u32 *sema = hwsp_scratch(ce);
1974 	const u32 offset = hwsp_offset(ce, sema);
1975 	u32 elapsed[TF_COUNT + 1], cycles;
1976 	u32 *cs;
1977 	int err;
1978 	int i;
1979 
1980 	/*
1981 	 * Measure how long it takes for us to submit a request while the
1982 	 * engine is busy, polling on a semaphore in our context. With
1983 	 * direct submission, this will include the cost of a lite restore.
1984 	 *
1985 	 *    A: read CS_TIMESTAMP from CPU
1986 	 *    submit request
1987 	 *    B: read CS_TIMESTAMP on GPU
1988 	 *
1989 	 * Submission latency: B - A
1990 	 */
1991 
1992 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
1993 		struct i915_request *rq;
1994 
1995 		rq = i915_request_create(ce);
1996 		if (IS_ERR(rq)) {
1997 			err = PTR_ERR(rq);
1998 			goto err;
1999 		}
2000 
2001 		cs = intel_ring_begin(rq, 12);
2002 		if (IS_ERR(cs)) {
2003 			i915_request_add(rq);
2004 			err = PTR_ERR(cs);
2005 			goto err;
2006 		}
2007 
2008 		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
2009 		cs = emit_semaphore_poll_until(cs, offset, i);
2010 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2011 
2012 		intel_ring_advance(rq, cs);
2013 
2014 		if (i > 1 && wait_for(READ_ONCE(sema[i - 1]), 500)) {
2015 			err = -EIO;
2016 			goto err;
2017 		}
2018 
2019 		preempt_disable();
2020 		local_bh_disable();
2021 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2022 		i915_request_add(rq);
2023 		local_bh_enable();
2024 		semaphore_set(sema, i - 1);
2025 		preempt_enable();
2026 	}
2027 
2028 	wait_for(READ_ONCE(sema[i - 1]), 500);
2029 	semaphore_set(sema, i - 1);
2030 
2031 	for (i = 1; i <= TF_COUNT; i++) {
2032 		GEM_BUG_ON(sema[i] == -1);
2033 		elapsed[i - 1] = sema[i] - elapsed[i];
2034 	}
2035 
2036 	cycles = trifilter(elapsed);
2037 	pr_info("%s: busy dispatch latency %d cycles, %lluns\n",
2038 		ce->engine->name, cycles >> TF_BIAS,
2039 		cycles_to_ns(ce->engine, cycles));
2040 
2041 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2042 
2043 err:
2044 	intel_gt_set_wedged(ce->engine->gt);
2045 	return err;
2046 }
2047 
2048 static int plug(struct intel_engine_cs *engine, u32 *sema, u32 mode, int value)
2049 {
2050 	const u32 offset =
2051 		i915_ggtt_offset(engine->status_page.vma) +
2052 		offset_in_page(sema);
2053 	struct i915_request *rq;
2054 	u32 *cs;
2055 
2056 	rq = i915_request_create(engine->kernel_context);
2057 	if (IS_ERR(rq))
2058 		return PTR_ERR(rq);
2059 
2060 	cs = intel_ring_begin(rq, 4);
2061 	if (IS_ERR(cs)) {
2062 		i915_request_add(rq);
2063 		return PTR_ERR(cs);
2064 	}
2065 
2066 	cs = emit_semaphore_poll(cs, mode, value, offset);
2067 
2068 	intel_ring_advance(rq, cs);
2069 	i915_request_add(rq);
2070 
2071 	return 0;
2072 }
2073 
2074 static int measure_inter_request(struct intel_context *ce)
2075 {
2076 	u32 *sema = hwsp_scratch(ce);
2077 	const u32 offset = hwsp_offset(ce, sema);
2078 	u32 elapsed[TF_COUNT + 1], cycles;
2079 	struct i915_sw_fence *submit;
2080 	int i, err;
2081 
2082 	/*
2083 	 * Measure how long it takes to advance from one request into the
2084 	 * next. Between each request we flush the GPU caches to memory,
2085 	 * update the breadcrumbs, and then invalidate those caches.
2086 	 * We queue up all the requests to be submitted in one batch so
2087 	 * it should be one set of contiguous measurements.
2088 	 *
2089 	 *    A: read CS_TIMESTAMP on GPU
2090 	 *    advance request
2091 	 *    B: read CS_TIMESTAMP on GPU
2092 	 *
2093 	 * Request latency: B - A
2094 	 */
2095 
2096 	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2097 	if (err)
2098 		return err;
2099 
2100 	submit = heap_fence_create(GFP_KERNEL);
2101 	if (!submit) {
2102 		semaphore_set(sema, 1);
2103 		return -ENOMEM;
2104 	}
2105 
2106 	intel_engine_flush_submission(ce->engine);
2107 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2108 		struct i915_request *rq;
2109 		u32 *cs;
2110 
2111 		rq = i915_request_create(ce);
2112 		if (IS_ERR(rq)) {
2113 			err = PTR_ERR(rq);
2114 			goto err_submit;
2115 		}
2116 
2117 		err = i915_sw_fence_await_sw_fence_gfp(&rq->submit,
2118 						       submit,
2119 						       GFP_KERNEL);
2120 		if (err < 0) {
2121 			i915_request_add(rq);
2122 			goto err_submit;
2123 		}
2124 
2125 		cs = intel_ring_begin(rq, 4);
2126 		if (IS_ERR(cs)) {
2127 			i915_request_add(rq);
2128 			err = PTR_ERR(cs);
2129 			goto err_submit;
2130 		}
2131 
2132 		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));
2133 
2134 		intel_ring_advance(rq, cs);
2135 		i915_request_add(rq);
2136 	}
2137 	i915_sw_fence_commit(submit);
2138 	intel_engine_flush_submission(ce->engine);
2139 	heap_fence_put(submit);
2140 
2141 	semaphore_set(sema, 1);
2142 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2143 	if (err)
2144 		goto err;
2145 
2146 	for (i = 1; i <= TF_COUNT; i++)
2147 		elapsed[i - 1] = sema[i + 1] - sema[i];
2148 
2149 	cycles = trifilter(elapsed);
2150 	pr_info("%s: inter-request latency %d cycles, %lluns\n",
2151 		ce->engine->name, cycles >> TF_BIAS,
2152 		cycles_to_ns(ce->engine, cycles));
2153 
2154 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2155 
2156 err_submit:
2157 	i915_sw_fence_commit(submit);
2158 	heap_fence_put(submit);
2159 	semaphore_set(sema, 1);
2160 err:
2161 	intel_gt_set_wedged(ce->engine->gt);
2162 	return err;
2163 }
2164 
2165 static int measure_context_switch(struct intel_context *ce)
2166 {
2167 	u32 *sema = hwsp_scratch(ce);
2168 	const u32 offset = hwsp_offset(ce, sema);
2169 	struct i915_request *fence = NULL;
2170 	u32 elapsed[TF_COUNT + 1], cycles;
2171 	int i, j, err;
2172 	u32 *cs;
2173 
2174 	/*
2175 	 * Measure how long it takes to advance from one request in one
2176 	 * context to a request in another context. This allows us to
2177 	 * measure how long the context save/restore take, along with all
2178 	 * the inter-context setup we require.
2179 	 *
2180 	 *    A: read CS_TIMESTAMP on GPU
2181 	 *    switch context
2182 	 *    B: read CS_TIMESTAMP on GPU
2183 	 *
2184 	 * Context switch latency: B - A
2185 	 */
2186 
2187 	err = plug(ce->engine, sema, MI_SEMAPHORE_SAD_NEQ_SDD, 0);
2188 	if (err)
2189 		return err;
2190 
2191 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2192 		struct intel_context *arr[] = {
2193 			ce, ce->engine->kernel_context
2194 		};
2195 		u32 addr = offset + ARRAY_SIZE(arr) * i * sizeof(u32);
2196 
2197 		for (j = 0; j < ARRAY_SIZE(arr); j++) {
2198 			struct i915_request *rq;
2199 
2200 			rq = i915_request_create(arr[j]);
2201 			if (IS_ERR(rq)) {
2202 				err = PTR_ERR(rq);
2203 				goto err_fence;
2204 			}
2205 
2206 			if (fence) {
2207 				err = i915_request_await_dma_fence(rq,
2208 								   &fence->fence);
2209 				if (err) {
2210 					i915_request_add(rq);
2211 					goto err_fence;
2212 				}
2213 			}
2214 
2215 			cs = intel_ring_begin(rq, 4);
2216 			if (IS_ERR(cs)) {
2217 				i915_request_add(rq);
2218 				err = PTR_ERR(cs);
2219 				goto err_fence;
2220 			}
2221 
2222 			cs = emit_timestamp_store(cs, ce, addr);
2223 			addr += sizeof(u32);
2224 
2225 			intel_ring_advance(rq, cs);
2226 
2227 			i915_request_put(fence);
2228 			fence = i915_request_get(rq);
2229 
2230 			i915_request_add(rq);
2231 		}
2232 	}
2233 	i915_request_put(fence);
2234 	intel_engine_flush_submission(ce->engine);
2235 
2236 	semaphore_set(sema, 1);
2237 	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
2238 	if (err)
2239 		goto err;
2240 
2241 	for (i = 1; i <= TF_COUNT; i++)
2242 		elapsed[i - 1] = sema[2 * i + 2] - sema[2 * i + 1];
2243 
2244 	cycles = trifilter(elapsed);
2245 	pr_info("%s: context switch latency %d cycles, %lluns\n",
2246 		ce->engine->name, cycles >> TF_BIAS,
2247 		cycles_to_ns(ce->engine, cycles));
2248 
2249 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2250 
2251 err_fence:
2252 	i915_request_put(fence);
2253 	semaphore_set(sema, 1);
2254 err:
2255 	intel_gt_set_wedged(ce->engine->gt);
2256 	return err;
2257 }
2258 
2259 static int measure_preemption(struct intel_context *ce)
2260 {
2261 	u32 *sema = hwsp_scratch(ce);
2262 	const u32 offset = hwsp_offset(ce, sema);
2263 	u32 elapsed[TF_COUNT], cycles;
2264 	u32 *cs;
2265 	int err;
2266 	int i;
2267 
2268 	/*
2269 	 * We measure two latencies while triggering preemption. The first
2270 	 * latency is how long it takes for us to submit a preempting request.
2271 	 * The second latency is how it takes for us to return from the
2272 	 * preemption back to the original context.
2273 	 *
2274 	 *    A: read CS_TIMESTAMP from CPU
2275 	 *    submit preemption
2276 	 *    B: read CS_TIMESTAMP on GPU (in preempting context)
2277 	 *    context switch
2278 	 *    C: read CS_TIMESTAMP on GPU (in original context)
2279 	 *
2280 	 * Preemption dispatch latency: B - A
2281 	 * Preemption switch latency: C - B
2282 	 */
2283 
2284 	if (!intel_engine_has_preemption(ce->engine))
2285 		return 0;
2286 
2287 	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
2288 		u32 addr = offset + 2 * i * sizeof(u32);
2289 		struct i915_request *rq;
2290 
2291 		rq = i915_request_create(ce);
2292 		if (IS_ERR(rq)) {
2293 			err = PTR_ERR(rq);
2294 			goto err;
2295 		}
2296 
2297 		cs = intel_ring_begin(rq, 12);
2298 		if (IS_ERR(cs)) {
2299 			i915_request_add(rq);
2300 			err = PTR_ERR(cs);
2301 			goto err;
2302 		}
2303 
2304 		cs = emit_store_dw(cs, addr, -1);
2305 		cs = emit_semaphore_poll_until(cs, offset, i);
2306 		cs = emit_timestamp_store(cs, ce, addr + sizeof(u32));
2307 
2308 		intel_ring_advance(rq, cs);
2309 		i915_request_add(rq);
2310 
2311 		if (wait_for(READ_ONCE(sema[2 * i]) == -1, 500)) {
2312 			err = -EIO;
2313 			goto err;
2314 		}
2315 
2316 		rq = i915_request_create(ce->engine->kernel_context);
2317 		if (IS_ERR(rq)) {
2318 			err = PTR_ERR(rq);
2319 			goto err;
2320 		}
2321 
2322 		cs = intel_ring_begin(rq, 8);
2323 		if (IS_ERR(cs)) {
2324 			i915_request_add(rq);
2325 			err = PTR_ERR(cs);
2326 			goto err;
2327 		}
2328 
2329 		cs = emit_timestamp_store(cs, ce, addr);
2330 		cs = emit_store_dw(cs, offset, i);
2331 
2332 		intel_ring_advance(rq, cs);
2333 		rq->sched.attr.priority = I915_PRIORITY_BARRIER;
2334 
2335 		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
2336 		i915_request_add(rq);
2337 	}
2338 
2339 	if (wait_for(READ_ONCE(sema[2 * i - 2]) != -1, 500)) {
2340 		err = -EIO;
2341 		goto err;
2342 	}
2343 
2344 	for (i = 1; i <= TF_COUNT; i++)
2345 		elapsed[i - 1] = sema[2 * i + 0] - elapsed[i - 1];
2346 
2347 	cycles = trifilter(elapsed);
2348 	pr_info("%s: preemption dispatch latency %d cycles, %lluns\n",
2349 		ce->engine->name, cycles >> TF_BIAS,
2350 		cycles_to_ns(ce->engine, cycles));
2351 
2352 	for (i = 1; i <= TF_COUNT; i++)
2353 		elapsed[i - 1] = sema[2 * i + 1] - sema[2 * i + 0];
2354 
2355 	cycles = trifilter(elapsed);
2356 	pr_info("%s: preemption switch latency %d cycles, %lluns\n",
2357 		ce->engine->name, cycles >> TF_BIAS,
2358 		cycles_to_ns(ce->engine, cycles));
2359 
2360 	return intel_gt_wait_for_idle(ce->engine->gt, HZ);
2361 
2362 err:
2363 	intel_gt_set_wedged(ce->engine->gt);
2364 	return err;
2365 }
2366 
/*
 * Fence-callback wrapper used by measure_completion().
 *
 * @base: callback node handed to dma_fence_add_callback()
 * @seen: set to true by signal_cb() once the callback has run
 */
struct signal_cb {
	struct dma_fence_cb base;
	bool seen;
};
2371 
2372 static void signal_cb(struct dma_fence *fence, struct dma_fence_cb *cb)
2373 {
2374 	struct signal_cb *s = container_of(cb, typeof(*s), base);
2375 
2376 	smp_store_mb(s->seen, true); /* be safe, be strong */
2377 }
2378 
/*
 * measure_completion - time the GPU -> CPU completion notification
 * @ce: context on whose engine the requests are executed
 *
 * Runs TF_COUNT iterations, each recording a GPU-side timestamp just before
 * the request completes and a CPU-side RING_TIMESTAMP read taken once the
 * request's fence callback has fired. The filtered difference is logged.
 *
 * Returns 0 on success or a negative error code; on any failure the GT is
 * marked wedged before returning.
 */
static int measure_completion(struct intel_context *ce)
{
	/* scratch dwords and their GPU address — presumably in the HWSP; see hwsp_scratch()/hwsp_offset() */
	u32 *sema = hwsp_scratch(ce);
	const u32 offset = hwsp_offset(ce, sema);
	u32 elapsed[TF_COUNT], cycles;
	u32 *cs;
	int err;
	int i;

	/*
	 * Measure how long it takes for the completion signal (interrupt)
	 * sent by the GPU to be processed by the CPU.
	 *
	 *    A: read CS_TIMESTAMP on GPU
	 *    signal
	 *    B: read CS_TIMESTAMP from CPU
	 *
	 * Completion latency: B - A
	 */

	/* i is 1-based: slot sema[i] belongs to iteration i, elapsed[i - 1] holds its result */
	for (i = 1; i <= ARRAY_SIZE(elapsed); i++) {
		struct signal_cb cb = { .seen = false };
		struct i915_request *rq;

		rq = i915_request_create(ce);
		if (IS_ERR(rq)) {
			err = PTR_ERR(rq);
			goto err;
		}

		cs = intel_ring_begin(rq, 12);
		if (IS_ERR(cs)) {
			/* the request was already created, so it is still added before bailing out */
			i915_request_add(rq);
			err = PTR_ERR(cs);
			goto err;
		}

		/*
		 * GPU program: write -1 into sema[i] (the "I am running" marker
		 * polled below), then wait on the semaphore at @offset for
		 * value i (presumably; see emit_semaphore_poll_until()), then
		 * overwrite sema[i] with the timestamp (point A).
		 */
		cs = emit_store_dw(cs, offset + i * sizeof(u32), -1);
		cs = emit_semaphore_poll_until(cs, offset, i);
		cs = emit_timestamp_store(cs, ce, offset + i * sizeof(u32));

		intel_ring_advance(rq, cs);

		/*
		 * NOTE(review): the return value of dma_fence_add_callback() is
		 * ignored; if it failed, cb.seen would never be set and the
		 * spin below has no timeout — confirm this cannot happen here.
		 */
		dma_fence_add_callback(&rq->fence, &cb.base, signal_cb);
		i915_request_add(rq);

		/* wait until the GPU has written the -1 marker, i.e. the request has started */
		intel_engine_flush_submission(ce->engine);
		if (wait_for(READ_ONCE(sema[i]) == -1, 50)) {
			err = -EIO;
			goto err;
		}

		/*
		 * Release the request and busy-wait for its fence callback with
		 * preemption disabled, so that the CPU timestamp (point B) is
		 * read right after the callback is observed.
		 */
		preempt_disable();
		semaphore_set(sema, i);
		while (!READ_ONCE(cb.seen))
			cpu_relax();

		elapsed[i - 1] = ENGINE_READ_FW(ce->engine, RING_TIMESTAMP);
		preempt_enable();
	}

	err = intel_gt_wait_for_idle(ce->engine->gt, HZ / 2);
	if (err)
		goto err;

	/* B - A: CPU-side timestamp minus the GPU timestamp stored in sema[i + 1] */
	for (i = 0; i < ARRAY_SIZE(elapsed); i++) {
		/* a slot still holding -1 means the timestamp store never happened */
		GEM_BUG_ON(sema[i + 1] == -1);
		elapsed[i] = elapsed[i] - sema[i + 1];
	}

	/* the filtered value is scaled by TF_BIAS (it is shifted down before printing) */
	cycles = trifilter(elapsed);
	pr_info("%s: completion latency %d cycles, %lluns\n",
		ce->engine->name, cycles >> TF_BIAS,
		cycles_to_ns(ce->engine, cycles));

	return intel_gt_wait_for_idle(ce->engine->gt, HZ);

err:
	intel_gt_set_wedged(ce->engine->gt);
	return err;
}
2460 
2461 static void rps_pin(struct intel_gt *gt)
2462 {
2463 	/* Pin the frequency to max */
2464 	atomic_inc(&gt->rps.num_waiters);
2465 	intel_uncore_forcewake_get(gt->uncore, FORCEWAKE_ALL);
2466 
2467 	mutex_lock(&gt->rps.lock);
2468 	intel_rps_set(&gt->rps, gt->rps.max_freq);
2469 	mutex_unlock(&gt->rps.lock);
2470 }
2471 
2472 static void rps_unpin(struct intel_gt *gt)
2473 {
2474 	intel_uncore_forcewake_put(gt->uncore, FORCEWAKE_ALL);
2475 	atomic_dec(&gt->rps.num_waiters);
2476 }
2477 
2478 static int perf_request_latency(void *arg)
2479 {
2480 	struct drm_i915_private *i915 = arg;
2481 	struct intel_engine_cs *engine;
2482 	struct pm_qos_request qos;
2483 	int err = 0;
2484 
2485 	if (INTEL_GEN(i915) < 8) /* per-engine CS timestamp, semaphores */
2486 		return 0;
2487 
2488 	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */
2489 
2490 	for_each_uabi_engine(engine, i915) {
2491 		struct intel_context *ce;
2492 
2493 		ce = intel_context_create(engine);
2494 		if (IS_ERR(ce)) {
2495 			err = PTR_ERR(ce);
2496 			goto out;
2497 		}
2498 
2499 		err = intel_context_pin(ce);
2500 		if (err) {
2501 			intel_context_put(ce);
2502 			goto out;
2503 		}
2504 
2505 		st_engine_heartbeat_disable(engine);
2506 		rps_pin(engine->gt);
2507 
2508 		if (err == 0)
2509 			err = measure_semaphore_response(ce);
2510 		if (err == 0)
2511 			err = measure_idle_dispatch(ce);
2512 		if (err == 0)
2513 			err = measure_busy_dispatch(ce);
2514 		if (err == 0)
2515 			err = measure_inter_request(ce);
2516 		if (err == 0)
2517 			err = measure_context_switch(ce);
2518 		if (err == 0)
2519 			err = measure_preemption(ce);
2520 		if (err == 0)
2521 			err = measure_completion(ce);
2522 
2523 		rps_unpin(engine->gt);
2524 		st_engine_heartbeat_enable(engine);
2525 
2526 		intel_context_unpin(ce);
2527 		intel_context_put(ce);
2528 		if (err)
2529 			goto out;
2530 	}
2531 
2532 out:
2533 	if (igt_flush_test(i915))
2534 		err = -EIO;
2535 
2536 	cpu_latency_qos_remove_request(&qos);
2537 	return err;
2538 }
2539 
2540 static int s_sync0(void *arg)
2541 {
2542 	struct perf_series *ps = arg;
2543 	IGT_TIMEOUT(end_time);
2544 	unsigned int idx = 0;
2545 	int err = 0;
2546 
2547 	GEM_BUG_ON(!ps->nengines);
2548 	do {
2549 		struct i915_request *rq;
2550 
2551 		rq = i915_request_create(ps->ce[idx]);
2552 		if (IS_ERR(rq)) {
2553 			err = PTR_ERR(rq);
2554 			break;
2555 		}
2556 
2557 		i915_request_get(rq);
2558 		i915_request_add(rq);
2559 
2560 		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2561 			err = -ETIME;
2562 		i915_request_put(rq);
2563 		if (err)
2564 			break;
2565 
2566 		if (++idx == ps->nengines)
2567 			idx = 0;
2568 	} while (!__igt_timeout(end_time, NULL));
2569 
2570 	return err;
2571 }
2572 
2573 static int s_sync1(void *arg)
2574 {
2575 	struct perf_series *ps = arg;
2576 	struct i915_request *prev = NULL;
2577 	IGT_TIMEOUT(end_time);
2578 	unsigned int idx = 0;
2579 	int err = 0;
2580 
2581 	GEM_BUG_ON(!ps->nengines);
2582 	do {
2583 		struct i915_request *rq;
2584 
2585 		rq = i915_request_create(ps->ce[idx]);
2586 		if (IS_ERR(rq)) {
2587 			err = PTR_ERR(rq);
2588 			break;
2589 		}
2590 
2591 		i915_request_get(rq);
2592 		i915_request_add(rq);
2593 
2594 		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2595 			err = -ETIME;
2596 		i915_request_put(prev);
2597 		prev = rq;
2598 		if (err)
2599 			break;
2600 
2601 		if (++idx == ps->nengines)
2602 			idx = 0;
2603 	} while (!__igt_timeout(end_time, NULL));
2604 	i915_request_put(prev);
2605 
2606 	return err;
2607 }
2608 
2609 static int s_many(void *arg)
2610 {
2611 	struct perf_series *ps = arg;
2612 	IGT_TIMEOUT(end_time);
2613 	unsigned int idx = 0;
2614 
2615 	GEM_BUG_ON(!ps->nengines);
2616 	do {
2617 		struct i915_request *rq;
2618 
2619 		rq = i915_request_create(ps->ce[idx]);
2620 		if (IS_ERR(rq))
2621 			return PTR_ERR(rq);
2622 
2623 		i915_request_add(rq);
2624 
2625 		if (++idx == ps->nengines)
2626 			idx = 0;
2627 	} while (!__igt_timeout(end_time, NULL));
2628 
2629 	return 0;
2630 }
2631 
/*
 * perf_series_engines - run the s_*() workloads from a single thread
 * @arg: the struct drm_i915_private under test
 *
 * Creates and pins one context per uabi engine, then for each workload in
 * func[] (s_sync0, s_sync1, s_many) samples per-engine busy time, context
 * runtime and wall time before and after the run and logs the deltas.
 *
 * Returns 0 on success or a negative error code (-ENOMEM on allocation
 * failure, -EIO if igt_live_test_end() reports a problem).
 */
static int perf_series_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	/* NULL-terminated list of workloads, run in this order */
	static int (* const func[])(void *arg) = {
		s_sync0,
		s_sync1,
		s_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	struct perf_stats *stats;
	struct perf_series *ps;
	unsigned int idx;
	int err = 0;

	stats = kcalloc(nengines, sizeof(*stats), GFP_KERNEL);
	if (!stats)
		return -ENOMEM;

	/* zeroed allocation: unused ps->ce[] slots stay NULL, which the cleanup loop relies on */
	ps = kzalloc(struct_size(ps, ce, nengines), GFP_KERNEL);
	if (!ps) {
		kfree(stats);
		return -ENOMEM;
	}

	cpu_latency_qos_add_request(&qos, 0); /* disable cstates */

	ps->i915 = i915;
	ps->nengines = nengines;

	/* one pinned context per engine, stored in engine iteration order */
	idx = 0;
	for_each_uabi_engine(engine, i915) {
		struct intel_context *ce;

		ce = intel_context_create(engine);
		if (IS_ERR(ce)) {
			err = PTR_ERR(ce);
			goto out;
		}

		err = intel_context_pin(ce);
		if (err) {
			intel_context_put(ce);
			goto out;
		}

		ps->ce[idx++] = ce;
	}
	GEM_BUG_ON(idx != ps->nengines);

	/* stop at the first workload that reports an error */
	for (fn = func; *fn && !err; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;

		/* %ps formats the function pointer as its symbol name */
		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		/* take the "before" sample for every engine */
		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p =
				memset(&stats[idx], 0, sizeof(stats[idx]));
			struct intel_context *ce = ps->ce[idx];

			p->engine = ps->ce[idx]->engine;
			intel_engine_pm_get(p->engine);

			/*
			 * busy is stored biased by +1 so that a non-zero value
			 * also marks "busy stats were sampled"; the bias is
			 * removed again when the delta is computed below.
			 */
			if (intel_engine_supports_stats(p->engine))
				p->busy = intel_engine_get_busy_time(p->engine,
								     &p->time) + 1;
			else
				p->time = ktime_get();
			/* start from the negated runtime; adding the final value later yields the delta */
			p->runtime = -intel_context_get_total_runtime_ns(ce);
		}

		err = (*fn)(ps);
		if (igt_live_test_end(&t))
			err = -EIO;

		/* take the "after" sample, compute the deltas and report them */
		for (idx = 0; idx < nengines; idx++) {
			struct perf_stats *p = &stats[idx];
			struct intel_context *ce = ps->ce[idx];
			int integer, decimal;
			u64 busy, dt, now;

			if (p->busy)
				p->busy = ktime_sub(intel_engine_get_busy_time(p->engine,
									       &now),
						    p->busy - 1);
			else
				now = ktime_get();
			p->time = ktime_sub(now, p->time);

			/* presumably waits for the engine to return to the kernel context, carrying err through — see switch_to_kernel_sync() */
			err = switch_to_kernel_sync(ce, err);
			p->runtime += intel_context_get_total_runtime_ns(ce);
			intel_engine_pm_put(p->engine);

			/* busy percentage as integer part plus two decimal digits, guarding against dt == 0 */
			busy = 100 * ktime_to_ns(p->busy);
			dt = ktime_to_ns(p->time);
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			pr_info("%s %5s: { seqno:%d, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, p->engine->name, ce->timeline->seqno,
				integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
		}
	}

out:
	/* release every context that was set up; the first empty slot ends the list */
	for (idx = 0; idx < nengines; idx++) {
		if (IS_ERR_OR_NULL(ps->ce[idx]))
			break;

		intel_context_unpin(ps->ce[idx]);
		intel_context_put(ps->ce[idx]);
	}
	kfree(ps);

	cpu_latency_qos_remove_request(&qos);
	kfree(stats);
	return err;
}
2765 
2766 static int p_sync0(void *arg)
2767 {
2768 	struct perf_stats *p = arg;
2769 	struct intel_engine_cs *engine = p->engine;
2770 	struct intel_context *ce;
2771 	IGT_TIMEOUT(end_time);
2772 	unsigned long count;
2773 	bool busy;
2774 	int err = 0;
2775 
2776 	ce = intel_context_create(engine);
2777 	if (IS_ERR(ce))
2778 		return PTR_ERR(ce);
2779 
2780 	err = intel_context_pin(ce);
2781 	if (err) {
2782 		intel_context_put(ce);
2783 		return err;
2784 	}
2785 
2786 	if (intel_engine_supports_stats(engine)) {
2787 		p->busy = intel_engine_get_busy_time(engine, &p->time);
2788 		busy = true;
2789 	} else {
2790 		p->time = ktime_get();
2791 		busy = false;
2792 	}
2793 
2794 	count = 0;
2795 	do {
2796 		struct i915_request *rq;
2797 
2798 		rq = i915_request_create(ce);
2799 		if (IS_ERR(rq)) {
2800 			err = PTR_ERR(rq);
2801 			break;
2802 		}
2803 
2804 		i915_request_get(rq);
2805 		i915_request_add(rq);
2806 
2807 		err = 0;
2808 		if (i915_request_wait(rq, 0, HZ / 5) < 0)
2809 			err = -ETIME;
2810 		i915_request_put(rq);
2811 		if (err)
2812 			break;
2813 
2814 		count++;
2815 	} while (!__igt_timeout(end_time, NULL));
2816 
2817 	if (busy) {
2818 		ktime_t now;
2819 
2820 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2821 				    p->busy);
2822 		p->time = ktime_sub(now, p->time);
2823 	} else {
2824 		p->time = ktime_sub(ktime_get(), p->time);
2825 	}
2826 
2827 	err = switch_to_kernel_sync(ce, err);
2828 	p->runtime = intel_context_get_total_runtime_ns(ce);
2829 	p->count = count;
2830 
2831 	intel_context_unpin(ce);
2832 	intel_context_put(ce);
2833 	return err;
2834 }
2835 
2836 static int p_sync1(void *arg)
2837 {
2838 	struct perf_stats *p = arg;
2839 	struct intel_engine_cs *engine = p->engine;
2840 	struct i915_request *prev = NULL;
2841 	struct intel_context *ce;
2842 	IGT_TIMEOUT(end_time);
2843 	unsigned long count;
2844 	bool busy;
2845 	int err = 0;
2846 
2847 	ce = intel_context_create(engine);
2848 	if (IS_ERR(ce))
2849 		return PTR_ERR(ce);
2850 
2851 	err = intel_context_pin(ce);
2852 	if (err) {
2853 		intel_context_put(ce);
2854 		return err;
2855 	}
2856 
2857 	if (intel_engine_supports_stats(engine)) {
2858 		p->busy = intel_engine_get_busy_time(engine, &p->time);
2859 		busy = true;
2860 	} else {
2861 		p->time = ktime_get();
2862 		busy = false;
2863 	}
2864 
2865 	count = 0;
2866 	do {
2867 		struct i915_request *rq;
2868 
2869 		rq = i915_request_create(ce);
2870 		if (IS_ERR(rq)) {
2871 			err = PTR_ERR(rq);
2872 			break;
2873 		}
2874 
2875 		i915_request_get(rq);
2876 		i915_request_add(rq);
2877 
2878 		err = 0;
2879 		if (prev && i915_request_wait(prev, 0, HZ / 5) < 0)
2880 			err = -ETIME;
2881 		i915_request_put(prev);
2882 		prev = rq;
2883 		if (err)
2884 			break;
2885 
2886 		count++;
2887 	} while (!__igt_timeout(end_time, NULL));
2888 	i915_request_put(prev);
2889 
2890 	if (busy) {
2891 		ktime_t now;
2892 
2893 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2894 				    p->busy);
2895 		p->time = ktime_sub(now, p->time);
2896 	} else {
2897 		p->time = ktime_sub(ktime_get(), p->time);
2898 	}
2899 
2900 	err = switch_to_kernel_sync(ce, err);
2901 	p->runtime = intel_context_get_total_runtime_ns(ce);
2902 	p->count = count;
2903 
2904 	intel_context_unpin(ce);
2905 	intel_context_put(ce);
2906 	return err;
2907 }
2908 
2909 static int p_many(void *arg)
2910 {
2911 	struct perf_stats *p = arg;
2912 	struct intel_engine_cs *engine = p->engine;
2913 	struct intel_context *ce;
2914 	IGT_TIMEOUT(end_time);
2915 	unsigned long count;
2916 	int err = 0;
2917 	bool busy;
2918 
2919 	ce = intel_context_create(engine);
2920 	if (IS_ERR(ce))
2921 		return PTR_ERR(ce);
2922 
2923 	err = intel_context_pin(ce);
2924 	if (err) {
2925 		intel_context_put(ce);
2926 		return err;
2927 	}
2928 
2929 	if (intel_engine_supports_stats(engine)) {
2930 		p->busy = intel_engine_get_busy_time(engine, &p->time);
2931 		busy = true;
2932 	} else {
2933 		p->time = ktime_get();
2934 		busy = false;
2935 	}
2936 
2937 	count = 0;
2938 	do {
2939 		struct i915_request *rq;
2940 
2941 		rq = i915_request_create(ce);
2942 		if (IS_ERR(rq)) {
2943 			err = PTR_ERR(rq);
2944 			break;
2945 		}
2946 
2947 		i915_request_add(rq);
2948 		count++;
2949 	} while (!__igt_timeout(end_time, NULL));
2950 
2951 	if (busy) {
2952 		ktime_t now;
2953 
2954 		p->busy = ktime_sub(intel_engine_get_busy_time(engine, &now),
2955 				    p->busy);
2956 		p->time = ktime_sub(now, p->time);
2957 	} else {
2958 		p->time = ktime_sub(ktime_get(), p->time);
2959 	}
2960 
2961 	err = switch_to_kernel_sync(ce, err);
2962 	p->runtime = intel_context_get_total_runtime_ns(ce);
2963 	p->count = count;
2964 
2965 	intel_context_unpin(ce);
2966 	intel_context_put(ce);
2967 	return err;
2968 }
2969 
/*
 * perf_parallel_engines - run the p_*() workloads on all engines at once
 * @arg: the struct drm_i915_private under test
 *
 * For each workload in func[] (p_sync0, p_sync1, p_many) one kthread per
 * uabi engine is started with its own struct perf_stats; the threads are
 * then stopped in turn and, if nothing failed, the per-engine results are
 * logged.
 *
 * Returns 0 on success or a negative error code (-ENOMEM on allocation
 * failure, -EIO if igt_live_test_end() reports a problem).
 */
static int perf_parallel_engines(void *arg)
{
	struct drm_i915_private *i915 = arg;
	/* NULL-terminated list of workloads, run in this order */
	static int (* const func[])(void *arg) = {
		p_sync0,
		p_sync1,
		p_many,
		NULL,
	};
	const unsigned int nengines = num_uabi_engines(i915);
	struct intel_engine_cs *engine;
	int (* const *fn)(void *arg);
	struct pm_qos_request qos;
	/* per-engine slot: the stats filled in by the thread, and the thread itself */
	struct {
		struct perf_stats p;
		struct task_struct *tsk;
	} *engines;
	int err = 0;

	engines = kcalloc(nengines, sizeof(*engines), GFP_KERNEL);
	if (!engines)
		return -ENOMEM;

	cpu_latency_qos_add_request(&qos, 0);

	for (fn = func; *fn; fn++) {
		char name[KSYM_NAME_LEN];
		struct igt_live_test t;
		unsigned int idx;

		/* %ps formats the function pointer as its symbol name */
		snprintf(name, sizeof(name), "%ps", *fn);
		err = igt_live_test_begin(&t, i915, __func__, name);
		if (err)
			break;

		/* NOTE(review): no reader of selftest.counter is visible in this part of the file — confirm it is still needed */
		atomic_set(&i915->selftest.counter, nengines);

		/* spawn one worker per engine; idx only advances for threads that started */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			intel_engine_pm_get(engine);

			memset(&engines[idx].p, 0, sizeof(engines[idx].p));
			engines[idx].p.engine = engine;

			engines[idx].tsk = kthread_run(*fn, &engines[idx].p,
						       "igt:%s", engine->name);
			if (IS_ERR(engines[idx].tsk)) {
				/* the ERR_PTR left in .tsk marks where the stop loop below must end */
				err = PTR_ERR(engines[idx].tsk);
				intel_engine_pm_put(engine);
				break;
			}
			/* hold a task reference until after kthread_stop() */
			get_task_struct(engines[idx++].tsk);
		}

		yield(); /* start all threads before we kthread_stop() */

		/* stop the workers in the same engine order and collect their status */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			int status;

			if (IS_ERR(engines[idx].tsk))
				break;

			/* keep the first error seen; later failures do not overwrite it */
			status = kthread_stop(engines[idx].tsk);
			if (status && !err)
				err = status;

			intel_engine_pm_put(engine);
			put_task_struct(engines[idx++].tsk);
		}

		if (igt_live_test_end(&t))
			err = -EIO;
		if (err)
			break;

		/* report only when every thread of this workload succeeded */
		idx = 0;
		for_each_uabi_engine(engine, i915) {
			struct perf_stats *p = &engines[idx].p;
			u64 busy = 100 * ktime_to_ns(p->busy);
			u64 dt = ktime_to_ns(p->time);
			int integer, decimal;

			/* busy percentage as integer part plus two decimal digits, guarding against dt == 0 */
			if (dt) {
				integer = div64_u64(busy, dt);
				busy -= integer * dt;
				decimal = div64_u64(100 * busy, dt);
			} else {
				integer = 0;
				decimal = 0;
			}

			GEM_BUG_ON(engine != p->engine);
			pr_info("%s %5s: { count:%lu, busy:%d.%02d%%, runtime:%lldms, walltime:%lldms }\n",
				name, engine->name, p->count, integer, decimal,
				div_u64(p->runtime, 1000 * 1000),
				div_u64(ktime_to_ns(p->time), 1000 * 1000));
			idx++;
		}
	}

	cpu_latency_qos_remove_request(&qos);
	kfree(engines);
	return err;
}
3075 
3076 int i915_request_perf_selftests(struct drm_i915_private *i915)
3077 {
3078 	static const struct i915_subtest tests[] = {
3079 		SUBTEST(perf_request_latency),
3080 		SUBTEST(perf_series_engines),
3081 		SUBTEST(perf_parallel_engines),
3082 	};
3083 
3084 	if (intel_gt_is_wedged(&i915->gt))
3085 		return 0;
3086 
3087 	return i915_subtests(tests, i915);
3088 }
3089