xref: /openbmc/linux/drivers/gpu/drm/i915/gt/selftest_engine_heartbeat.c (revision 03ab8e6297acd1bc0eedaa050e2a1635c576fd11)
1 // SPDX-License-Identifier: MIT
2 /*
3  * Copyright © 2018 Intel Corporation
4  */
5 
6 #include <linux/sort.h>
7 
8 #include "i915_drv.h"
9 
10 #include "intel_gt_requests.h"
11 #include "i915_selftest.h"
12 #include "selftest_engine_heartbeat.h"
13 
reset_heartbeat(struct intel_engine_cs * engine)14 static void reset_heartbeat(struct intel_engine_cs *engine)
15 {
16 	intel_engine_set_heartbeat(engine,
17 				   engine->defaults.heartbeat_interval_ms);
18 }
19 
timeline_sync(struct intel_timeline * tl)20 static int timeline_sync(struct intel_timeline *tl)
21 {
22 	struct dma_fence *fence;
23 	long timeout;
24 
25 	fence = i915_active_fence_get(&tl->last_request);
26 	if (!fence)
27 		return 0;
28 
29 	timeout = dma_fence_wait_timeout(fence, true, HZ / 2);
30 	dma_fence_put(fence);
31 	if (timeout < 0)
32 		return timeout;
33 
34 	return 0;
35 }
36 
engine_sync_barrier(struct intel_engine_cs * engine)37 static int engine_sync_barrier(struct intel_engine_cs *engine)
38 {
39 	return timeline_sync(engine->kernel_context->timeline);
40 }
41 
42 struct pulse {
43 	struct i915_active active;
44 	struct kref kref;
45 };
46 
pulse_active(struct i915_active * active)47 static int pulse_active(struct i915_active *active)
48 {
49 	kref_get(&container_of(active, struct pulse, active)->kref);
50 	return 0;
51 }
52 
pulse_free(struct kref * kref)53 static void pulse_free(struct kref *kref)
54 {
55 	struct pulse *p = container_of(kref, typeof(*p), kref);
56 
57 	i915_active_fini(&p->active);
58 	kfree(p);
59 }
60 
pulse_put(struct pulse * p)61 static void pulse_put(struct pulse *p)
62 {
63 	kref_put(&p->kref, pulse_free);
64 }
65 
pulse_retire(struct i915_active * active)66 static void pulse_retire(struct i915_active *active)
67 {
68 	pulse_put(container_of(active, struct pulse, active));
69 }
70 
pulse_create(void)71 static struct pulse *pulse_create(void)
72 {
73 	struct pulse *p;
74 
75 	p = kmalloc(sizeof(*p), GFP_KERNEL);
76 	if (!p)
77 		return p;
78 
79 	kref_init(&p->kref);
80 	i915_active_init(&p->active, pulse_active, pulse_retire, 0);
81 
82 	return p;
83 }
84 
pulse_unlock_wait(struct pulse * p)85 static void pulse_unlock_wait(struct pulse *p)
86 {
87 	i915_active_unlock_wait(&p->active);
88 }
89 
__live_idle_pulse(struct intel_engine_cs * engine,int (* fn)(struct intel_engine_cs * cs))90 static int __live_idle_pulse(struct intel_engine_cs *engine,
91 			     int (*fn)(struct intel_engine_cs *cs))
92 {
93 	struct pulse *p;
94 	int err;
95 
96 	GEM_BUG_ON(!intel_engine_pm_is_awake(engine));
97 
98 	p = pulse_create();
99 	if (!p)
100 		return -ENOMEM;
101 
102 	err = i915_active_acquire(&p->active);
103 	if (err)
104 		goto out;
105 
106 	err = i915_active_acquire_preallocate_barrier(&p->active, engine);
107 	if (err) {
108 		i915_active_release(&p->active);
109 		goto out;
110 	}
111 
112 	i915_active_acquire_barrier(&p->active);
113 	i915_active_release(&p->active);
114 
115 	GEM_BUG_ON(i915_active_is_idle(&p->active));
116 	GEM_BUG_ON(llist_empty(&engine->barrier_tasks));
117 
118 	err = fn(engine);
119 	if (err)
120 		goto out;
121 
122 	GEM_BUG_ON(!llist_empty(&engine->barrier_tasks));
123 
124 	if (engine_sync_barrier(engine)) {
125 		struct drm_printer m = drm_err_printer("pulse");
126 
127 		pr_err("%s: no heartbeat pulse?\n", engine->name);
128 		intel_engine_dump(engine, &m, "%s", engine->name);
129 
130 		err = -ETIME;
131 		goto out;
132 	}
133 
134 	GEM_BUG_ON(READ_ONCE(engine->serial) != engine->wakeref_serial);
135 
136 	pulse_unlock_wait(p); /* synchronize with the retirement callback */
137 
138 	if (!i915_active_is_idle(&p->active)) {
139 		struct drm_printer m = drm_err_printer("pulse");
140 
141 		pr_err("%s: heartbeat pulse did not flush idle tasks\n",
142 		       engine->name);
143 		i915_active_print(&p->active, &m);
144 
145 		err = -EINVAL;
146 		goto out;
147 	}
148 
149 out:
150 	pulse_put(p);
151 	return err;
152 }
153 
live_idle_flush(void * arg)154 static int live_idle_flush(void *arg)
155 {
156 	struct intel_gt *gt = arg;
157 	struct intel_engine_cs *engine;
158 	enum intel_engine_id id;
159 	int err = 0;
160 
161 	/* Check that we can flush the idle barriers */
162 
163 	for_each_engine(engine, gt, id) {
164 		st_engine_heartbeat_disable(engine);
165 		err = __live_idle_pulse(engine, intel_engine_flush_barriers);
166 		st_engine_heartbeat_enable(engine);
167 		if (err)
168 			break;
169 	}
170 
171 	return err;
172 }
173 
live_idle_pulse(void * arg)174 static int live_idle_pulse(void *arg)
175 {
176 	struct intel_gt *gt = arg;
177 	struct intel_engine_cs *engine;
178 	enum intel_engine_id id;
179 	int err = 0;
180 
181 	/* Check that heartbeat pulses flush the idle barriers */
182 
183 	for_each_engine(engine, gt, id) {
184 		st_engine_heartbeat_disable(engine);
185 		err = __live_idle_pulse(engine, intel_engine_pulse);
186 		st_engine_heartbeat_enable(engine);
187 		if (err && err != -ENODEV)
188 			break;
189 
190 		err = 0;
191 	}
192 
193 	return err;
194 }
195 
/*
 * sort() comparator for arrays of u32.
 *
 * Returns a negative value, zero or a positive value when the element at
 * @_a is respectively less than, equal to or greater than the one at @_b.
 *
 * The values are compared rather than subtracted: an unsigned difference
 * converted to int has the wrong sign once the operands are 2^31 or more
 * apart.
 */
static int cmp_u32(const void *_a, const void *_b)
{
	const u32 a = *(const u32 *)_a;
	const u32 b = *(const u32 *)_b;

	return (a > b) - (a < b);
}
202 
__live_heartbeat_fast(struct intel_engine_cs * engine)203 static int __live_heartbeat_fast(struct intel_engine_cs *engine)
204 {
205 	const unsigned int error_threshold = max(20000u, jiffies_to_usecs(6));
206 	struct intel_context *ce;
207 	struct i915_request *rq;
208 	ktime_t t0, t1;
209 	u32 times[5];
210 	int err;
211 	int i;
212 
213 	ce = intel_context_create(engine);
214 	if (IS_ERR(ce))
215 		return PTR_ERR(ce);
216 
217 	intel_engine_pm_get(engine);
218 
219 	err = intel_engine_set_heartbeat(engine, 1);
220 	if (err)
221 		goto err_pm;
222 
223 	for (i = 0; i < ARRAY_SIZE(times); i++) {
224 		do {
225 			/* Manufacture a tick */
226 			intel_engine_park_heartbeat(engine);
227 			GEM_BUG_ON(engine->heartbeat.systole);
228 			engine->serial++; /*  pretend we are not idle! */
229 			intel_engine_unpark_heartbeat(engine);
230 
231 			flush_delayed_work(&engine->heartbeat.work);
232 			if (!delayed_work_pending(&engine->heartbeat.work)) {
233 				pr_err("%s: heartbeat %d did not start\n",
234 				       engine->name, i);
235 				err = -EINVAL;
236 				goto err_pm;
237 			}
238 
239 			rcu_read_lock();
240 			rq = READ_ONCE(engine->heartbeat.systole);
241 			if (rq)
242 				rq = i915_request_get_rcu(rq);
243 			rcu_read_unlock();
244 		} while (!rq);
245 
246 		t0 = ktime_get();
247 		while (rq == READ_ONCE(engine->heartbeat.systole))
248 			yield(); /* work is on the local cpu! */
249 		t1 = ktime_get();
250 
251 		i915_request_put(rq);
252 		times[i] = ktime_us_delta(t1, t0);
253 	}
254 
255 	sort(times, ARRAY_SIZE(times), sizeof(times[0]), cmp_u32, NULL);
256 
257 	pr_info("%s: Heartbeat delay: %uus [%u, %u]\n",
258 		engine->name,
259 		times[ARRAY_SIZE(times) / 2],
260 		times[0],
261 		times[ARRAY_SIZE(times) - 1]);
262 
263 	/*
264 	 * Ideally, the upper bound on min work delay would be something like
265 	 * 2 * 2 (worst), +1 for scheduling, +1 for slack. In practice, we
266 	 * are, even with system_wq_highpri, at the mercy of the CPU scheduler
267 	 * and may be stuck behind some slow work for many millisecond. Such
268 	 * as our very own display workers.
269 	 */
270 	if (times[ARRAY_SIZE(times) / 2] > error_threshold) {
271 		pr_err("%s: Heartbeat delay was %uus, expected less than %dus\n",
272 		       engine->name,
273 		       times[ARRAY_SIZE(times) / 2],
274 		       error_threshold);
275 		err = -EINVAL;
276 	}
277 
278 	reset_heartbeat(engine);
279 err_pm:
280 	intel_engine_pm_put(engine);
281 	intel_context_put(ce);
282 	return err;
283 }
284 
live_heartbeat_fast(void * arg)285 static int live_heartbeat_fast(void *arg)
286 {
287 	struct intel_gt *gt = arg;
288 	struct intel_engine_cs *engine;
289 	enum intel_engine_id id;
290 	int err = 0;
291 
292 	/* Check that the heartbeat ticks at the desired rate. */
293 	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
294 		return 0;
295 
296 	for_each_engine(engine, gt, id) {
297 		err = __live_heartbeat_fast(engine);
298 		if (err)
299 			break;
300 	}
301 
302 	return err;
303 }
304 
__live_heartbeat_off(struct intel_engine_cs * engine)305 static int __live_heartbeat_off(struct intel_engine_cs *engine)
306 {
307 	int err;
308 
309 	intel_engine_pm_get(engine);
310 
311 	engine->serial++;
312 	flush_delayed_work(&engine->heartbeat.work);
313 	if (!delayed_work_pending(&engine->heartbeat.work)) {
314 		pr_err("%s: heartbeat not running\n",
315 		       engine->name);
316 		err = -EINVAL;
317 		goto err_pm;
318 	}
319 
320 	err = intel_engine_set_heartbeat(engine, 0);
321 	if (err)
322 		goto err_pm;
323 
324 	engine->serial++;
325 	flush_delayed_work(&engine->heartbeat.work);
326 	if (delayed_work_pending(&engine->heartbeat.work)) {
327 		pr_err("%s: heartbeat still running\n",
328 		       engine->name);
329 		err = -EINVAL;
330 		goto err_beat;
331 	}
332 
333 	if (READ_ONCE(engine->heartbeat.systole)) {
334 		pr_err("%s: heartbeat still allocated\n",
335 		       engine->name);
336 		err = -EINVAL;
337 		goto err_beat;
338 	}
339 
340 err_beat:
341 	reset_heartbeat(engine);
342 err_pm:
343 	intel_engine_pm_put(engine);
344 	return err;
345 }
346 
live_heartbeat_off(void * arg)347 static int live_heartbeat_off(void *arg)
348 {
349 	struct intel_gt *gt = arg;
350 	struct intel_engine_cs *engine;
351 	enum intel_engine_id id;
352 	int err = 0;
353 
354 	/* Check that we can turn off heartbeat and not interrupt VIP */
355 	if (!CONFIG_DRM_I915_HEARTBEAT_INTERVAL)
356 		return 0;
357 
358 	for_each_engine(engine, gt, id) {
359 		if (!intel_engine_has_preemption(engine))
360 			continue;
361 
362 		err = __live_heartbeat_off(engine);
363 		if (err)
364 			break;
365 	}
366 
367 	return err;
368 }
369 
intel_heartbeat_live_selftests(struct drm_i915_private * i915)370 int intel_heartbeat_live_selftests(struct drm_i915_private *i915)
371 {
372 	static const struct i915_subtest tests[] = {
373 		SUBTEST(live_idle_flush),
374 		SUBTEST(live_idle_pulse),
375 		SUBTEST(live_heartbeat_fast),
376 		SUBTEST(live_heartbeat_off),
377 	};
378 	int saved_hangcheck;
379 	int err;
380 
381 	if (intel_gt_is_wedged(to_gt(i915)))
382 		return 0;
383 
384 	saved_hangcheck = i915->params.enable_hangcheck;
385 	i915->params.enable_hangcheck = INT_MAX;
386 
387 	err = intel_gt_live_subtests(tests, to_gt(i915));
388 
389 	i915->params.enable_hangcheck = saved_hangcheck;
390 	return err;
391 }
392 
st_engine_heartbeat_disable(struct intel_engine_cs * engine)393 void st_engine_heartbeat_disable(struct intel_engine_cs *engine)
394 {
395 	engine->props.heartbeat_interval_ms = 0;
396 
397 	intel_engine_pm_get(engine);
398 	intel_engine_park_heartbeat(engine);
399 }
400 
st_engine_heartbeat_enable(struct intel_engine_cs * engine)401 void st_engine_heartbeat_enable(struct intel_engine_cs *engine)
402 {
403 	intel_engine_pm_put(engine);
404 
405 	engine->props.heartbeat_interval_ms =
406 		engine->defaults.heartbeat_interval_ms;
407 }
408 
st_engine_heartbeat_disable_no_pm(struct intel_engine_cs * engine)409 void st_engine_heartbeat_disable_no_pm(struct intel_engine_cs *engine)
410 {
411 	engine->props.heartbeat_interval_ms = 0;
412 
413 	/*
414 	 * Park the heartbeat but without holding the PM lock as that
415 	 * makes the engines appear not-idle. Note that if/when unpark
416 	 * is called due to the PM lock being acquired later the
417 	 * heartbeat still won't be enabled because of the above = 0.
418 	 */
419 	if (intel_engine_pm_get_if_awake(engine)) {
420 		intel_engine_park_heartbeat(engine);
421 		intel_engine_pm_put(engine);
422 	}
423 }
424 
st_engine_heartbeat_enable_no_pm(struct intel_engine_cs * engine)425 void st_engine_heartbeat_enable_no_pm(struct intel_engine_cs *engine)
426 {
427 	engine->props.heartbeat_interval_ms =
428 		engine->defaults.heartbeat_interval_ms;
429 }
430