xref: /openbmc/linux/drivers/gpu/drm/v3d/v3d_sched.c (revision 25b892b5)
1 // SPDX-License-Identifier: GPL-2.0+
2 /* Copyright (C) 2018 Broadcom */
3 
4 /**
5  * DOC: Broadcom V3D scheduling
6  *
7  * The shared DRM GPU scheduler is used to coordinate submitting jobs
8  * to the hardware.  Each DRM fd (roughly a client process) gets its
9  * own scheduler entity, which will process jobs in order.  The GPU
10  * scheduler will round-robin between clients to submit the next job.
11  *
12  * For simplicity, and in order to keep latency low for interactive
13  * jobs when bulk background jobs are queued up, we submit a new job
14  * to the HW only when it has completed the last one, instead of
15  * filling up the CT[01]Q FIFOs with jobs.  Similarly, we use
16  * v3d_job_dependency() to manage the dependency between bin and
17  * render, instead of having the clients submit jobs using the HW's
18  * semaphores to interlock between them.
19  */
20 
21 #include <linux/kthread.h>
22 
23 #include "v3d_drv.h"
24 #include "v3d_regs.h"
25 #include "v3d_trace.h"
26 
27 static struct v3d_job *
28 to_v3d_job(struct drm_sched_job *sched_job)
29 {
30 	return container_of(sched_job, struct v3d_job, base);
31 }
32 
33 static struct v3d_bin_job *
34 to_bin_job(struct drm_sched_job *sched_job)
35 {
36 	return container_of(sched_job, struct v3d_bin_job, base.base);
37 }
38 
39 static struct v3d_render_job *
40 to_render_job(struct drm_sched_job *sched_job)
41 {
42 	return container_of(sched_job, struct v3d_render_job, base.base);
43 }
44 
45 static struct v3d_tfu_job *
46 to_tfu_job(struct drm_sched_job *sched_job)
47 {
48 	return container_of(sched_job, struct v3d_tfu_job, base.base);
49 }
50 
51 static struct v3d_csd_job *
52 to_csd_job(struct drm_sched_job *sched_job)
53 {
54 	return container_of(sched_job, struct v3d_csd_job, base.base);
55 }
56 
/*
 * .free_job callback shared by every queue: cleans up the scheduler
 * job, then drops a reference on the v3d_job that embeds it.
 */
static void
v3d_job_free(struct drm_sched_job *sched_job)
{
	drm_sched_job_cleanup(sched_job);
	v3d_job_put(to_v3d_job(sched_job));
}
65 
66 static void
67 v3d_switch_perfmon(struct v3d_dev *v3d, struct v3d_job *job)
68 {
69 	if (job->perfmon != v3d->active_perfmon)
70 		v3d_perfmon_stop(v3d, v3d->active_perfmon, true);
71 
72 	if (job->perfmon && v3d->active_perfmon != job->perfmon)
73 		v3d_perfmon_start(v3d, job->perfmon);
74 }
75 
76 /*
77  * Returns the fences that the job depends on, one by one.
78  *
79  * If placed in the scheduler's .dependency method, the corresponding
80  * .run_job won't be called until all of them have been signaled.
81  */
82 static struct dma_fence *
83 v3d_job_dependency(struct drm_sched_job *sched_job,
84 		   struct drm_sched_entity *s_entity)
85 {
86 	struct v3d_job *job = to_v3d_job(sched_job);
87 
88 	/* XXX: Wait on a fence for switching the GMP if necessary,
89 	 * and then do so.
90 	 */
91 
92 	if (!xa_empty(&job->deps))
93 		return xa_erase(&job->deps, job->last_dep++);
94 
95 	return NULL;
96 }
97 
98 static struct dma_fence *v3d_bin_job_run(struct drm_sched_job *sched_job)
99 {
100 	struct v3d_bin_job *job = to_bin_job(sched_job);
101 	struct v3d_dev *v3d = job->base.v3d;
102 	struct drm_device *dev = &v3d->drm;
103 	struct dma_fence *fence;
104 	unsigned long irqflags;
105 
106 	if (unlikely(job->base.base.s_fence->finished.error))
107 		return NULL;
108 
109 	/* Lock required around bin_job update vs
110 	 * v3d_overflow_mem_work().
111 	 */
112 	spin_lock_irqsave(&v3d->job_lock, irqflags);
113 	v3d->bin_job = job;
114 	/* Clear out the overflow allocation, so we don't
115 	 * reuse the overflow attached to a previous job.
116 	 */
117 	V3D_CORE_WRITE(0, V3D_PTB_BPOS, 0);
118 	spin_unlock_irqrestore(&v3d->job_lock, irqflags);
119 
120 	v3d_invalidate_caches(v3d);
121 
122 	fence = v3d_fence_create(v3d, V3D_BIN);
123 	if (IS_ERR(fence))
124 		return NULL;
125 
126 	if (job->base.irq_fence)
127 		dma_fence_put(job->base.irq_fence);
128 	job->base.irq_fence = dma_fence_get(fence);
129 
130 	trace_v3d_submit_cl(dev, false, to_v3d_fence(fence)->seqno,
131 			    job->start, job->end);
132 
133 	v3d_switch_perfmon(v3d, &job->base);
134 
135 	/* Set the current and end address of the control list.
136 	 * Writing the end register is what starts the job.
137 	 */
138 	if (job->qma) {
139 		V3D_CORE_WRITE(0, V3D_CLE_CT0QMA, job->qma);
140 		V3D_CORE_WRITE(0, V3D_CLE_CT0QMS, job->qms);
141 	}
142 	if (job->qts) {
143 		V3D_CORE_WRITE(0, V3D_CLE_CT0QTS,
144 			       V3D_CLE_CT0QTS_ENABLE |
145 			       job->qts);
146 	}
147 	V3D_CORE_WRITE(0, V3D_CLE_CT0QBA, job->start);
148 	V3D_CORE_WRITE(0, V3D_CLE_CT0QEA, job->end);
149 
150 	return fence;
151 }
152 
153 static struct dma_fence *v3d_render_job_run(struct drm_sched_job *sched_job)
154 {
155 	struct v3d_render_job *job = to_render_job(sched_job);
156 	struct v3d_dev *v3d = job->base.v3d;
157 	struct drm_device *dev = &v3d->drm;
158 	struct dma_fence *fence;
159 
160 	if (unlikely(job->base.base.s_fence->finished.error))
161 		return NULL;
162 
163 	v3d->render_job = job;
164 
165 	/* Can we avoid this flush?  We need to be careful of
166 	 * scheduling, though -- imagine job0 rendering to texture and
167 	 * job1 reading, and them being executed as bin0, bin1,
168 	 * render0, render1, so that render1's flush at bin time
169 	 * wasn't enough.
170 	 */
171 	v3d_invalidate_caches(v3d);
172 
173 	fence = v3d_fence_create(v3d, V3D_RENDER);
174 	if (IS_ERR(fence))
175 		return NULL;
176 
177 	if (job->base.irq_fence)
178 		dma_fence_put(job->base.irq_fence);
179 	job->base.irq_fence = dma_fence_get(fence);
180 
181 	trace_v3d_submit_cl(dev, true, to_v3d_fence(fence)->seqno,
182 			    job->start, job->end);
183 
184 	v3d_switch_perfmon(v3d, &job->base);
185 
186 	/* XXX: Set the QCFG */
187 
188 	/* Set the current and end address of the control list.
189 	 * Writing the end register is what starts the job.
190 	 */
191 	V3D_CORE_WRITE(0, V3D_CLE_CT1QBA, job->start);
192 	V3D_CORE_WRITE(0, V3D_CLE_CT1QEA, job->end);
193 
194 	return fence;
195 }
196 
197 static struct dma_fence *
198 v3d_tfu_job_run(struct drm_sched_job *sched_job)
199 {
200 	struct v3d_tfu_job *job = to_tfu_job(sched_job);
201 	struct v3d_dev *v3d = job->base.v3d;
202 	struct drm_device *dev = &v3d->drm;
203 	struct dma_fence *fence;
204 
205 	fence = v3d_fence_create(v3d, V3D_TFU);
206 	if (IS_ERR(fence))
207 		return NULL;
208 
209 	v3d->tfu_job = job;
210 	if (job->base.irq_fence)
211 		dma_fence_put(job->base.irq_fence);
212 	job->base.irq_fence = dma_fence_get(fence);
213 
214 	trace_v3d_submit_tfu(dev, to_v3d_fence(fence)->seqno);
215 
216 	V3D_WRITE(V3D_TFU_IIA, job->args.iia);
217 	V3D_WRITE(V3D_TFU_IIS, job->args.iis);
218 	V3D_WRITE(V3D_TFU_ICA, job->args.ica);
219 	V3D_WRITE(V3D_TFU_IUA, job->args.iua);
220 	V3D_WRITE(V3D_TFU_IOA, job->args.ioa);
221 	V3D_WRITE(V3D_TFU_IOS, job->args.ios);
222 	V3D_WRITE(V3D_TFU_COEF0, job->args.coef[0]);
223 	if (job->args.coef[0] & V3D_TFU_COEF0_USECOEF) {
224 		V3D_WRITE(V3D_TFU_COEF1, job->args.coef[1]);
225 		V3D_WRITE(V3D_TFU_COEF2, job->args.coef[2]);
226 		V3D_WRITE(V3D_TFU_COEF3, job->args.coef[3]);
227 	}
228 	/* ICFG kicks off the job. */
229 	V3D_WRITE(V3D_TFU_ICFG, job->args.icfg | V3D_TFU_ICFG_IOC);
230 
231 	return fence;
232 }
233 
234 static struct dma_fence *
235 v3d_csd_job_run(struct drm_sched_job *sched_job)
236 {
237 	struct v3d_csd_job *job = to_csd_job(sched_job);
238 	struct v3d_dev *v3d = job->base.v3d;
239 	struct drm_device *dev = &v3d->drm;
240 	struct dma_fence *fence;
241 	int i;
242 
243 	v3d->csd_job = job;
244 
245 	v3d_invalidate_caches(v3d);
246 
247 	fence = v3d_fence_create(v3d, V3D_CSD);
248 	if (IS_ERR(fence))
249 		return NULL;
250 
251 	if (job->base.irq_fence)
252 		dma_fence_put(job->base.irq_fence);
253 	job->base.irq_fence = dma_fence_get(fence);
254 
255 	trace_v3d_submit_csd(dev, to_v3d_fence(fence)->seqno);
256 
257 	v3d_switch_perfmon(v3d, &job->base);
258 
259 	for (i = 1; i <= 6; i++)
260 		V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0 + 4 * i, job->args.cfg[i]);
261 	/* CFG0 write kicks off the job. */
262 	V3D_CORE_WRITE(0, V3D_CSD_QUEUED_CFG0, job->args.cfg[0]);
263 
264 	return fence;
265 }
266 
267 static struct dma_fence *
268 v3d_cache_clean_job_run(struct drm_sched_job *sched_job)
269 {
270 	struct v3d_job *job = to_v3d_job(sched_job);
271 	struct v3d_dev *v3d = job->v3d;
272 
273 	v3d_clean_caches(v3d);
274 
275 	return NULL;
276 }
277 
278 static enum drm_gpu_sched_stat
279 v3d_gpu_reset_for_timeout(struct v3d_dev *v3d, struct drm_sched_job *sched_job)
280 {
281 	enum v3d_queue q;
282 
283 	mutex_lock(&v3d->reset_lock);
284 
285 	/* block scheduler */
286 	for (q = 0; q < V3D_MAX_QUEUES; q++)
287 		drm_sched_stop(&v3d->queue[q].sched, sched_job);
288 
289 	if (sched_job)
290 		drm_sched_increase_karma(sched_job);
291 
292 	/* get the GPU back into the init state */
293 	v3d_reset(v3d);
294 
295 	for (q = 0; q < V3D_MAX_QUEUES; q++)
296 		drm_sched_resubmit_jobs(&v3d->queue[q].sched);
297 
298 	/* Unblock schedulers and restart their jobs. */
299 	for (q = 0; q < V3D_MAX_QUEUES; q++) {
300 		drm_sched_start(&v3d->queue[q].sched, true);
301 	}
302 
303 	mutex_unlock(&v3d->reset_lock);
304 
305 	return DRM_GPU_SCHED_STAT_NOMINAL;
306 }
307 
308 /* If the current address or return address have changed, then the GPU
309  * has probably made progress and we should delay the reset.  This
310  * could fail if the GPU got in an infinite loop in the CL, but that
311  * is pretty unlikely outside of an i-g-t testcase.
312  */
313 static enum drm_gpu_sched_stat
314 v3d_cl_job_timedout(struct drm_sched_job *sched_job, enum v3d_queue q,
315 		    u32 *timedout_ctca, u32 *timedout_ctra)
316 {
317 	struct v3d_job *job = to_v3d_job(sched_job);
318 	struct v3d_dev *v3d = job->v3d;
319 	u32 ctca = V3D_CORE_READ(0, V3D_CLE_CTNCA(q));
320 	u32 ctra = V3D_CORE_READ(0, V3D_CLE_CTNRA(q));
321 
322 	if (*timedout_ctca != ctca || *timedout_ctra != ctra) {
323 		*timedout_ctca = ctca;
324 		*timedout_ctra = ctra;
325 		return DRM_GPU_SCHED_STAT_NOMINAL;
326 	}
327 
328 	return v3d_gpu_reset_for_timeout(v3d, sched_job);
329 }
330 
331 static enum drm_gpu_sched_stat
332 v3d_bin_job_timedout(struct drm_sched_job *sched_job)
333 {
334 	struct v3d_bin_job *job = to_bin_job(sched_job);
335 
336 	return v3d_cl_job_timedout(sched_job, V3D_BIN,
337 				   &job->timedout_ctca, &job->timedout_ctra);
338 }
339 
340 static enum drm_gpu_sched_stat
341 v3d_render_job_timedout(struct drm_sched_job *sched_job)
342 {
343 	struct v3d_render_job *job = to_render_job(sched_job);
344 
345 	return v3d_cl_job_timedout(sched_job, V3D_RENDER,
346 				   &job->timedout_ctca, &job->timedout_ctra);
347 }
348 
349 static enum drm_gpu_sched_stat
350 v3d_generic_job_timedout(struct drm_sched_job *sched_job)
351 {
352 	struct v3d_job *job = to_v3d_job(sched_job);
353 
354 	return v3d_gpu_reset_for_timeout(job->v3d, sched_job);
355 }
356 
357 static enum drm_gpu_sched_stat
358 v3d_csd_job_timedout(struct drm_sched_job *sched_job)
359 {
360 	struct v3d_csd_job *job = to_csd_job(sched_job);
361 	struct v3d_dev *v3d = job->base.v3d;
362 	u32 batches = V3D_CORE_READ(0, V3D_CSD_CURRENT_CFG4);
363 
364 	/* If we've made progress, skip reset and let the timer get
365 	 * rearmed.
366 	 */
367 	if (job->timedout_batches != batches) {
368 		job->timedout_batches = batches;
369 		return DRM_GPU_SCHED_STAT_NOMINAL;
370 	}
371 
372 	return v3d_gpu_reset_for_timeout(v3d, sched_job);
373 }
374 
375 static const struct drm_sched_backend_ops v3d_bin_sched_ops = {
376 	.dependency = v3d_job_dependency,
377 	.run_job = v3d_bin_job_run,
378 	.timedout_job = v3d_bin_job_timedout,
379 	.free_job = v3d_job_free,
380 };
381 
382 static const struct drm_sched_backend_ops v3d_render_sched_ops = {
383 	.dependency = v3d_job_dependency,
384 	.run_job = v3d_render_job_run,
385 	.timedout_job = v3d_render_job_timedout,
386 	.free_job = v3d_job_free,
387 };
388 
389 static const struct drm_sched_backend_ops v3d_tfu_sched_ops = {
390 	.dependency = v3d_job_dependency,
391 	.run_job = v3d_tfu_job_run,
392 	.timedout_job = v3d_generic_job_timedout,
393 	.free_job = v3d_job_free,
394 };
395 
396 static const struct drm_sched_backend_ops v3d_csd_sched_ops = {
397 	.dependency = v3d_job_dependency,
398 	.run_job = v3d_csd_job_run,
399 	.timedout_job = v3d_csd_job_timedout,
400 	.free_job = v3d_job_free
401 };
402 
403 static const struct drm_sched_backend_ops v3d_cache_clean_sched_ops = {
404 	.dependency = v3d_job_dependency,
405 	.run_job = v3d_cache_clean_job_run,
406 	.timedout_job = v3d_generic_job_timedout,
407 	.free_job = v3d_job_free
408 };
409 
410 int
411 v3d_sched_init(struct v3d_dev *v3d)
412 {
413 	int hw_jobs_limit = 1;
414 	int job_hang_limit = 0;
415 	int hang_limit_ms = 500;
416 	int ret;
417 
418 	ret = drm_sched_init(&v3d->queue[V3D_BIN].sched,
419 			     &v3d_bin_sched_ops,
420 			     hw_jobs_limit, job_hang_limit,
421 			     msecs_to_jiffies(hang_limit_ms), NULL,
422 			     NULL, "v3d_bin");
423 	if (ret) {
424 		dev_err(v3d->drm.dev, "Failed to create bin scheduler: %d.", ret);
425 		return ret;
426 	}
427 
428 	ret = drm_sched_init(&v3d->queue[V3D_RENDER].sched,
429 			     &v3d_render_sched_ops,
430 			     hw_jobs_limit, job_hang_limit,
431 			     msecs_to_jiffies(hang_limit_ms), NULL,
432 			     NULL, "v3d_render");
433 	if (ret) {
434 		dev_err(v3d->drm.dev, "Failed to create render scheduler: %d.",
435 			ret);
436 		v3d_sched_fini(v3d);
437 		return ret;
438 	}
439 
440 	ret = drm_sched_init(&v3d->queue[V3D_TFU].sched,
441 			     &v3d_tfu_sched_ops,
442 			     hw_jobs_limit, job_hang_limit,
443 			     msecs_to_jiffies(hang_limit_ms), NULL,
444 			     NULL, "v3d_tfu");
445 	if (ret) {
446 		dev_err(v3d->drm.dev, "Failed to create TFU scheduler: %d.",
447 			ret);
448 		v3d_sched_fini(v3d);
449 		return ret;
450 	}
451 
452 	if (v3d_has_csd(v3d)) {
453 		ret = drm_sched_init(&v3d->queue[V3D_CSD].sched,
454 				     &v3d_csd_sched_ops,
455 				     hw_jobs_limit, job_hang_limit,
456 				     msecs_to_jiffies(hang_limit_ms), NULL,
457 				     NULL, "v3d_csd");
458 		if (ret) {
459 			dev_err(v3d->drm.dev, "Failed to create CSD scheduler: %d.",
460 				ret);
461 			v3d_sched_fini(v3d);
462 			return ret;
463 		}
464 
465 		ret = drm_sched_init(&v3d->queue[V3D_CACHE_CLEAN].sched,
466 				     &v3d_cache_clean_sched_ops,
467 				     hw_jobs_limit, job_hang_limit,
468 				     msecs_to_jiffies(hang_limit_ms), NULL,
469 				     NULL, "v3d_cache_clean");
470 		if (ret) {
471 			dev_err(v3d->drm.dev, "Failed to create CACHE_CLEAN scheduler: %d.",
472 				ret);
473 			v3d_sched_fini(v3d);
474 			return ret;
475 		}
476 	}
477 
478 	return 0;
479 }
480 
481 void
482 v3d_sched_fini(struct v3d_dev *v3d)
483 {
484 	enum v3d_queue q;
485 
486 	for (q = 0; q < V3D_MAX_QUEUES; q++) {
487 		if (v3d->queue[q].sched.ready)
488 			drm_sched_fini(&v3d->queue[q].sched);
489 	}
490 }
491