1 /*
2  * Copyright 2015 Advanced Micro Devices, Inc.
3  *
4  * Permission is hereby granted, free of charge, to any person obtaining a
5  * copy of this software and associated documentation files (the "Software"),
6  * to deal in the Software without restriction, including without limitation
7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
8  * and/or sell copies of the Software, and to permit persons to whom the
9  * Software is furnished to do so, subject to the following conditions:
10  *
11  * The above copyright notice and this permission notice shall be included in
12  * all copies or substantial portions of the Software.
13  *
14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
20  * OTHER DEALINGS IN THE SOFTWARE.
21  *
22  */
23 
24 /**
25  * DOC: Overview
26  *
27  * The GPU scheduler provides entities which allow userspace to push jobs
28  * into software queues which are then scheduled on a hardware run queue.
 * The software queues have a priority among them. The scheduler selects
 * entities from the run queue using a FIFO. The scheduler provides
 * dependency handling among jobs. The driver is expected to provide
 * callback functions for backend operations, such as submitting a job to
 * the hardware run queue and returning the dependencies of a job.
34  *
35  * The organisation of the scheduler is the following:
36  *
37  * 1. Each hw run queue has one scheduler
38  * 2. Each scheduler has multiple run queues with different priorities
 *    (e.g., HIGH_HW, HIGH_SW, KERNEL, NORMAL)
40  * 3. Each scheduler run queue has a queue of entities to schedule
41  * 4. Entities themselves maintain a queue of jobs that will be scheduled on
42  *    the hardware.
43  *
 * The jobs in an entity are always scheduled in the order in which they were
 * pushed.
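 *
 * A typical driver-side flow looks roughly like the sketch below (an
 * illustration only; my_ops and my_job are driver-provided placeholders and
 * the entity-initialization parameters vary between kernel versions):
 *
 *    drm_sched_init(&sched, &my_ops, hw_submission, hang_limit, timeout, name);
 *    drm_sched_entity_init(...);    // bind the entity to one or more run queues
 *    drm_sched_job_init(&my_job->base, &entity, owner);
 *    drm_sched_entity_push_job(&my_job->base, &entity);
 *
 * drm_sched_entity_push_job() hands the job to the scheduler, which calls the
 * driver's run_job() callback once the job's dependencies are resolved and the
 * entity is selected from its run queue.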
45  */
46 
47 #include <linux/kthread.h>
48 #include <linux/wait.h>
49 #include <linux/sched.h>
50 #include <uapi/linux/sched/types.h>
51 #include <drm/drmP.h>
52 #include <drm/gpu_scheduler.h>
53 #include <drm/spsc_queue.h>
54 
55 #define CREATE_TRACE_POINTS
56 #include "gpu_scheduler_trace.h"
57 
58 #define to_drm_sched_job(sched_job)		\
59 		container_of((sched_job), struct drm_sched_job, queue_node)
60 
61 static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb);
62 
63 static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job);
64 
65 /**
66  * drm_sched_rq_init - initialize a given run queue struct
67  *
68  * @rq: scheduler run queue
69  *
70  * Initializes a scheduler runqueue.
71  */
72 static void drm_sched_rq_init(struct drm_gpu_scheduler *sched,
73 			      struct drm_sched_rq *rq)
74 {
75 	spin_lock_init(&rq->lock);
76 	INIT_LIST_HEAD(&rq->entities);
77 	rq->current_entity = NULL;
78 	rq->sched = sched;
79 }
80 
81 /**
82  * drm_sched_rq_add_entity - add an entity
83  *
84  * @rq: scheduler run queue
85  * @entity: scheduler entity
86  *
87  * Adds a scheduler entity to the run queue.
88  */
89 void drm_sched_rq_add_entity(struct drm_sched_rq *rq,
90 			     struct drm_sched_entity *entity)
91 {
92 	if (!list_empty(&entity->list))
93 		return;
94 	spin_lock(&rq->lock);
95 	list_add_tail(&entity->list, &rq->entities);
96 	spin_unlock(&rq->lock);
97 }
98 
99 /**
100  * drm_sched_rq_remove_entity - remove an entity
101  *
102  * @rq: scheduler run queue
103  * @entity: scheduler entity
104  *
105  * Removes a scheduler entity from the run queue.
106  */
107 void drm_sched_rq_remove_entity(struct drm_sched_rq *rq,
108 				struct drm_sched_entity *entity)
109 {
110 	if (list_empty(&entity->list))
111 		return;
112 	spin_lock(&rq->lock);
113 	list_del_init(&entity->list);
114 	if (rq->current_entity == entity)
115 		rq->current_entity = NULL;
116 	spin_unlock(&rq->lock);
117 }
118 
119 /**
120  * drm_sched_rq_select_entity - Select an entity which could provide a job to run
121  *
122  * @rq: scheduler run queue to check.
123  *
124  * Try to find a ready entity, returns NULL if none found.
125  */
126 static struct drm_sched_entity *
127 drm_sched_rq_select_entity(struct drm_sched_rq *rq)
128 {
129 	struct drm_sched_entity *entity;
130 
131 	spin_lock(&rq->lock);
132 
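	/* Resume the round-robin scan after the entity that was picked last time. */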
133 	entity = rq->current_entity;
134 	if (entity) {
135 		list_for_each_entry_continue(entity, &rq->entities, list) {
136 			if (drm_sched_entity_is_ready(entity)) {
137 				rq->current_entity = entity;
138 				spin_unlock(&rq->lock);
139 				return entity;
140 			}
141 		}
142 	}
143 
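	/*
	 * Nothing ready after current_entity, so wrap around and scan from the
	 * head of the list up to (and including) current_entity.
	 */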
144 	list_for_each_entry(entity, &rq->entities, list) {
145 
146 		if (drm_sched_entity_is_ready(entity)) {
147 			rq->current_entity = entity;
148 			spin_unlock(&rq->lock);
149 			return entity;
150 		}
151 
152 		if (entity == rq->current_entity)
153 			break;
154 	}
155 
156 	spin_unlock(&rq->lock);
157 
158 	return NULL;
159 }
160 
161 /**
162  * drm_sched_dependency_optimized
163  *
164  * @fence: the dependency fence
165  * @entity: the entity which depends on the above fence
166  *
167  * Returns true if the dependency can be optimized and false otherwise
168  */
bool drm_sched_dependency_optimized(struct dma_fence *fence,
170 				    struct drm_sched_entity *entity)
171 {
172 	struct drm_gpu_scheduler *sched = entity->rq->sched;
173 	struct drm_sched_fence *s_fence;
174 
175 	if (!fence || dma_fence_is_signaled(fence))
176 		return false;
177 	if (fence->context == entity->fence_context)
178 		return true;
179 	s_fence = to_drm_sched_fence(fence);
180 	if (s_fence && s_fence->sched == sched)
181 		return true;
182 
183 	return false;
184 }
185 EXPORT_SYMBOL(drm_sched_dependency_optimized);
186 
187 /**
188  * drm_sched_start_timeout - start timeout for reset worker
189  *
190  * @sched: scheduler instance to start the worker for
191  *
192  * Start the timeout for the given scheduler.
193  */
194 static void drm_sched_start_timeout(struct drm_gpu_scheduler *sched)
195 {
196 	if (sched->timeout != MAX_SCHEDULE_TIMEOUT &&
197 	    !list_empty(&sched->ring_mirror_list))
198 		schedule_delayed_work(&sched->work_tdr, sched->timeout);
199 }
200 
201 /**
202  * drm_sched_fault - immediately start timeout handler
203  *
204  * @sched: scheduler where the timeout handling should be started.
205  *
206  * Start timeout handling immediately when the driver detects a hardware fault.
207  */
208 void drm_sched_fault(struct drm_gpu_scheduler *sched)
209 {
210 	mod_delayed_work(system_wq, &sched->work_tdr, 0);
211 }
212 EXPORT_SYMBOL(drm_sched_fault);
213 
/*
 * drm_sched_job_finish is called via finish_work after the hardware fence
 * of the job has signaled.
 */
216 static void drm_sched_job_finish(struct work_struct *work)
217 {
218 	struct drm_sched_job *s_job = container_of(work, struct drm_sched_job,
219 						   finish_work);
220 	struct drm_gpu_scheduler *sched = s_job->sched;
221 
222 	/*
223 	 * Canceling the timeout without removing our job from the ring mirror
224 	 * list is safe, as we will only end up in this worker if our jobs
225 	 * finished fence has been signaled. So even if some another worker
226 	 * manages to find this job as the next job in the list, the fence
227 	 * signaled check below will prevent the timeout to be restarted.
228 	 */
229 	cancel_delayed_work_sync(&sched->work_tdr);
230 
231 	spin_lock(&sched->job_list_lock);
232 	/* remove job from ring_mirror_list */
233 	list_del_init(&s_job->node);
234 	/* queue TDR for next job */
235 	drm_sched_start_timeout(sched);
236 	spin_unlock(&sched->job_list_lock);
237 
238 	sched->ops->free_job(s_job);
239 }
240 
241 static void drm_sched_job_finish_cb(struct dma_fence *f,
242 				    struct dma_fence_cb *cb)
243 {
244 	struct drm_sched_job *job = container_of(cb, struct drm_sched_job,
245 						 finish_cb);
246 	schedule_work(&job->finish_work);
247 }
248 
249 static void drm_sched_job_begin(struct drm_sched_job *s_job)
250 {
251 	struct drm_gpu_scheduler *sched = s_job->sched;
252 
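	/*
	 * Once the finished fence signals, finish_work will free the job;
	 * adding the job to the mirror list also (re)arms the TDR timeout.
	 */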
253 	dma_fence_add_callback(&s_job->s_fence->finished, &s_job->finish_cb,
254 			       drm_sched_job_finish_cb);
255 
256 	spin_lock(&sched->job_list_lock);
257 	list_add_tail(&s_job->node, &sched->ring_mirror_list);
258 	drm_sched_start_timeout(sched);
259 	spin_unlock(&sched->job_list_lock);
260 }
261 
262 static void drm_sched_job_timedout(struct work_struct *work)
263 {
264 	struct drm_gpu_scheduler *sched;
265 	struct drm_sched_job *job;
266 
267 	sched = container_of(work, struct drm_gpu_scheduler, work_tdr.work);
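	/* The oldest job still on the mirror list is treated as the one that timed out. */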
268 	job = list_first_entry_or_null(&sched->ring_mirror_list,
269 				       struct drm_sched_job, node);
270 
271 	if (job)
272 		job->sched->ops->timedout_job(job);
273 
274 	spin_lock(&sched->job_list_lock);
275 	drm_sched_start_timeout(sched);
276 	spin_unlock(&sched->job_list_lock);
277 }
278 
279 /**
280  * drm_sched_hw_job_reset - stop the scheduler if it contains the bad job
281  *
282  * @sched: scheduler instance
283  * @bad: bad scheduler job
284  *
285  */
286 void drm_sched_hw_job_reset(struct drm_gpu_scheduler *sched, struct drm_sched_job *bad)
287 {
288 	struct drm_sched_job *s_job;
289 	struct drm_sched_entity *entity, *tmp;
290 	int i;
291 
292 	spin_lock(&sched->job_list_lock);
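	/*
	 * Detach every pending job from its hardware fence so the normal
	 * completion path cannot run while the reset is being handled.
	 */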
293 	list_for_each_entry_reverse(s_job, &sched->ring_mirror_list, node) {
294 		if (s_job->s_fence->parent &&
295 		    dma_fence_remove_callback(s_job->s_fence->parent,
296 					      &s_job->s_fence->cb)) {
297 			dma_fence_put(s_job->s_fence->parent);
298 			s_job->s_fence->parent = NULL;
299 			atomic_dec(&sched->hw_rq_count);
300 		}
301 	}
302 	spin_unlock(&sched->job_list_lock);
303 
304 	if (bad && bad->s_priority != DRM_SCHED_PRIORITY_KERNEL) {
305 		atomic_inc(&bad->karma);
		/* don't increase @bad's karma if it's from the KERNEL rq,
		 * because a GPU hang can also corrupt kernel jobs (like VM
		 * updates), but kernel jobs are always considered good.
		 */
		for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_KERNEL; i++) {
311 			struct drm_sched_rq *rq = &sched->sched_rq[i];
312 
313 			spin_lock(&rq->lock);
314 			list_for_each_entry_safe(entity, tmp, &rq->entities, list) {
315 				if (bad->s_fence->scheduled.context == entity->fence_context) {
					if (atomic_read(&bad->karma) > bad->sched->hang_limit)
317 						if (entity->guilty)
318 							atomic_set(entity->guilty, 1);
319 					break;
320 				}
321 			}
322 			spin_unlock(&rq->lock);
323 			if (&entity->list != &rq->entities)
324 				break;
325 		}
326 	}
327 }
328 EXPORT_SYMBOL(drm_sched_hw_job_reset);
329 
330 /**
331  * drm_sched_job_recovery - recover jobs after a reset
332  *
333  * @sched: scheduler instance
334  *
335  */
336 void drm_sched_job_recovery(struct drm_gpu_scheduler *sched)
337 {
338 	struct drm_sched_job *s_job, *tmp;
339 	bool found_guilty = false;
340 	int r;
341 
342 	spin_lock(&sched->job_list_lock);
343 	list_for_each_entry_safe(s_job, tmp, &sched->ring_mirror_list, node) {
344 		struct drm_sched_fence *s_fence = s_job->s_fence;
345 		struct dma_fence *fence;
346 		uint64_t guilty_context;
347 
348 		if (!found_guilty && atomic_read(&s_job->karma) > sched->hang_limit) {
349 			found_guilty = true;
350 			guilty_context = s_job->s_fence->scheduled.context;
351 		}
352 
353 		if (found_guilty && s_job->s_fence->scheduled.context == guilty_context)
354 			dma_fence_set_error(&s_fence->finished, -ECANCELED);
355 
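		/* Drop the list lock while calling back into the driver to resubmit the job. */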
356 		spin_unlock(&sched->job_list_lock);
357 		fence = sched->ops->run_job(s_job);
358 		atomic_inc(&sched->hw_rq_count);
359 
360 		if (fence) {
361 			s_fence->parent = dma_fence_get(fence);
362 			r = dma_fence_add_callback(fence, &s_fence->cb,
363 						   drm_sched_process_job);
364 			if (r == -ENOENT)
365 				drm_sched_process_job(fence, &s_fence->cb);
366 			else if (r)
367 				DRM_ERROR("fence add callback failed (%d)\n",
368 					  r);
369 			dma_fence_put(fence);
370 		} else {
371 			if (s_fence->finished.error < 0)
372 				drm_sched_expel_job_unlocked(s_job);
373 			drm_sched_process_job(NULL, &s_fence->cb);
374 		}
375 		spin_lock(&sched->job_list_lock);
376 	}
377 	drm_sched_start_timeout(sched);
378 	spin_unlock(&sched->job_list_lock);
379 }
380 EXPORT_SYMBOL(drm_sched_job_recovery);
381 
382 /**
383  * drm_sched_job_init - init a scheduler job
384  *
385  * @job: scheduler job to init
386  * @entity: scheduler entity to use
387  * @owner: job owner for debugging
388  *
389  * Refer to drm_sched_entity_push_job() documentation
390  * for locking considerations.
391  *
392  * Returns 0 for success, negative error code otherwise.
393  */
394 int drm_sched_job_init(struct drm_sched_job *job,
395 		       struct drm_sched_entity *entity,
396 		       void *owner)
397 {
398 	struct drm_gpu_scheduler *sched;
399 
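	/* Pick the run queue (and thus the scheduler) this entity will be scheduled on. */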
400 	drm_sched_entity_select_rq(entity);
401 	if (!entity->rq)
402 		return -ENOENT;
403 
404 	sched = entity->rq->sched;
405 
406 	job->sched = sched;
407 	job->entity = entity;
408 	job->s_priority = entity->rq - sched->sched_rq;
409 	job->s_fence = drm_sched_fence_create(entity, owner);
410 	if (!job->s_fence)
411 		return -ENOMEM;
412 	job->id = atomic64_inc_return(&sched->job_id_count);
413 
414 	INIT_WORK(&job->finish_work, drm_sched_job_finish);
415 	INIT_LIST_HEAD(&job->node);
416 
417 	return 0;
418 }
419 EXPORT_SYMBOL(drm_sched_job_init);
420 
421 /**
422  * drm_sched_job_cleanup - clean up scheduler job resources
423  *
424  * @job: scheduler job to clean up
425  */
426 void drm_sched_job_cleanup(struct drm_sched_job *job)
427 {
428 	dma_fence_put(&job->s_fence->finished);
429 	job->s_fence = NULL;
430 }
431 EXPORT_SYMBOL(drm_sched_job_cleanup);
432 
433 /**
434  * drm_sched_ready - is the scheduler ready
435  *
436  * @sched: scheduler instance
437  *
438  * Return true if we can push more jobs to the hw, otherwise false.
439  */
440 static bool drm_sched_ready(struct drm_gpu_scheduler *sched)
441 {
442 	return atomic_read(&sched->hw_rq_count) <
443 		sched->hw_submission_limit;
444 }
445 
446 /**
447  * drm_sched_wakeup - Wake up the scheduler when it is ready
448  *
449  * @sched: scheduler instance
450  *
451  */
452 void drm_sched_wakeup(struct drm_gpu_scheduler *sched)
453 {
454 	if (drm_sched_ready(sched))
455 		wake_up_interruptible(&sched->wake_up_worker);
456 }
457 
458 /**
459  * drm_sched_select_entity - Select next entity to process
460  *
461  * @sched: scheduler instance
462  *
463  * Returns the entity to process or NULL if none are found.
464  */
465 static struct drm_sched_entity *
466 drm_sched_select_entity(struct drm_gpu_scheduler *sched)
467 {
468 	struct drm_sched_entity *entity;
469 	int i;
470 
471 	if (!drm_sched_ready(sched))
472 		return NULL;
473 
	/* Kernel run queue has higher priority than normal run queue */
475 	for (i = DRM_SCHED_PRIORITY_MAX - 1; i >= DRM_SCHED_PRIORITY_MIN; i--) {
476 		entity = drm_sched_rq_select_entity(&sched->sched_rq[i]);
477 		if (entity)
478 			break;
479 	}
480 
481 	return entity;
482 }
483 
484 /**
485  * drm_sched_process_job - process a job
486  *
487  * @f: fence
 * @cb: fence callback
 *
 * Called after the job has finished execution.
491  */
492 static void drm_sched_process_job(struct dma_fence *f, struct dma_fence_cb *cb)
493 {
494 	struct drm_sched_fence *s_fence =
495 		container_of(cb, struct drm_sched_fence, cb);
496 	struct drm_gpu_scheduler *sched = s_fence->sched;
497 
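	/*
	 * Hold a reference on the finished fence: signaling it below can
	 * trigger the finish worker, which frees the job, before we are done
	 * with the fence here.
	 */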
498 	dma_fence_get(&s_fence->finished);
499 	atomic_dec(&sched->hw_rq_count);
500 	atomic_dec(&sched->num_jobs);
501 	drm_sched_fence_finished(s_fence);
502 
503 	trace_drm_sched_process_job(s_fence);
504 	dma_fence_put(&s_fence->finished);
505 	wake_up_interruptible(&sched->wake_up_worker);
506 }
507 
508 /**
509  * drm_sched_blocked - check if the scheduler is blocked
510  *
511  * @sched: scheduler instance
512  *
513  * Returns true if blocked, otherwise false.
514  */
515 static bool drm_sched_blocked(struct drm_gpu_scheduler *sched)
516 {
517 	if (kthread_should_park()) {
518 		kthread_parkme();
519 		return true;
520 	}
521 
522 	return false;
523 }
524 
525 /**
526  * drm_sched_main - main scheduler thread
527  *
528  * @param: scheduler instance
529  *
530  * Returns 0.
531  */
532 static int drm_sched_main(void *param)
533 {
534 	struct sched_param sparam = {.sched_priority = 1};
535 	struct drm_gpu_scheduler *sched = (struct drm_gpu_scheduler *)param;
536 	int r;
537 
538 	sched_setscheduler(current, SCHED_FIFO, &sparam);
539 
540 	while (!kthread_should_stop()) {
541 		struct drm_sched_entity *entity = NULL;
542 		struct drm_sched_fence *s_fence;
543 		struct drm_sched_job *sched_job;
544 		struct dma_fence *fence;
545 
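		/* Sleep until an entity with a ready job shows up or the thread is asked to stop. */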
546 		wait_event_interruptible(sched->wake_up_worker,
547 					 (!drm_sched_blocked(sched) &&
548 					  (entity = drm_sched_select_entity(sched))) ||
549 					 kthread_should_stop());
550 
551 		if (!entity)
552 			continue;
553 
554 		sched_job = drm_sched_entity_pop_job(entity);
555 		if (!sched_job)
556 			continue;
557 
558 		s_fence = sched_job->s_fence;
559 
560 		atomic_inc(&sched->hw_rq_count);
561 		drm_sched_job_begin(sched_job);
562 
563 		fence = sched->ops->run_job(sched_job);
564 		drm_sched_fence_scheduled(s_fence);
565 
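		/*
		 * Chain the scheduler to the hardware fence. -ENOENT means the
		 * fence has already signaled, so complete the job immediately;
		 * a NULL fence means there is nothing to wait on and the job
		 * is completed right away as well.
		 */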
566 		if (fence) {
567 			s_fence->parent = dma_fence_get(fence);
568 			r = dma_fence_add_callback(fence, &s_fence->cb,
569 						   drm_sched_process_job);
570 			if (r == -ENOENT)
571 				drm_sched_process_job(fence, &s_fence->cb);
572 			else if (r)
573 				DRM_ERROR("fence add callback failed (%d)\n",
574 					  r);
575 			dma_fence_put(fence);
576 		} else {
577 			if (s_fence->finished.error < 0)
578 				drm_sched_expel_job_unlocked(sched_job);
579 			drm_sched_process_job(NULL, &s_fence->cb);
580 		}
581 
582 		wake_up(&sched->job_scheduled);
583 	}
584 	return 0;
585 }
586 
587 static void drm_sched_expel_job_unlocked(struct drm_sched_job *s_job)
588 {
589 	struct drm_gpu_scheduler *sched = s_job->sched;
590 
591 	spin_lock(&sched->job_list_lock);
592 	list_del_init(&s_job->node);
593 	spin_unlock(&sched->job_list_lock);
594 }
595 
596 /**
597  * drm_sched_init - Init a gpu scheduler instance
598  *
599  * @sched: scheduler instance
600  * @ops: backend operations for this scheduler
601  * @hw_submission: number of hw submissions that can be in flight
602  * @hang_limit: number of times to allow a job to hang before dropping it
603  * @timeout: timeout value in jiffies for the scheduler
604  * @name: name used for debugging
605  *
606  * Return 0 on success, otherwise error code.
607  */
608 int drm_sched_init(struct drm_gpu_scheduler *sched,
609 		   const struct drm_sched_backend_ops *ops,
610 		   unsigned hw_submission,
611 		   unsigned hang_limit,
612 		   long timeout,
613 		   const char *name)
614 {
615 	int i;
616 	sched->ops = ops;
617 	sched->hw_submission_limit = hw_submission;
618 	sched->name = name;
619 	sched->timeout = timeout;
620 	sched->hang_limit = hang_limit;
621 	for (i = DRM_SCHED_PRIORITY_MIN; i < DRM_SCHED_PRIORITY_MAX; i++)
622 		drm_sched_rq_init(sched, &sched->sched_rq[i]);
623 
624 	init_waitqueue_head(&sched->wake_up_worker);
625 	init_waitqueue_head(&sched->job_scheduled);
626 	INIT_LIST_HEAD(&sched->ring_mirror_list);
627 	spin_lock_init(&sched->job_list_lock);
628 	atomic_set(&sched->hw_rq_count, 0);
629 	INIT_DELAYED_WORK(&sched->work_tdr, drm_sched_job_timedout);
630 	atomic_set(&sched->num_jobs, 0);
631 	atomic64_set(&sched->job_id_count, 0);
632 
	/* Each scheduler will run on a separate kernel thread */
634 	sched->thread = kthread_run(drm_sched_main, sched, sched->name);
635 	if (IS_ERR(sched->thread)) {
636 		DRM_ERROR("Failed to create scheduler for %s.\n", name);
637 		return PTR_ERR(sched->thread);
638 	}
639 
640 	sched->ready = true;
641 	return 0;
642 }
643 EXPORT_SYMBOL(drm_sched_init);
644 
645 /**
646  * drm_sched_fini - Destroy a gpu scheduler
647  *
648  * @sched: scheduler instance
649  *
650  * Tears down and cleans up the scheduler.
651  */
652 void drm_sched_fini(struct drm_gpu_scheduler *sched)
653 {
654 	if (sched->thread)
655 		kthread_stop(sched->thread);
656 
657 	sched->ready = false;
658 }
659 EXPORT_SYMBOL(drm_sched_fini);
660