xref: /openbmc/linux/block/blk-mq-sched.c (revision 1c2dd16a)
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

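/*
 * Free the scheduler private data of every hardware context of @q, calling
 * @exit first for each hctx that still has data attached.
 */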
void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (exit && hctx->sched_data)
			exit(hctx);
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);

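/*
 * Look up (or create) the io_cq for @ioc on @q and attach it to @rq. If the
 * elevator accepts the request private data, mark the request RQF_ELVPRIV
 * and hold a reference on the io_context; otherwise clear rq->elv.icq again.
 */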
static void __blk_mq_sched_assign_ioc(struct request_queue *q,
				      struct request *rq,
				      struct bio *bio,
				      struct io_context *ioc)
{
	struct io_cq *icq;

	spin_lock_irq(q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}

	rq->elv.icq = icq;
	if (!blk_mq_sched_get_rq_priv(q, rq, bio)) {
		rq->rq_flags |= RQF_ELVPRIV;
		get_io_context(icq->ioc);
		return;
	}

	rq->elv.icq = NULL;
}

static void blk_mq_sched_assign_ioc(struct request_queue *q,
				    struct request *rq, struct bio *bio)
{
	struct io_context *ioc;

	ioc = rq_ioc(bio);
	if (ioc)
		__blk_mq_sched_assign_ioc(q, rq, bio, ioc);
}

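/*
 * Allocate a request for @bio on @q. When an elevator is attached and the
 * tag is not reserved, the request comes from the scheduler (internal) tags;
 * flush requests skip the elevator's get_request hook since they go directly
 * to the dispatch list.
 */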
struct request *blk_mq_sched_get_request(struct request_queue *q,
					 struct bio *bio,
					 unsigned int op,
					 struct blk_mq_alloc_data *data)
{
	struct elevator_queue *e = q->elevator;
	struct request *rq;

	blk_queue_enter_live(q);
	data->q = q;
	if (likely(!data->ctx))
		data->ctx = blk_mq_get_ctx(q);
	if (likely(!data->hctx))
		data->hctx = blk_mq_map_queue(q, data->ctx->cpu);

	/*
	 * For a reserved tag, allocate a normal request since we might
	 * have driver dependencies on the value of the internal tag.
	 */
	if (e && !(data->flags & BLK_MQ_REQ_RESERVED)) {
		data->flags |= BLK_MQ_REQ_INTERNAL;

		/*
		 * Flush requests are special and go directly to the
		 * dispatch list.
		 */
		if (!op_is_flush(op) && e->type->ops.mq.get_request) {
			rq = e->type->ops.mq.get_request(q, op, data);
			if (rq)
				rq->rq_flags |= RQF_QUEUED;
		} else
			rq = __blk_mq_alloc_request(data, op);
	} else {
		rq = __blk_mq_alloc_request(data, op);
	}

	if (rq) {
		if (!op_is_flush(op)) {
			rq->elv.icq = NULL;
			if (e && e->type->icq_cache)
				blk_mq_sched_assign_ioc(q, rq, bio);
		}
		data->hctx->queued++;
		return rq;
	}

	blk_queue_exit(q);
	return NULL;
}

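/*
 * Release a request allocated by blk_mq_sched_get_request(): drop the
 * elevator private data and io_context reference, then hand the request
 * back to the scheduler's put_request hook or to the core free path.
 */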
void blk_mq_sched_put_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (rq->rq_flags & RQF_ELVPRIV) {
		blk_mq_sched_put_rq_priv(rq->q, rq);
		if (rq->elv.icq) {
			put_io_context(rq->elv.icq->ioc);
			rq->elv.icq = NULL;
		}
	}

	if ((rq->rq_flags & RQF_QUEUED) && e && e->type->ops.mq.put_request)
		e->type->ops.mq.put_request(rq);
	else
		blk_mq_finish_request(rq);
}

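/*
 * Dispatch requests to the driver for @hctx: leftover entries on
 * hctx->dispatch are sent first; otherwise requests come either from the
 * software queues (no scheduler dispatch hook) or from the elevator, pulled
 * one at a time for as long as the driver keeps accepting them.
 */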
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
	bool did_work = false;
	LIST_HEAD(rq_list);

	if (unlikely(blk_mq_hctx_stopped(hctx)))
		return;

	hctx->run++;

	/*
	 * If we have previous entries on our dispatch list, grab them first for
	 * more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		did_work = blk_mq_dispatch_rq_list(q, &rq_list);
	} else if (!has_sched_dispatch) {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list);
	}

	/*
	 * We want to dispatch from the scheduler if we had no work left
	 * on the dispatch list, OR if we did have work but weren't able
	 * to make progress.
	 */
	if (!did_work && has_sched_dispatch) {
		do {
			struct request *rq;

			rq = e->type->ops.mq.dispatch_request(hctx);
			if (!rq)
				break;
			list_add(&rq->queuelist, &rq_list);
		} while (blk_mq_dispatch_rq_list(q, &rq_list));
	}
}

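/*
 * Try to merge @bio into an existing request using the elevator's merge
 * lookup. After a successful bio merge, also try to merge the request with
 * its neighbour; if that succeeds, the request that became redundant is
 * returned through @merged_request so the caller can free it.
 */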
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
			    struct request **merged_request)
{
	struct request *rq;

	switch (elv_merge(q, &rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_back_merge(q, rq, bio))
			return false;
		*merged_request = attempt_back_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
		return true;
	case ELEVATOR_FRONT_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_front_merge(q, rq, bio))
			return false;
		*merged_request = attempt_front_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
		return true;
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

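/*
 * Give the elevator a chance to merge @bio into one of the requests it is
 * holding. Returns true if the bio was merged and no new request is needed.
 */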
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.mq.bio_merge) {
		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
		struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

		blk_mq_put_ctx(ctx);
		return e->type->ops.mq.bio_merge(hctx, bio);
	}

	return false;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

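/*
 * Requests that already carry a driver tag (in practice flush requests)
 * bypass the scheduler and go straight to hctx->dispatch. Returns true if
 * the request was queued for dispatch, false if it should be handed to the
 * elevator (in which case it is marked RQF_SORTED).
 */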
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
				       struct request *rq)
{
	if (rq->tag == -1) {
		rq->rq_flags |= RQF_SORTED;
		return false;
	}

	/*
	 * If we already have a real request tag, send directly to
	 * the dispatch list.
	 */
	spin_lock(&hctx->lock);
	list_add(&rq->queuelist, &hctx->dispatch);
	spin_unlock(&hctx->lock);
	return true;
}

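/*
 * If @hctx was marked as needing a restart, clear the flag and re-run the
 * queue when it still has pending work. Returns true if a run was started.
 */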
static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state)) {
		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
		if (blk_mq_hctx_has_pending(hctx)) {
			blk_mq_run_hw_queue(hctx, true);
			return true;
		}
	}
	return false;
}

/**
 * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 * @pos:    loop cursor.
 * @skip:   the list element that will not be examined. Iteration starts at
 *          @skip->next.
 * @head:   head of the list to examine. This list must have at least one
 *          element, namely @skip.
 * @member: name of the list_head structure within typeof(*pos).
 */
#define list_for_each_entry_rcu_rr(pos, skip, head, member)		\
	for ((pos) = (skip);						\
	     (pos = (pos)->member.next != (head) ? list_entry_rcu(	\
			(pos)->member.next, typeof(*pos), member) :	\
	      list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
	     (pos) != (skip); )

/*
 * Called after a driver tag has been freed to check whether a hctx needs to
 * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
 * queues in a round-robin fashion if the tag set of @hctx is shared with other
 * hardware queues.
 */
void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
{
	struct blk_mq_tags *const tags = hctx->tags;
	struct blk_mq_tag_set *const set = hctx->queue->tag_set;
	struct request_queue *const queue = hctx->queue, *q;
	struct blk_mq_hw_ctx *hctx2;
	unsigned int i, j;

	if (set->flags & BLK_MQ_F_TAG_SHARED) {
		rcu_read_lock();
		list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
					   tag_set_list) {
			queue_for_each_hw_ctx(q, hctx2, i)
				if (hctx2->tags == tags &&
				    blk_mq_sched_restart_hctx(hctx2))
					goto done;
		}
		j = hctx->queue_num + 1;
		for (i = 0; i < queue->nr_hw_queues; i++, j++) {
			if (j == queue->nr_hw_queues)
				j = 0;
			hctx2 = queue->queue_hw_ctx[j];
			if (hctx2->tags == tags &&
			    blk_mq_sched_restart_hctx(hctx2))
				break;
		}
done:
		rcu_read_unlock();
	} else {
		blk_mq_sched_restart_hctx(hctx);
	}
}

/*
 * Add flush/fua to the queue. If we fail getting a driver tag, then
 * punt to the requeue list. Requeue will re-invoke us from a context
 * that's safe to block from.
 */
static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
				      struct request *rq, bool can_block)
{
	if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
		blk_insert_flush(rq);
		blk_mq_run_hw_queue(hctx, true);
	} else
		blk_mq_add_to_requeue_list(rq, false, true);
}

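/*
 * Insert a single request: untagged flush requests take the dedicated flush
 * insert path, requests that already own a driver tag bypass the scheduler,
 * and everything else goes to the elevator's insert hook or to the software
 * queue when no elevator is attached.
 */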
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async, bool can_block)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

	if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
		blk_mq_sched_insert_flush(hctx, rq, can_block);
		return;
	}

	if (e && blk_mq_sched_bypass_insert(hctx, rq))
		goto run;

	if (e && e->type->ops.mq.insert_requests) {
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.mq.insert_requests(hctx, &list, at_head);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

run:
	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

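/*
 * Insert a list of requests for one software queue. With an elevator in use,
 * requests that already hold a driver tag (only flushes are expected here)
 * are pulled off the list and sent straight to the dispatch list; the rest
 * go to the elevator's insert hook or to the software queue.
 */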
void blk_mq_sched_insert_requests(struct request_queue *q,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	struct elevator_queue *e = hctx->queue->elevator;

	if (e) {
		struct request *rq, *next;

		/*
		 * We bypass requests that already have a driver tag assigned,
		 * which should only be flushes. Flushes are only ever inserted
		 * as single requests, so we shouldn't ever hit the
		 * WARN_ON_ONCE() below (but let's handle it just in case).
		 */
		list_for_each_entry_safe(rq, next, list, queuelist) {
			if (WARN_ON_ONCE(rq->tag != -1)) {
				list_del_init(&rq->queuelist);
				blk_mq_sched_bypass_insert(hctx, rq);
			}
		}
	}

	if (e && e->type->ops.mq.insert_requests)
		e->type->ops.mq.insert_requests(hctx, list, false);
	else
		blk_mq_insert_requests(hctx, ctx, list);

	blk_mq_run_hw_queue(hctx, run_queue_async);
}

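/*
 * Helpers for allocating and freeing the per-hctx scheduler tag map and its
 * requests, sized by q->nr_requests with the tag set's reserved tags
 * carried over.
 */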
static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	if (hctx->sched_tags) {
		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
		blk_mq_free_rq_map(hctx->sched_tags);
		hctx->sched_tags = NULL;
	}
}

static int blk_mq_sched_alloc_tags(struct request_queue *q,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	int ret;

	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
					       set->reserved_tags);
	if (!hctx->sched_tags)
		return -ENOMEM;

	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
	if (ret)
		blk_mq_sched_free_tags(set, hctx, hctx_idx);

	return ret;
}

static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
}

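/*
 * Per-hctx scheduler setup and teardown: allocate or free the scheduler tags
 * and run the elevator's init_hctx/exit_hctx hooks for a single hardware
 * queue. Both are no-ops when no elevator is attached.
 */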
int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			   unsigned int hctx_idx)
{
	struct elevator_queue *e = q->elevator;
	int ret;

	if (!e)
		return 0;

	ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
	if (ret)
		return ret;

	if (e->type->ops.mq.init_hctx) {
		ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
		if (ret) {
			blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
			return ret;
		}
	}

	return 0;
}

void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			    unsigned int hctx_idx)
{
	struct elevator_queue *e = q->elevator;

	if (!e)
		return;

	if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
		e->type->ops.mq.exit_hctx(hctx, hctx_idx);
		hctx->sched_data = NULL;
	}

	blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
}

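/*
 * Attach elevator type @e to @q: allocate scheduler tags for every hardware
 * queue, then call the elevator's init_sched and (optionally) init_hctx
 * hooks. On failure everything allocated here is torn down again.
 */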
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned int i;
	int ret;

	if (!e) {
		q->elevator = NULL;
		return 0;
	}

	/*
	 * Default to 256, since we don't split into sync/async like the
	 * old code did. Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * BLKDEV_MAX_RQ;

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_tags(q, hctx, i);
		if (ret)
			goto err;
	}

	ret = e->ops.mq.init_sched(q, e);
	if (ret)
		goto err;

	if (e->ops.mq.init_hctx) {
		queue_for_each_hw_ctx(q, hctx, i) {
			ret = e->ops.mq.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
	}

	return 0;

err:
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
	return ret;
}

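/*
 * Detach elevator @e from @q: run the per-hctx and queue-wide exit hooks and
 * free the scheduler tag maps.
 */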
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	if (e->type->ops.mq.exit_hctx) {
		queue_for_each_hw_ctx(q, hctx, i) {
			if (hctx->sched_data) {
				e->type->ops.mq.exit_hctx(hctx, i);
				hctx->sched_data = NULL;
			}
		}
	}
	if (e->type->ops.mq.exit_sched)
		e->type->ops.mq.exit_sched(e);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
}

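/*
 * Choose and initialize the default I/O scheduler for @q via elevator_init(),
 * holding q->sysfs_lock across the setup.
 */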
int blk_mq_sched_init(struct request_queue *q)
{
	int ret;

	mutex_lock(&q->sysfs_lock);
	ret = elevator_init(q, NULL);
	mutex_unlock(&q->sysfs_lock);

	return ret;
}