xref: /openbmc/linux/block/blk-mq-sched.c (revision 1f460b63)
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (exit && hctx->sched_data)
			exit(hctx);
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
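
/*
 * Illustrative sketch, not part of the original file: a scheduler that
 * kmalloc()s per-hctx state into hctx->sched_data from its ->init_hctx()
 * hook could hand a matching cleanup callback to the helper above at
 * teardown time. The "foo_*" names and the sort_list member are
 * hypothetical.
 */
static void foo_exit_hctx_data(struct blk_mq_hw_ctx *hctx)
{
	struct foo_hctx_data *fd = hctx->sched_data;

	/* only secondary allocations are released here; the kfree() of
	 * sched_data itself is done by blk_mq_sched_free_hctx_data() */
	kfree(fd->sort_list);
}

static void foo_free_hctx_data(struct request_queue *q)
{
	blk_mq_sched_free_hctx_data(q, foo_exit_hctx_data);
}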

void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct io_context *ioc = rq_ioc(bio);
	struct io_cq *icq;

	spin_lock_irq(q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}
	get_io_context(icq->ioc);
	rq->elv.icq = icq;
}

/*
 * Mark a hardware queue as needing a restart. For shared queues, maintain
 * a count of how many hardware queues are marked for restart.
 */
static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;

	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
		struct request_queue *q = hctx->queue;

		if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
			atomic_inc(&q->shared_hctx_restart);
	} else
		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;

	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

	if (blk_mq_hctx_has_pending(hctx)) {
		blk_mq_run_hw_queue(hctx, true);
		return;
	}
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart the queue if .get_budget() returns BLK_STS_RESOURCE.
 */
static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	LIST_HEAD(rq_list);

	do {
		struct request *rq;
		blk_status_t ret;

		if (e->type->ops.mq.has_work &&
				!e->type->ops.mq.has_work(hctx))
			break;

		ret = blk_mq_get_dispatch_budget(hctx);
		if (ret == BLK_STS_RESOURCE)
			break;

		rq = e->type->ops.mq.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		} else if (ret != BLK_STS_OK) {
			blk_mq_end_request(rq, ret);
			continue;
		}

		/*
		 * This rq now owns the budget, which has to be released
		 * if the rq is not queued to the driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);
	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
}

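/*
 * Illustrative sketch, not part of the original file: the budget calls in
 * the dispatch loops above map to the optional .get_budget/.put_budget
 * blk_mq_ops callbacks. A driver capping in-flight commands with a counter
 * might implement them roughly as below; "foo_queue" and its members are
 * hypothetical, and the blk_status_t return matches the contract used at
 * this revision (in-tree, only SCSI implements these hooks).
 */
static blk_status_t foo_get_budget(struct blk_mq_hw_ctx *hctx)
{
	struct foo_queue *fq = hctx->queue->queuedata;

	if (atomic_inc_return(&fq->inflight) > fq->max_inflight) {
		atomic_dec(&fq->inflight);
		return BLK_STS_RESOURCE;
	}
	return BLK_STS_OK;
}

static void foo_put_budget(struct blk_mq_hw_ctx *hctx)
{
	struct foo_queue *fq = hctx->queue->queuedata;

	atomic_dec(&fq->inflight);
}
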
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned idx = ctx->index_hw;

	if (++idx == hctx->nr_ctx)
		idx = 0;

	return hctx->ctxs[idx];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart the queue if .get_budget() returns BLK_STS_RESOURCE.
 */
static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);

	do {
		struct request *rq;
		blk_status_t ret;

		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		ret = blk_mq_get_dispatch_budget(hctx);
		if (ret == BLK_STS_RESOURCE)
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		} else if (ret != BLK_STS_OK) {
			blk_mq_end_request(rq, ret);
			continue;
		}

		/*
		 * This rq now owns the budget, which has to be released
		 * if the rq is not queued to the driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* round robin for fair dispatch */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));

	WRITE_ONCE(hctx->dispatch_from, ctx);
}

/*
 * Dispatch requests for this hardware queue: first any leftovers on the
 * dispatch list, then from the I/O scheduler or the software queues.
 */
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
	LIST_HEAD(rq_list);

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * If we have previous entries on our dispatch list, grab them first
	 * for fairer dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
			if (has_sched_dispatch)
				blk_mq_do_dispatch_sched(hctx);
			else
				blk_mq_do_dispatch_ctx(hctx);
		}
	} else if (has_sched_dispatch) {
		blk_mq_do_dispatch_sched(hctx);
	} else if (q->mq_ops->get_budget) {
		/*
		 * If we need to get a budget before queueing a request,
		 * dequeue requests one by one from the sw queue to avoid
		 * messing up I/O merging when the dispatch runs out of
		 * resource.
		 *
		 * TODO: get more budget and dequeue more requests at a time.
		 */
		blk_mq_do_dispatch_ctx(hctx);
	} else {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list, false);
	}
}

bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
			    struct request **merged_request)
{
	struct request *rq;

	switch (elv_merge(q, &rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_back_merge(q, rq, bio))
			return false;
		*merged_request = attempt_back_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
		return true;
	case ELEVATOR_FRONT_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_front_merge(q, rq, bio))
			return false;
		*merged_request = attempt_front_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
		return true;
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
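
/*
 * Illustrative sketch, not part of the original file: a scheduler's
 * ->bio_merge() hook can drive blk_mq_sched_try_merge() roughly like this,
 * serializing against its own structures with a per-scheduler lock. The
 * "foo_*" names are hypothetical; mq-deadline follows this pattern.
 */
static bool foo_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
	struct request_queue *q = hctx->queue;
	struct foo_data *fd = q->elevator->elevator_data;
	struct request *free = NULL;
	bool ret;

	spin_lock(&fd->lock);
	ret = blk_mq_sched_try_merge(q, bio, &free);
	spin_unlock(&fd->lock);

	/* a request fully merged away is handed back for freeing */
	if (free)
		blk_mq_free_request(free);

	return ret;
}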

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, so we don't
 * spend too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	lockdep_assert_held(&ctx->lock);

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		bool merged = false;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		switch (blk_try_merge(rq, bio)) {
		case ELEVATOR_BACK_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_back_merge(q, rq, bio);
			break;
		case ELEVATOR_FRONT_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_front_merge(q, rq, bio);
			break;
		case ELEVATOR_DISCARD_MERGE:
			merged = bio_attempt_discard_merge(q, rq, bio);
			break;
		default:
			continue;
		}

		if (merged)
			ctx->rq_merged++;
		return merged;
	}

	return false;
}

bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	bool ret = false;

	if (e && e->type->ops.mq.bio_merge) {
		blk_mq_put_ctx(ctx);
		return e->type->ops.mq.bio_merge(hctx, bio);
	}

	if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
		/* default per sw-queue merge */
		spin_lock(&ctx->lock);
		ret = blk_mq_attempt_merge(q, ctx, bio);
		spin_unlock(&ctx->lock);
	}

	blk_mq_put_ctx(ctx);
	return ret;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
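
/*
 * Illustrative sketch, not part of the original file: an ->insert_requests()
 * hook would typically try an insert-time merge first and only queue the
 * request internally (after calling the trace helper above) when that
 * fails. The "foo_*" names are hypothetical and at_head handling is omitted
 * for brevity; mq-deadline's insert path is structured this way.
 */
static void foo_insert_requests(struct blk_mq_hw_ctx *hctx,
				struct list_head *list, bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct foo_data *fd = q->elevator->elevator_data;

	spin_lock(&fd->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);

		/* fold into an already-queued request when possible */
		if (blk_mq_sched_try_insert_merge(q, rq))
			continue;

		blk_mq_sched_request_inserted(rq);
		list_add_tail(&rq->queuelist, &fd->fifo_list);
	}
	spin_unlock(&fd->lock);
}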

static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
				       struct request *rq)
{
	if (rq->tag == -1) {
		rq->rq_flags |= RQF_SORTED;
		return false;
	}

	/*
	 * If we already have a real request tag, send directly to
	 * the dispatch list.
	 */
	spin_lock(&hctx->lock);
	list_add(&rq->queuelist, &hctx->dispatch);
	spin_unlock(&hctx->lock);
	return true;
}

/*
 * Add flush/fua to the queue. If we fail to get a driver tag, punt it to
 * the requeue list. The requeue work will re-invoke us from a context
 * that's safe to block from.
 */
static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
				      struct request *rq, bool can_block)
{
	if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
		blk_insert_flush(rq);
		blk_mq_run_hw_queue(hctx, true);
	} else
		blk_mq_add_to_requeue_list(rq, false, true);
}

void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async, bool can_block)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

	if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
		blk_mq_sched_insert_flush(hctx, rq, can_block);
		return;
	}

	if (e && blk_mq_sched_bypass_insert(hctx, rq))
		goto run;

	if (e && e->type->ops.mq.insert_requests) {
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.mq.insert_requests(hctx, &list, at_head);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

run:
	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

void blk_mq_sched_insert_requests(struct request_queue *q,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	struct elevator_queue *e = hctx->queue->elevator;

	if (e) {
		struct request *rq, *next;

		/*
		 * We bypass requests that already have a driver tag assigned,
		 * which should only be flushes. Flushes are only ever inserted
		 * as single requests, so we shouldn't ever hit the
		 * WARN_ON_ONCE() below (but let's handle it just in case).
		 */
		list_for_each_entry_safe(rq, next, list, queuelist) {
			if (WARN_ON_ONCE(rq->tag != -1)) {
				list_del_init(&rq->queuelist);
				blk_mq_sched_bypass_insert(hctx, rq);
			}
		}
	}

	if (e && e->type->ops.mq.insert_requests)
		e->type->ops.mq.insert_requests(hctx, list, false);
	else
		blk_mq_insert_requests(hctx, ctx, list);

	blk_mq_run_hw_queue(hctx, run_queue_async);
}

static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	if (hctx->sched_tags) {
		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
		blk_mq_free_rq_map(hctx->sched_tags);
		hctx->sched_tags = NULL;
	}
}

static int blk_mq_sched_alloc_tags(struct request_queue *q,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	int ret;

	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
					       set->reserved_tags);
	if (!hctx->sched_tags)
		return -ENOMEM;

	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
	if (ret)
		blk_mq_sched_free_tags(set, hctx, hctx_idx);

	return ret;
}

static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
}

int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			   unsigned int hctx_idx)
{
	struct elevator_queue *e = q->elevator;
	int ret;

	if (!e)
		return 0;

	ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
	if (ret)
		return ret;

	if (e->type->ops.mq.init_hctx) {
		ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
		if (ret) {
			blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
			return ret;
		}
	}

	blk_mq_debugfs_register_sched_hctx(q, hctx);

	return 0;
}

void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			    unsigned int hctx_idx)
{
	struct elevator_queue *e = q->elevator;

	if (!e)
		return;

	blk_mq_debugfs_unregister_sched_hctx(hctx);

	if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
		e->type->ops.mq.exit_hctx(hctx, hctx_idx);
		hctx->sched_data = NULL;
	}

	blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
}

int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned int i;
	int ret;

	if (!e) {
		q->elevator = NULL;
		return 0;
	}

	/*
	 * Default to double the smaller of the hw queue depth and 128
	 * (BLKDEV_MAX_RQ), since we no longer split into sync/async like
	 * the old code did. Note that this is a per-hw-queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_MAX_RQ);

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_tags(q, hctx, i);
		if (ret)
			goto err;
	}

	ret = e->ops.mq.init_sched(q, e);
	if (ret)
		goto err;

	blk_mq_debugfs_register_sched(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.mq.init_hctx) {
			ret = e->ops.mq.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		blk_mq_debugfs_register_sched_hctx(q, hctx);
	}

	return 0;

err:
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
	return ret;
}

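/*
 * Illustrative sketch, not part of the original file: the hooks invoked by
 * blk_mq_init_sched()/blk_mq_exit_sched() are wired up through an
 * elevator_type with .uses_mq set and the mq ops filled in, registered via
 * elv_register(). All "foo_*" callbacks are hypothetical.
 */
static struct elevator_type foo_mq_sched = {
	.ops.mq = {
		.init_sched		= foo_init_sched,
		.exit_sched		= foo_exit_sched,
		.insert_requests	= foo_insert_requests,
		.dispatch_request	= foo_dispatch_request,
		.has_work		= foo_has_work,
		.bio_merge		= foo_bio_merge,
	},
	.uses_mq	= true,
	.elevator_name	= "foo-mq",
	.elevator_owner	= THIS_MODULE,
};

static int __init foo_sched_init(void)
{
	return elv_register(&foo_mq_sched);
}
module_init(foo_sched_init);
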
void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		blk_mq_debugfs_unregister_sched_hctx(hctx);
		if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
			e->type->ops.mq.exit_hctx(hctx, i);
			hctx->sched_data = NULL;
		}
	}
	blk_mq_debugfs_unregister_sched(q);
	if (e->type->ops.mq.exit_sched)
		e->type->ops.mq.exit_sched(e);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
}

int blk_mq_sched_init(struct request_queue *q)
{
	int ret;

	mutex_lock(&q->sysfs_lock);
	ret = elevator_init(q, NULL);
	mutex_unlock(&q->sysfs_lock);

	return ret;
}