// SPDX-License-Identifier: GPL-2.0
/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>
#include <linux/list_sort.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (exit && hctx->sched_data)
			exit(hctx);
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);
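
/*
 * Associate the request with the submitting task's io_context: look up the
 * icq for this queue (creating one if needed) and take a reference on the
 * io_context so the elevator can use it. Passthrough requests may have no
 * io_context, in which case this is a no-op.
 */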
void blk_mq_sched_assign_ioc(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct io_context *ioc;
	struct io_cq *icq;

	/*
	 * May not have an IO context if it's a passthrough request
	 */
	ioc = current->io_context;
	if (!ioc)
		return;

	spin_lock_irq(&q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(&q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}
	get_io_context(icq->ioc);
	rq->elv.icq = icq;
}

/*
 * Mark a hardware queue as needing a restart. For shared queues, maintain
 * a count of how many hardware queues are marked for restart.
 */
void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;

	set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_mark_restart_hctx);
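
/*
 * If the hardware queue was previously marked as needing a restart, clear
 * the flag and kick the queue again (asynchronously) so the scheduler gets
 * another chance to dispatch.
 */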
void blk_mq_sched_restart(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;
	clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

	blk_mq_run_hw_queue(hctx, true);
}

static int sched_rq_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return rqa->mq_hctx > rqb->mq_hctx;
}
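
/*
 * Detach the leading run of requests that map to the same hardware queue
 * from @rq_list and pass them to blk_mq_dispatch_rq_list() in one go; any
 * requests for other hardware queues are left on @rq_list for the caller.
 */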
static bool blk_mq_dispatch_hctx_list(struct list_head *rq_list)
{
	struct blk_mq_hw_ctx *hctx =
		list_first_entry(rq_list, struct request, queuelist)->mq_hctx;
	struct request *rq;
	LIST_HEAD(hctx_list);
	unsigned int count = 0;
	bool ret;

	list_for_each_entry(rq, rq_list, queuelist) {
		if (rq->mq_hctx != hctx) {
			list_cut_before(&hctx_list, rq_list, &rq->queuelist);
			goto dispatch;
		}
		count++;
	}
	list_splice_tail_init(rq_list, &hctx_list);

dispatch:
	ret = blk_mq_dispatch_rq_list(hctx, &hctx_list, count);
	return ret;
}

#define BLK_MQ_BUDGET_DELAY	3		/* ms units */

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart the queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has
 * to be run again. This is necessary to avoid starving flushes.
 */
static int __blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	bool multi_hctxs = false, run_queue = false;
	bool dispatched = false, busy = false;
	unsigned int max_dispatch;
	LIST_HEAD(rq_list);
	int count = 0;

	if (hctx->dispatch_busy)
		max_dispatch = 1;
	else
		max_dispatch = hctx->queue->nr_requests;

	do {
		struct request *rq;

		if (e->type->ops.has_work && !e->type->ops.has_work(hctx))
			break;

		if (!list_empty_careful(&hctx->dispatch)) {
			busy = true;
			break;
		}

		if (!blk_mq_get_dispatch_budget(q))
			break;

		rq = e->type->ops.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(q);
			/*
			 * We're releasing without dispatching. Holding the
			 * budget could have blocked any "hctx"s with the
			 * same queue and if we didn't dispatch then there's
			 * no guarantee anyone will kick the queue. Kick it
			 * ourselves.
			 */
			run_queue = true;
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add_tail(&rq->queuelist, &rq_list);
		if (rq->mq_hctx != hctx)
			multi_hctxs = true;
	} while (++count < max_dispatch);

	if (!count) {
		if (run_queue)
			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
	} else if (multi_hctxs) {
		/*
		 * Requests from different hctx may be dequeued from some
		 * schedulers, such as bfq and deadline.
		 *
		 * Sort the requests in the list according to their hctx,
		 * then dispatch the requests from the same hctx in batches.
		 */
		list_sort(NULL, &rq_list, sched_rq_cmp);
		do {
			dispatched |= blk_mq_dispatch_hctx_list(&rq_list);
		} while (!list_empty(&rq_list));
	} else {
		dispatched = blk_mq_dispatch_rq_list(hctx, &rq_list, count);
	}

	if (busy)
		return -EAGAIN;
	return !!dispatched;
}
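
/*
 * Keep pulling batches of requests from the scheduler until no further
 * progress is made. Returns 0 once the scheduler has nothing more to
 * dispatch, or -EAGAIN if hctx->dispatch was found non-empty and the
 * hardware queue has to be run again.
 */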
static int blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	int ret;

	do {
		ret = __blk_mq_do_dispatch_sched(hctx);
	} while (ret == 1);

	return ret;
}
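
/*
 * Return the software queue that follows @ctx on this hardware queue,
 * wrapping around to the first one; used for round-robin dispatch below.
 */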
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned short idx = ctx->index_hw[hctx->type];

	if (++idx == hctx->nr_ctx)
		idx = 0;

	return hctx->ctxs[idx];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart the queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 *
 * Returns -EAGAIN if hctx->dispatch was found non-empty and run_work has
 * to be run again. This is necessary to avoid starving flushes.
 */
static int blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);
	int ret = 0;
	struct request *rq;

	do {
		if (!list_empty_careful(&hctx->dispatch)) {
			ret = -EAGAIN;
			break;
		}

		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		if (!blk_mq_get_dispatch_budget(q))
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(q);
			/*
			 * We're releasing without dispatching. Holding the
			 * budget could have blocked any "hctx"s with the
			 * same queue and if we didn't dispatch then there's
			 * no guarantee anyone will kick the queue. Kick it
			 * ourselves.
			 */
			blk_mq_delay_run_hw_queues(q, BLK_MQ_BUDGET_DELAY);
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* round robin for fair dispatch */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

	} while (blk_mq_dispatch_rq_list(rq->mq_hctx, &rq_list, 1));

	WRITE_ONCE(hctx->dispatch_from, ctx);
	return ret;
}
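
/*
 * Dispatch requests for a hardware queue: drain anything left on
 * hctx->dispatch first, then pull new work from the I/O scheduler or the
 * software queues. Returns -EAGAIN if hctx->dispatch was found non-empty,
 * in which case the caller must run the queue again.
 */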
static int __blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.dispatch_request;
	int ret = 0;
	LIST_HEAD(rq_list);

	/*
	 * If we have previous entries on our dispatch list, grab them first
	 * for more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		if (blk_mq_dispatch_rq_list(hctx, &rq_list, 0)) {
			if (has_sched_dispatch)
				ret = blk_mq_do_dispatch_sched(hctx);
			else
				ret = blk_mq_do_dispatch_ctx(hctx);
		}
	} else if (has_sched_dispatch) {
		ret = blk_mq_do_dispatch_sched(hctx);
	} else if (hctx->dispatch_busy) {
		/* dequeue request one by one from sw queue if queue is busy */
		ret = blk_mq_do_dispatch_ctx(hctx);
	} else {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(hctx, &rq_list, 0);
	}

	return ret;
}

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * A return of -EAGAIN is an indication that hctx->dispatch is not
	 * empty and we must run again in order to avoid starving flushes.
	 */
	if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN) {
		if (__blk_mq_sched_dispatch_requests(hctx) == -EAGAIN)
			blk_mq_run_hw_queue(hctx, true);
	}
}
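
/*
 * Ask the elevator whether @bio can be merged into an existing request.
 * On a successful back or front merge this also tries to merge the two
 * adjacent requests; if that succeeds, the request that was merged away
 * is returned in *merged_request so the caller can free it.
 */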
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs, struct request **merged_request)
{
	struct request *rq;

	switch (elv_merge(q, &rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_back_merge(rq, bio, nr_segs))
			return false;
		*merged_request = attempt_back_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
		return true;
	case ELEVATOR_FRONT_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_front_merge(rq, bio, nr_segs))
			return false;
		*merged_request = attempt_front_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
		return true;
	case ELEVATOR_DISCARD_MERGE:
		return bio_attempt_discard_merge(q, rq, bio);
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

/*
 * Iterate list of requests and see if we can merge this bio with any
 * of them.
 */
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
			   struct bio *bio, unsigned int nr_segs)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, list, queuelist) {
		bool merged = false;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		switch (blk_try_merge(rq, bio)) {
		case ELEVATOR_BACK_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_back_merge(rq, bio,
						nr_segs);
			break;
		case ELEVATOR_FRONT_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_front_merge(rq, bio,
						nr_segs);
			break;
		case ELEVATOR_DISCARD_MERGE:
			merged = bio_attempt_discard_merge(q, rq, bio);
			break;
		default:
			continue;
		}

		return merged;
	}

	return false;
}
EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_hw_ctx *hctx,
				 struct blk_mq_ctx *ctx, struct bio *bio,
				 unsigned int nr_segs)
{
	enum hctx_type type = hctx->type;

	lockdep_assert_held(&ctx->lock);

	if (blk_mq_bio_list_merge(q, &ctx->rq_lists[type], bio, nr_segs)) {
		ctx->rq_merged++;
		return true;
	}

	return false;
}
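
/*
 * Try to merge @bio at submission time: use the scheduler's ->bio_merge()
 * if one is provided, otherwise fall back to scanning the software queue
 * when BLK_MQ_F_SHOULD_MERGE is set on the hardware queue.
 */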
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, bio->bi_opf, ctx);
	bool ret = false;
	enum hctx_type type;

	if (e && e->type->ops.bio_merge)
		return e->type->ops.bio_merge(hctx, bio, nr_segs);

	type = hctx->type;
	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
			!list_empty_careful(&ctx->rq_lists[type])) {
		/* default per sw-queue merge */
		spin_lock(&ctx->lock);
		ret = blk_mq_attempt_merge(q, hctx, ctx, bio, nr_segs);
		spin_unlock(&ctx->lock);
	}

	return ret;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
				       bool has_sched,
				       struct request *rq)
{
	/*
	 * Dispatch flush and passthrough requests directly.
	 *
	 * A passthrough request has to be added to hctx->dispatch directly.
	 * The device may be in a state where it can't handle FS requests,
	 * so BLK_STS_RESOURCE is always returned and the FS request ends up
	 * on hctx->dispatch; a passthrough request may be required at that
	 * point to get the device out of that state. If the passthrough
	 * request were added to the scheduler queue, there would be no
	 * chance to dispatch it, since requests on hctx->dispatch are
	 * prioritized.
	 */
	if ((rq->rq_flags & RQF_FLUSH_SEQ) || blk_rq_is_passthrough(rq))
		return true;

	if (has_sched)
		rq->rq_flags |= RQF_SORTED;

	return false;
}
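
/*
 * Insert a single request. Requests that must go through the flush
 * machinery are handed to blk_insert_flush(); flush-sequence and
 * passthrough requests bypass the scheduler and go to hctx->dispatch
 * directly; everything else is queued to the scheduler (or to the
 * software queue when no scheduler is attached). The hardware queue is
 * then run if @run_queue is set.
 */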
true : at_head; 55201e99aecSMing Lei blk_mq_request_bypass_insert(rq, at_head, false); 5530cacba6cSOmar Sandoval goto run; 55401e99aecSMing Lei } 5550cacba6cSOmar Sandoval 556f9cd4bfeSJens Axboe if (e && e->type->ops.insert_requests) { 557bd6737f1SJens Axboe LIST_HEAD(list); 558bd6737f1SJens Axboe 559bd6737f1SJens Axboe list_add(&rq->queuelist, &list); 560f9cd4bfeSJens Axboe e->type->ops.insert_requests(hctx, &list, at_head); 561bd6737f1SJens Axboe } else { 562bd6737f1SJens Axboe spin_lock(&ctx->lock); 563bd6737f1SJens Axboe __blk_mq_insert_request(hctx, rq, at_head); 564bd6737f1SJens Axboe spin_unlock(&ctx->lock); 565bd6737f1SJens Axboe } 566bd6737f1SJens Axboe 5670cacba6cSOmar Sandoval run: 568bd6737f1SJens Axboe if (run_queue) 569bd6737f1SJens Axboe blk_mq_run_hw_queue(hctx, async); 570bd6737f1SJens Axboe } 571bd6737f1SJens Axboe 57267cae4c9SJens Axboe void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx, 573bd6737f1SJens Axboe struct blk_mq_ctx *ctx, 574bd6737f1SJens Axboe struct list_head *list, bool run_queue_async) 575bd6737f1SJens Axboe { 576f9afca4dSJens Axboe struct elevator_queue *e; 577e87eb301SMing Lei struct request_queue *q = hctx->queue; 578e87eb301SMing Lei 579e87eb301SMing Lei /* 580e87eb301SMing Lei * blk_mq_sched_insert_requests() is called from flush plug 581e87eb301SMing Lei * context only, and hold one usage counter to prevent queue 582e87eb301SMing Lei * from being released. 583e87eb301SMing Lei */ 584e87eb301SMing Lei percpu_ref_get(&q->q_usage_counter); 585f9afca4dSJens Axboe 586f9afca4dSJens Axboe e = hctx->queue->elevator; 587f9cd4bfeSJens Axboe if (e && e->type->ops.insert_requests) 588f9cd4bfeSJens Axboe e->type->ops.insert_requests(hctx, list, false); 5896ce3dd6eSMing Lei else { 5906ce3dd6eSMing Lei /* 5916ce3dd6eSMing Lei * try to issue requests directly if the hw queue isn't 5926ce3dd6eSMing Lei * busy in case of 'none' scheduler, and this way may save 5936ce3dd6eSMing Lei * us one extra enqueue & dequeue to sw queue. 
void blk_mq_sched_insert_requests(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct elevator_queue *e;
	struct request_queue *q = hctx->queue;

	/*
	 * blk_mq_sched_insert_requests() is called from flush plug
	 * context only, and holds one usage counter to prevent the queue
	 * from being released.
	 */
	percpu_ref_get(&q->q_usage_counter);

	e = hctx->queue->elevator;
	if (e && e->type->ops.insert_requests)
		e->type->ops.insert_requests(hctx, list, false);
	else {
		/*
		 * try to issue requests directly if the hw queue isn't
		 * busy in case of 'none' scheduler, and this way may save
		 * us one extra enqueue & dequeue to sw queue.
		 */
		if (!hctx->dispatch_busy && !e && !run_queue_async) {
			blk_mq_try_issue_list_directly(hctx, list);
			if (list_empty(list))
				goto out;
		}
		blk_mq_insert_requests(hctx, ctx, list);
	}

	blk_mq_run_hw_queue(hctx, run_queue_async);
out:
	percpu_ref_put(&q->q_usage_counter);
}

static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	if (hctx->sched_tags) {
		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
		blk_mq_free_rq_map(hctx->sched_tags);
		hctx->sched_tags = NULL;
	}
}

static int blk_mq_sched_alloc_tags(struct request_queue *q,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	int ret;

	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
					       set->reserved_tags);
	if (!hctx->sched_tags)
		return -ENOMEM;

	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
	if (ret)
		blk_mq_sched_free_tags(set, hctx, hctx_idx);

	return ret;
}

/* called in queue's release handler, tagset has gone away */
static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->sched_tags) {
			blk_mq_free_rq_map(hctx->sched_tags);
			hctx->sched_tags = NULL;
		}
	}
}
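
/*
 * Attach an I/O scheduler to a request queue: size q->nr_requests, allocate
 * per-hctx scheduler tags, then call the elevator's init_sched() and
 * per-hctx init_hctx() hooks, registering the debugfs attributes along the
 * way. On failure, everything allocated here is torn down again. When no
 * scheduler is requested (@e is NULL), the queue runs without an elevator.
 */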
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned int i;
	int ret;

	if (!e) {
		q->elevator = NULL;
		q->nr_requests = q->tag_set->queue_depth;
		return 0;
	}

	/*
	 * Default to twice the smaller of the hardware queue depth and 128,
	 * since we don't split into sync/async like the old code did.
	 * Additionally, this is a per-hw-queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_MAX_RQ);

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_tags(q, hctx, i);
		if (ret)
			goto err;
	}

	ret = e->ops.init_sched(q, e);
	if (ret)
		goto err;

	blk_mq_debugfs_register_sched(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.init_hctx) {
			ret = e->ops.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_sched_free_requests(q);
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		blk_mq_debugfs_register_sched_hctx(q, hctx);
	}

	return 0;

err:
	blk_mq_sched_free_requests(q);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
	return ret;
}

/*
 * called in either blk_cleanup_queue or elevator_switch, tagset
 * is required for freeing requests
 */
void blk_mq_sched_free_requests(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (hctx->sched_tags)
			blk_mq_free_rqs(q->tag_set, hctx->sched_tags, i);
	}
}

void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		blk_mq_debugfs_unregister_sched_hctx(hctx);
		if (e->type->ops.exit_hctx && hctx->sched_data) {
			e->type->ops.exit_hctx(hctx, i);
			hctx->sched_data = NULL;
		}
	}
	blk_mq_debugfs_unregister_sched(q);
	if (e->type->ops.exit_sched)
		e->type->ops.exit_sched(e);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
}