13dcf60bcSChristoph Hellwig // SPDX-License-Identifier: GPL-2.0 275bb4625SJens Axboe /* 375bb4625SJens Axboe * Block multiqueue core code 475bb4625SJens Axboe * 575bb4625SJens Axboe * Copyright (C) 2013-2014 Jens Axboe 675bb4625SJens Axboe * Copyright (C) 2013-2014 Christoph Hellwig 775bb4625SJens Axboe */ 8320ae51fSJens Axboe #include <linux/kernel.h> 9320ae51fSJens Axboe #include <linux/module.h> 10320ae51fSJens Axboe #include <linux/backing-dev.h> 11320ae51fSJens Axboe #include <linux/bio.h> 12320ae51fSJens Axboe #include <linux/blkdev.h> 13fe45e630SChristoph Hellwig #include <linux/blk-integrity.h> 14f75782e4SCatalin Marinas #include <linux/kmemleak.h> 15320ae51fSJens Axboe #include <linux/mm.h> 16320ae51fSJens Axboe #include <linux/init.h> 17320ae51fSJens Axboe #include <linux/slab.h> 18320ae51fSJens Axboe #include <linux/workqueue.h> 19320ae51fSJens Axboe #include <linux/smp.h> 20e41d12f5SChristoph Hellwig #include <linux/interrupt.h> 21320ae51fSJens Axboe #include <linux/llist.h> 22320ae51fSJens Axboe #include <linux/list_sort.h> 23320ae51fSJens Axboe #include <linux/cpu.h> 24320ae51fSJens Axboe #include <linux/cache.h> 25320ae51fSJens Axboe #include <linux/sched/sysctl.h> 26105ab3d8SIngo Molnar #include <linux/sched/topology.h> 27174cd4b1SIngo Molnar #include <linux/sched/signal.h> 28320ae51fSJens Axboe #include <linux/delay.h> 29aedcd72fSJens Axboe #include <linux/crash_dump.h> 3088c7b2b7SJens Axboe #include <linux/prefetch.h> 31a892c8d5SSatya Tangirala #include <linux/blk-crypto.h> 32320ae51fSJens Axboe 33320ae51fSJens Axboe #include <trace/events/block.h> 34320ae51fSJens Axboe 35320ae51fSJens Axboe #include <linux/blk-mq.h> 3654d4e6abSMax Gurtovoy #include <linux/t10-pi.h> 37320ae51fSJens Axboe #include "blk.h" 38320ae51fSJens Axboe #include "blk-mq.h" 399c1051aaSOmar Sandoval #include "blk-mq-debugfs.h" 40320ae51fSJens Axboe #include "blk-mq-tag.h" 41986d413bSBart Van Assche #include "blk-pm.h" 42cf43e6beSJens Axboe #include "blk-stat.h" 43bd166ef1SJens Axboe #include "blk-mq-sched.h" 44c1c80384SJosef Bacik #include "blk-rq-qos.h" 45320ae51fSJens Axboe 46f9ab4918SSebastian Andrzej Siewior static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); 47c3077b5dSChristoph Hellwig 4834dbad5dSOmar Sandoval static void blk_mq_poll_stats_start(struct request_queue *q); 4934dbad5dSOmar Sandoval static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); 5034dbad5dSOmar Sandoval 51720b8cccSStephen Bates static int blk_mq_poll_stats_bkt(const struct request *rq) 52720b8cccSStephen Bates { 533d244306SHou Tao int ddir, sectors, bucket; 54720b8cccSStephen Bates 5599c749a4SJens Axboe ddir = rq_data_dir(rq); 563d244306SHou Tao sectors = blk_rq_stats_sectors(rq); 57720b8cccSStephen Bates 583d244306SHou Tao bucket = ddir + 2 * ilog2(sectors); 59720b8cccSStephen Bates 60720b8cccSStephen Bates if (bucket < 0) 61720b8cccSStephen Bates return -1; 62720b8cccSStephen Bates else if (bucket >= BLK_MQ_POLL_STATS_BKTS) 63720b8cccSStephen Bates return ddir + BLK_MQ_POLL_STATS_BKTS - 2; 64720b8cccSStephen Bates 65720b8cccSStephen Bates return bucket; 66720b8cccSStephen Bates } 67720b8cccSStephen Bates 683e08773cSChristoph Hellwig #define BLK_QC_T_SHIFT 16 693e08773cSChristoph Hellwig #define BLK_QC_T_INTERNAL (1U << 31) 703e08773cSChristoph Hellwig 71f70299f0SChristoph Hellwig static inline struct blk_mq_hw_ctx *blk_qc_to_hctx(struct request_queue *q, 72f70299f0SChristoph Hellwig blk_qc_t qc) 73f70299f0SChristoph Hellwig { 74f70299f0SChristoph Hellwig return q->queue_hw_ctx[(qc & ~BLK_QC_T_INTERNAL) >> 
BLK_QC_T_SHIFT]; 75f70299f0SChristoph Hellwig } 76f70299f0SChristoph Hellwig 77c6699d6fSChristoph Hellwig static inline struct request *blk_qc_to_rq(struct blk_mq_hw_ctx *hctx, 78c6699d6fSChristoph Hellwig blk_qc_t qc) 79c6699d6fSChristoph Hellwig { 80efbabbe1SChristoph Hellwig unsigned int tag = qc & ((1U << BLK_QC_T_SHIFT) - 1); 81efbabbe1SChristoph Hellwig 82efbabbe1SChristoph Hellwig if (qc & BLK_QC_T_INTERNAL) 83efbabbe1SChristoph Hellwig return blk_mq_tag_to_rq(hctx->sched_tags, tag); 84efbabbe1SChristoph Hellwig return blk_mq_tag_to_rq(hctx->tags, tag); 85c6699d6fSChristoph Hellwig } 86c6699d6fSChristoph Hellwig 873e08773cSChristoph Hellwig static inline blk_qc_t blk_rq_to_qc(struct request *rq) 883e08773cSChristoph Hellwig { 893e08773cSChristoph Hellwig return (rq->mq_hctx->queue_num << BLK_QC_T_SHIFT) | 903e08773cSChristoph Hellwig (rq->tag != -1 ? 913e08773cSChristoph Hellwig rq->tag : (rq->internal_tag | BLK_QC_T_INTERNAL)); 923e08773cSChristoph Hellwig } 933e08773cSChristoph Hellwig 94320ae51fSJens Axboe /* 9585fae294SYufen Yu * Check if any of the ctx, dispatch list or elevator 9685fae294SYufen Yu * have pending work in this hardware queue. 97320ae51fSJens Axboe */ 9879f720a7SJens Axboe static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 99320ae51fSJens Axboe { 10079f720a7SJens Axboe return !list_empty_careful(&hctx->dispatch) || 10179f720a7SJens Axboe sbitmap_any_bit_set(&hctx->ctx_map) || 102bd166ef1SJens Axboe blk_mq_sched_has_work(hctx); 103320ae51fSJens Axboe } 104320ae51fSJens Axboe 105320ae51fSJens Axboe /* 106320ae51fSJens Axboe * Mark this ctx as having pending work in this hardware queue 107320ae51fSJens Axboe */ 108320ae51fSJens Axboe static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 109320ae51fSJens Axboe struct blk_mq_ctx *ctx) 110320ae51fSJens Axboe { 111f31967f0SJens Axboe const int bit = ctx->index_hw[hctx->type]; 112f31967f0SJens Axboe 113f31967f0SJens Axboe if (!sbitmap_test_bit(&hctx->ctx_map, bit)) 114f31967f0SJens Axboe sbitmap_set_bit(&hctx->ctx_map, bit); 1151429d7c9SJens Axboe } 1161429d7c9SJens Axboe 1171429d7c9SJens Axboe static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 1181429d7c9SJens Axboe struct blk_mq_ctx *ctx) 1191429d7c9SJens Axboe { 120f31967f0SJens Axboe const int bit = ctx->index_hw[hctx->type]; 121f31967f0SJens Axboe 122f31967f0SJens Axboe sbitmap_clear_bit(&hctx->ctx_map, bit); 123320ae51fSJens Axboe } 124320ae51fSJens Axboe 125f299b7c7SJens Axboe struct mq_inflight { 1268446fe92SChristoph Hellwig struct block_device *part; 127a2e80f6fSPavel Begunkov unsigned int inflight[2]; 128f299b7c7SJens Axboe }; 129f299b7c7SJens Axboe 1307baa8572SJens Axboe static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, 131f299b7c7SJens Axboe struct request *rq, void *priv, 132f299b7c7SJens Axboe bool reserved) 133f299b7c7SJens Axboe { 134f299b7c7SJens Axboe struct mq_inflight *mi = priv; 135f299b7c7SJens Axboe 136b0d97557SJeffle Xu if ((!mi->part->bd_partno || rq->part == mi->part) && 137b0d97557SJeffle Xu blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) 138bb4e6b14SPavel Begunkov mi->inflight[rq_data_dir(rq)]++; 1397baa8572SJens Axboe 1407baa8572SJens Axboe return true; 141f299b7c7SJens Axboe } 142f299b7c7SJens Axboe 1438446fe92SChristoph Hellwig unsigned int blk_mq_in_flight(struct request_queue *q, 1448446fe92SChristoph Hellwig struct block_device *part) 145f299b7c7SJens Axboe { 146a2e80f6fSPavel Begunkov struct mq_inflight mi = { .part = part }; 147f299b7c7SJens Axboe 148f299b7c7SJens Axboe 
blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); 149e016b782SMikulas Patocka 150a2e80f6fSPavel Begunkov return mi.inflight[0] + mi.inflight[1]; 151bf0ddabaSOmar Sandoval } 152bf0ddabaSOmar Sandoval 1538446fe92SChristoph Hellwig void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, 154bf0ddabaSOmar Sandoval unsigned int inflight[2]) 155bf0ddabaSOmar Sandoval { 156a2e80f6fSPavel Begunkov struct mq_inflight mi = { .part = part }; 157bf0ddabaSOmar Sandoval 158bb4e6b14SPavel Begunkov blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); 159a2e80f6fSPavel Begunkov inflight[0] = mi.inflight[0]; 160a2e80f6fSPavel Begunkov inflight[1] = mi.inflight[1]; 161bf0ddabaSOmar Sandoval } 162bf0ddabaSOmar Sandoval 1631671d522SMing Lei void blk_freeze_queue_start(struct request_queue *q) 16443a5e4e2SMing Lei { 1657996a8b5SBob Liu mutex_lock(&q->mq_freeze_lock); 1667996a8b5SBob Liu if (++q->mq_freeze_depth == 1) { 1673ef28e83SDan Williams percpu_ref_kill(&q->q_usage_counter); 1687996a8b5SBob Liu mutex_unlock(&q->mq_freeze_lock); 169344e9ffcSJens Axboe if (queue_is_mq(q)) 170b94ec296SMike Snitzer blk_mq_run_hw_queues(q, false); 1717996a8b5SBob Liu } else { 1727996a8b5SBob Liu mutex_unlock(&q->mq_freeze_lock); 173cddd5d17STejun Heo } 174f3af020bSTejun Heo } 1751671d522SMing Lei EXPORT_SYMBOL_GPL(blk_freeze_queue_start); 176f3af020bSTejun Heo 1776bae363eSKeith Busch void blk_mq_freeze_queue_wait(struct request_queue *q) 178f3af020bSTejun Heo { 1793ef28e83SDan Williams wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); 18043a5e4e2SMing Lei } 1816bae363eSKeith Busch EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); 18243a5e4e2SMing Lei 183f91328c4SKeith Busch int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 184f91328c4SKeith Busch unsigned long timeout) 185f91328c4SKeith Busch { 186f91328c4SKeith Busch return wait_event_timeout(q->mq_freeze_wq, 187f91328c4SKeith Busch percpu_ref_is_zero(&q->q_usage_counter), 188f91328c4SKeith Busch timeout); 189f91328c4SKeith Busch } 190f91328c4SKeith Busch EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); 191320ae51fSJens Axboe 192f3af020bSTejun Heo /* 193f3af020bSTejun Heo * Guarantee no request is in use, so we can change any data structure of 194f3af020bSTejun Heo * the queue afterward. 195f3af020bSTejun Heo */ 1963ef28e83SDan Williams void blk_freeze_queue(struct request_queue *q) 197f3af020bSTejun Heo { 1983ef28e83SDan Williams /* 1993ef28e83SDan Williams * In the !blk_mq case we are only calling this to kill the 2003ef28e83SDan Williams * q_usage_counter, otherwise this increases the freeze depth 2013ef28e83SDan Williams * and waits for it to return to zero. For this reason there is 2023ef28e83SDan Williams * no blk_unfreeze_queue(), and blk_freeze_queue() is not 2033ef28e83SDan Williams * exported to drivers as the only user for unfreeze is blk_mq. 
2043ef28e83SDan Williams 	 */
2051671d522SMing Lei 	blk_freeze_queue_start(q);
206f3af020bSTejun Heo 	blk_mq_freeze_queue_wait(q);
207f3af020bSTejun Heo }
2083ef28e83SDan Williams 
2093ef28e83SDan Williams void blk_mq_freeze_queue(struct request_queue *q)
2103ef28e83SDan Williams {
2113ef28e83SDan Williams 	/*
2123ef28e83SDan Williams 	 * ...just an alias to keep freeze and unfreeze actions balanced
2133ef28e83SDan Williams 	 * in the blk_mq_* namespace
2143ef28e83SDan Williams 	 */
2153ef28e83SDan Williams 	blk_freeze_queue(q);
2163ef28e83SDan Williams }
217c761d96bSJens Axboe EXPORT_SYMBOL_GPL(blk_mq_freeze_queue);
218f3af020bSTejun Heo 
219aec89dc5SChristoph Hellwig void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic)
220320ae51fSJens Axboe {
2217996a8b5SBob Liu 	mutex_lock(&q->mq_freeze_lock);
222aec89dc5SChristoph Hellwig 	if (force_atomic)
223aec89dc5SChristoph Hellwig 		q->q_usage_counter.data->force_atomic = true;
2247996a8b5SBob Liu 	q->mq_freeze_depth--;
2257996a8b5SBob Liu 	WARN_ON_ONCE(q->mq_freeze_depth < 0);
2267996a8b5SBob Liu 	if (!q->mq_freeze_depth) {
227bdd63160SBart Van Assche 		percpu_ref_resurrect(&q->q_usage_counter);
228320ae51fSJens Axboe 		wake_up_all(&q->mq_freeze_wq);
229320ae51fSJens Axboe 	}
2307996a8b5SBob Liu 	mutex_unlock(&q->mq_freeze_lock);
231add703fdSTejun Heo }
232aec89dc5SChristoph Hellwig 
233aec89dc5SChristoph Hellwig void blk_mq_unfreeze_queue(struct request_queue *q)
234aec89dc5SChristoph Hellwig {
235aec89dc5SChristoph Hellwig 	__blk_mq_unfreeze_queue(q, false);
236aec89dc5SChristoph Hellwig }
237b4c6a028SKeith Busch EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue);
238320ae51fSJens Axboe 
239852ec809SBart Van Assche /*
240852ec809SBart Van Assche  * FIXME: replace the scsi_internal_device_*block_nowait() calls in the
241852ec809SBart Van Assche  * mpt3sas driver such that this function can be removed.
242852ec809SBart Van Assche  */
243852ec809SBart Van Assche void blk_mq_quiesce_queue_nowait(struct request_queue *q)
244852ec809SBart Van Assche {
2458814ce8aSBart Van Assche 	blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q);
246852ec809SBart Van Assche }
247852ec809SBart Van Assche EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait);
248852ec809SBart Van Assche 
2496a83e74dSBart Van Assche /**
25069e07c4aSMing Lei  * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished
2516a83e74dSBart Van Assche  * @q: request queue.
2526a83e74dSBart Van Assche  *
2536a83e74dSBart Van Assche  * Note: this function does not prevent the struct request end_io()
25469e07c4aSMing Lei  * callback from being invoked. Once this function returns, we make
25569e07c4aSMing Lei  * sure no dispatch can happen until the queue is unquiesced via
25669e07c4aSMing Lei  * blk_mq_unquiesce_queue().
2576a83e74dSBart Van Assche */ 2586a83e74dSBart Van Assche void blk_mq_quiesce_queue(struct request_queue *q) 2596a83e74dSBart Van Assche { 2606a83e74dSBart Van Assche struct blk_mq_hw_ctx *hctx; 2616a83e74dSBart Van Assche unsigned int i; 2626a83e74dSBart Van Assche bool rcu = false; 2636a83e74dSBart Van Assche 2641d9e9bc6SMing Lei blk_mq_quiesce_queue_nowait(q); 265f4560ffeSMing Lei 2666a83e74dSBart Van Assche queue_for_each_hw_ctx(q, hctx, i) { 2676a83e74dSBart Van Assche if (hctx->flags & BLK_MQ_F_BLOCKING) 26805707b64STejun Heo synchronize_srcu(hctx->srcu); 2696a83e74dSBart Van Assche else 2706a83e74dSBart Van Assche rcu = true; 2716a83e74dSBart Van Assche } 2726a83e74dSBart Van Assche if (rcu) 2736a83e74dSBart Van Assche synchronize_rcu(); 2746a83e74dSBart Van Assche } 2756a83e74dSBart Van Assche EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); 2766a83e74dSBart Van Assche 277e4e73913SMing Lei /* 278e4e73913SMing Lei * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() 279e4e73913SMing Lei * @q: request queue. 280e4e73913SMing Lei * 281e4e73913SMing Lei * This function recovers queue into the state before quiescing 282e4e73913SMing Lei * which is done by blk_mq_quiesce_queue. 283e4e73913SMing Lei */ 284e4e73913SMing Lei void blk_mq_unquiesce_queue(struct request_queue *q) 285e4e73913SMing Lei { 2868814ce8aSBart Van Assche blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); 287f4560ffeSMing Lei 2881d9e9bc6SMing Lei /* dispatch requests which are inserted during quiescing */ 2891d9e9bc6SMing Lei blk_mq_run_hw_queues(q, true); 290e4e73913SMing Lei } 291e4e73913SMing Lei EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); 292e4e73913SMing Lei 293aed3ea94SJens Axboe void blk_mq_wake_waiters(struct request_queue *q) 294aed3ea94SJens Axboe { 295aed3ea94SJens Axboe struct blk_mq_hw_ctx *hctx; 296aed3ea94SJens Axboe unsigned int i; 297aed3ea94SJens Axboe 298aed3ea94SJens Axboe queue_for_each_hw_ctx(q, hctx, i) 299aed3ea94SJens Axboe if (blk_mq_hw_queue_mapped(hctx)) 300aed3ea94SJens Axboe blk_mq_tag_wakeup_all(hctx->tags, true); 301aed3ea94SJens Axboe } 302aed3ea94SJens Axboe 303e4cdf1a1SChristoph Hellwig static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, 3047ea4d8a4SChristoph Hellwig unsigned int tag, u64 alloc_time_ns) 305320ae51fSJens Axboe { 306605f784eSPavel Begunkov struct blk_mq_ctx *ctx = data->ctx; 307605f784eSPavel Begunkov struct blk_mq_hw_ctx *hctx = data->hctx; 308605f784eSPavel Begunkov struct request_queue *q = data->q; 309605f784eSPavel Begunkov struct elevator_queue *e = q->elevator; 310e4cdf1a1SChristoph Hellwig struct blk_mq_tags *tags = blk_mq_tags_from_data(data); 311e4cdf1a1SChristoph Hellwig struct request *rq = tags->static_rqs[tag]; 31212845906SPavel Begunkov unsigned int rq_flags = 0; 313c3a148d2SBart Van Assche 314605f784eSPavel Begunkov if (e) { 31512845906SPavel Begunkov rq_flags = RQF_ELV; 31676647368SChristoph Hellwig rq->tag = BLK_MQ_NO_TAG; 317e4cdf1a1SChristoph Hellwig rq->internal_tag = tag; 318e4cdf1a1SChristoph Hellwig } else { 319e4cdf1a1SChristoph Hellwig rq->tag = tag; 32076647368SChristoph Hellwig rq->internal_tag = BLK_MQ_NO_TAG; 321e4cdf1a1SChristoph Hellwig } 322e4cdf1a1SChristoph Hellwig 32312845906SPavel Begunkov if (data->flags & BLK_MQ_REQ_PM) 32412845906SPavel Begunkov rq_flags |= RQF_PM; 32512845906SPavel Begunkov if (blk_queue_io_stat(q)) 32612845906SPavel Begunkov rq_flags |= RQF_IO_STAT; 32712845906SPavel Begunkov rq->rq_flags = rq_flags; 32812845906SPavel Begunkov 3294f266f2bSPavel Begunkov if (blk_mq_need_time_stamp(rq)) 
3304f266f2bSPavel Begunkov rq->start_time_ns = ktime_get_ns(); 3314f266f2bSPavel Begunkov else 3324f266f2bSPavel Begunkov rq->start_time_ns = 0; 333af76e555SChristoph Hellwig /* csd/requeue_work/fifo_time is initialized before use */ 334605f784eSPavel Begunkov rq->q = q; 335605f784eSPavel Begunkov rq->mq_ctx = ctx; 336605f784eSPavel Begunkov rq->mq_hctx = hctx; 3377ea4d8a4SChristoph Hellwig rq->cmd_flags = data->cmd_flags; 338af76e555SChristoph Hellwig rq->rq_disk = NULL; 339af76e555SChristoph Hellwig rq->part = NULL; 3406f816b4bSTejun Heo #ifdef CONFIG_BLK_RQ_ALLOC_TIME 3416f816b4bSTejun Heo rq->alloc_time_ns = alloc_time_ns; 3426f816b4bSTejun Heo #endif 343544ccc8dSOmar Sandoval rq->io_start_time_ns = 0; 3443d244306SHou Tao rq->stats_sectors = 0; 345af76e555SChristoph Hellwig rq->nr_phys_segments = 0; 346af76e555SChristoph Hellwig #if defined(CONFIG_BLK_DEV_INTEGRITY) 347af76e555SChristoph Hellwig rq->nr_integrity_segments = 0; 348af76e555SChristoph Hellwig #endif 349f6be4fb4SJens Axboe rq->timeout = 0; 350af76e555SChristoph Hellwig rq->end_io = NULL; 351af76e555SChristoph Hellwig rq->end_io_data = NULL; 352af76e555SChristoph Hellwig 3534f266f2bSPavel Begunkov blk_crypto_rq_set_defaults(rq); 3544f266f2bSPavel Begunkov INIT_LIST_HEAD(&rq->queuelist); 3554f266f2bSPavel Begunkov /* tag was already set */ 3564f266f2bSPavel Begunkov WRITE_ONCE(rq->deadline, 0); 35712f5b931SKeith Busch refcount_set(&rq->ref, 1); 3587ea4d8a4SChristoph Hellwig 3594f266f2bSPavel Begunkov if (rq->rq_flags & RQF_ELV) { 3607ea4d8a4SChristoph Hellwig struct elevator_queue *e = data->q->elevator; 3617ea4d8a4SChristoph Hellwig 3627ea4d8a4SChristoph Hellwig rq->elv.icq = NULL; 3634f266f2bSPavel Begunkov INIT_HLIST_NODE(&rq->hash); 3644f266f2bSPavel Begunkov RB_CLEAR_NODE(&rq->rb_node); 3654f266f2bSPavel Begunkov 3664f266f2bSPavel Begunkov if (!op_is_flush(data->cmd_flags) && 3674f266f2bSPavel Begunkov e->type->ops.prepare_request) { 3687ea4d8a4SChristoph Hellwig if (e->type->icq_cache) 3697ea4d8a4SChristoph Hellwig blk_mq_sched_assign_ioc(rq); 3707ea4d8a4SChristoph Hellwig 3717ea4d8a4SChristoph Hellwig e->type->ops.prepare_request(rq); 3727ea4d8a4SChristoph Hellwig rq->rq_flags |= RQF_ELVPRIV; 3737ea4d8a4SChristoph Hellwig } 3747ea4d8a4SChristoph Hellwig } 3757ea4d8a4SChristoph Hellwig 3765dee8577SChristoph Hellwig return rq; 3775dee8577SChristoph Hellwig } 3785dee8577SChristoph Hellwig 379349302daSJens Axboe static inline struct request * 380349302daSJens Axboe __blk_mq_alloc_requests_batch(struct blk_mq_alloc_data *data, 381349302daSJens Axboe u64 alloc_time_ns) 382349302daSJens Axboe { 383349302daSJens Axboe unsigned int tag, tag_offset; 384349302daSJens Axboe struct request *rq; 385349302daSJens Axboe unsigned long tags; 386349302daSJens Axboe int i, nr = 0; 387349302daSJens Axboe 388349302daSJens Axboe tags = blk_mq_get_tags(data, data->nr_tags, &tag_offset); 389349302daSJens Axboe if (unlikely(!tags)) 390349302daSJens Axboe return NULL; 391349302daSJens Axboe 392349302daSJens Axboe for (i = 0; tags; i++) { 393349302daSJens Axboe if (!(tags & (1UL << i))) 394349302daSJens Axboe continue; 395349302daSJens Axboe tag = tag_offset + i; 396349302daSJens Axboe tags &= ~(1UL << i); 397349302daSJens Axboe rq = blk_mq_rq_ctx_init(data, tag, alloc_time_ns); 398013a7f95SJens Axboe rq_list_add(data->cached_rq, rq); 399349302daSJens Axboe } 400349302daSJens Axboe data->nr_tags -= nr; 401349302daSJens Axboe 402013a7f95SJens Axboe return rq_list_pop(data->cached_rq); 403349302daSJens Axboe } 404349302daSJens Axboe 
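/*
 * Illustrative aside (not part of the original file): a minimal,
 * userspace-buildable sketch of the bit-mask walk that
 * __blk_mq_alloc_requests_batch() above performs on the tag mask
 * returned by blk_mq_get_tags().  All demo_* names and the use of
 * stdio are made up for illustration; in the kernel each set bit
 * becomes a request via blk_mq_rq_ctx_init() and is chained onto
 * data->cached_rq.
 */
#include <stdio.h>

static unsigned int demo_walk_tag_mask(unsigned long mask,
				       unsigned int tag_offset)
{
	unsigned int nr = 0;
	int i;

	for (i = 0; mask; i++) {
		if (!(mask & (1UL << i)))
			continue;
		mask &= ~(1UL << i);
		/* tag_offset + i is the tag handed back by the sbitmap */
		printf("allocated tag %u\n", tag_offset + i);
		nr++;
	}
	return nr;	/* caller subtracts this from the number it asked for */
}

int main(void)
{
	/* bits 0, 2 and 5 set, base offset 16 -> tags 16, 18 and 21 */
	demo_walk_tag_mask(0x25UL, 16);
	return 0;
}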
405b90cfaedSChristoph Hellwig static struct request *__blk_mq_alloc_requests(struct blk_mq_alloc_data *data) 406d2c0d383SChristoph Hellwig { 407e6e7abffSChristoph Hellwig struct request_queue *q = data->q; 408d2c0d383SChristoph Hellwig struct elevator_queue *e = q->elevator; 4096f816b4bSTejun Heo u64 alloc_time_ns = 0; 41047c122e3SJens Axboe struct request *rq; 411600c3b0cSChristoph Hellwig unsigned int tag; 412d2c0d383SChristoph Hellwig 4136f816b4bSTejun Heo /* alloc_time includes depth and tag waits */ 4146f816b4bSTejun Heo if (blk_queue_rq_alloc_time(q)) 4156f816b4bSTejun Heo alloc_time_ns = ktime_get_ns(); 4166f816b4bSTejun Heo 417f9afca4dSJens Axboe if (data->cmd_flags & REQ_NOWAIT) 41803a07c92SGoldwyn Rodrigues data->flags |= BLK_MQ_REQ_NOWAIT; 419d2c0d383SChristoph Hellwig 420d2c0d383SChristoph Hellwig if (e) { 421d2c0d383SChristoph Hellwig /* 4228d663f34SLin Feng * Flush/passthrough requests are special and go directly to the 42317a51199SJens Axboe * dispatch list. Don't include reserved tags in the 42417a51199SJens Axboe * limiting, as it isn't useful. 425d2c0d383SChristoph Hellwig */ 426f9afca4dSJens Axboe if (!op_is_flush(data->cmd_flags) && 4278d663f34SLin Feng !blk_op_is_passthrough(data->cmd_flags) && 428f9afca4dSJens Axboe e->type->ops.limit_depth && 42917a51199SJens Axboe !(data->flags & BLK_MQ_REQ_RESERVED)) 430f9afca4dSJens Axboe e->type->ops.limit_depth(data->cmd_flags, data); 431d2c0d383SChristoph Hellwig } 432d2c0d383SChristoph Hellwig 433bf0beec0SMing Lei retry: 434600c3b0cSChristoph Hellwig data->ctx = blk_mq_get_ctx(q); 435600c3b0cSChristoph Hellwig data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); 43642fdc5e4SChristoph Hellwig if (!e) 437600c3b0cSChristoph Hellwig blk_mq_tag_busy(data->hctx); 438600c3b0cSChristoph Hellwig 439bf0beec0SMing Lei /* 440349302daSJens Axboe * Try batched alloc if we want more than 1 tag. 441349302daSJens Axboe */ 442349302daSJens Axboe if (data->nr_tags > 1) { 443349302daSJens Axboe rq = __blk_mq_alloc_requests_batch(data, alloc_time_ns); 444349302daSJens Axboe if (rq) 445349302daSJens Axboe return rq; 446349302daSJens Axboe data->nr_tags = 1; 447349302daSJens Axboe } 448349302daSJens Axboe 449349302daSJens Axboe /* 450bf0beec0SMing Lei * Waiting allocations only fail because of an inactive hctx. In that 451bf0beec0SMing Lei * case just retry the hctx assignment and tag allocation as CPU hotplug 452bf0beec0SMing Lei * should have migrated us to an online CPU by now. 453bf0beec0SMing Lei */ 454e4cdf1a1SChristoph Hellwig tag = blk_mq_get_tag(data); 455b90cfaedSChristoph Hellwig if (tag == BLK_MQ_NO_TAG) { 456bf0beec0SMing Lei if (data->flags & BLK_MQ_REQ_NOWAIT) 457349302daSJens Axboe return NULL; 458bf0beec0SMing Lei /* 459b90cfaedSChristoph Hellwig * Give up the CPU and sleep for a random short time to 460b90cfaedSChristoph Hellwig * ensure that thread using a realtime scheduling class 461b90cfaedSChristoph Hellwig * are migrated off the CPU, and thus off the hctx that 462b90cfaedSChristoph Hellwig * is going away. 
463bf0beec0SMing Lei */ 464bf0beec0SMing Lei msleep(3); 465bf0beec0SMing Lei goto retry; 466b90cfaedSChristoph Hellwig } 467b90cfaedSChristoph Hellwig 468349302daSJens Axboe return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); 469bf0beec0SMing Lei } 47047c122e3SJens Axboe 471cd6ce148SBart Van Assche struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, 4729a95e4efSBart Van Assche blk_mq_req_flags_t flags) 473320ae51fSJens Axboe { 474e6e7abffSChristoph Hellwig struct blk_mq_alloc_data data = { 475e6e7abffSChristoph Hellwig .q = q, 476e6e7abffSChristoph Hellwig .flags = flags, 477e6e7abffSChristoph Hellwig .cmd_flags = op, 47847c122e3SJens Axboe .nr_tags = 1, 479e6e7abffSChristoph Hellwig }; 480bd166ef1SJens Axboe struct request *rq; 481a492f075SJoe Lawrence int ret; 482320ae51fSJens Axboe 4833a0a5299SBart Van Assche ret = blk_queue_enter(q, flags); 484a492f075SJoe Lawrence if (ret) 485a492f075SJoe Lawrence return ERR_PTR(ret); 486320ae51fSJens Axboe 487b90cfaedSChristoph Hellwig rq = __blk_mq_alloc_requests(&data); 488bd166ef1SJens Axboe if (!rq) 489a5ea5811SChristoph Hellwig goto out_queue_exit; 4900c4de0f3SChristoph Hellwig rq->__data_len = 0; 4910c4de0f3SChristoph Hellwig rq->__sector = (sector_t) -1; 4920c4de0f3SChristoph Hellwig rq->bio = rq->biotail = NULL; 493320ae51fSJens Axboe return rq; 494a5ea5811SChristoph Hellwig out_queue_exit: 495a5ea5811SChristoph Hellwig blk_queue_exit(q); 496a5ea5811SChristoph Hellwig return ERR_PTR(-EWOULDBLOCK); 497320ae51fSJens Axboe } 4984bb659b1SJens Axboe EXPORT_SYMBOL(blk_mq_alloc_request); 499320ae51fSJens Axboe 500cd6ce148SBart Van Assche struct request *blk_mq_alloc_request_hctx(struct request_queue *q, 5019a95e4efSBart Van Assche unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) 5021f5bd336SMing Lin { 503e6e7abffSChristoph Hellwig struct blk_mq_alloc_data data = { 504e6e7abffSChristoph Hellwig .q = q, 505e6e7abffSChristoph Hellwig .flags = flags, 506e6e7abffSChristoph Hellwig .cmd_flags = op, 50747c122e3SJens Axboe .nr_tags = 1, 508e6e7abffSChristoph Hellwig }; 509600c3b0cSChristoph Hellwig u64 alloc_time_ns = 0; 5106d2809d5SOmar Sandoval unsigned int cpu; 511600c3b0cSChristoph Hellwig unsigned int tag; 5121f5bd336SMing Lin int ret; 5131f5bd336SMing Lin 514600c3b0cSChristoph Hellwig /* alloc_time includes depth and tag waits */ 515600c3b0cSChristoph Hellwig if (blk_queue_rq_alloc_time(q)) 516600c3b0cSChristoph Hellwig alloc_time_ns = ktime_get_ns(); 517600c3b0cSChristoph Hellwig 5181f5bd336SMing Lin /* 5191f5bd336SMing Lin * If the tag allocator sleeps we could get an allocation for a 5201f5bd336SMing Lin * different hardware context. No need to complicate the low level 5211f5bd336SMing Lin * allocator for this for the rare use case of a command tied to 5221f5bd336SMing Lin * a specific queue. 5231f5bd336SMing Lin */ 524600c3b0cSChristoph Hellwig if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED)))) 5251f5bd336SMing Lin return ERR_PTR(-EINVAL); 5261f5bd336SMing Lin 5271f5bd336SMing Lin if (hctx_idx >= q->nr_hw_queues) 5281f5bd336SMing Lin return ERR_PTR(-EIO); 5291f5bd336SMing Lin 5303a0a5299SBart Van Assche ret = blk_queue_enter(q, flags); 5311f5bd336SMing Lin if (ret) 5321f5bd336SMing Lin return ERR_PTR(ret); 5331f5bd336SMing Lin 534c8712c6aSChristoph Hellwig /* 535c8712c6aSChristoph Hellwig * Check if the hardware context is actually mapped to anything. 536c8712c6aSChristoph Hellwig * If not tell the caller that it should skip this queue. 
537c8712c6aSChristoph Hellwig */ 538a5ea5811SChristoph Hellwig ret = -EXDEV; 539e6e7abffSChristoph Hellwig data.hctx = q->queue_hw_ctx[hctx_idx]; 540e6e7abffSChristoph Hellwig if (!blk_mq_hw_queue_mapped(data.hctx)) 541a5ea5811SChristoph Hellwig goto out_queue_exit; 542e6e7abffSChristoph Hellwig cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); 543e6e7abffSChristoph Hellwig data.ctx = __blk_mq_get_ctx(q, cpu); 5441f5bd336SMing Lin 54542fdc5e4SChristoph Hellwig if (!q->elevator) 546600c3b0cSChristoph Hellwig blk_mq_tag_busy(data.hctx); 547600c3b0cSChristoph Hellwig 548a5ea5811SChristoph Hellwig ret = -EWOULDBLOCK; 549600c3b0cSChristoph Hellwig tag = blk_mq_get_tag(&data); 550600c3b0cSChristoph Hellwig if (tag == BLK_MQ_NO_TAG) 551a5ea5811SChristoph Hellwig goto out_queue_exit; 552600c3b0cSChristoph Hellwig return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns); 553600c3b0cSChristoph Hellwig 554a5ea5811SChristoph Hellwig out_queue_exit: 555a5ea5811SChristoph Hellwig blk_queue_exit(q); 556a5ea5811SChristoph Hellwig return ERR_PTR(ret); 5571f5bd336SMing Lin } 5581f5bd336SMing Lin EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); 5591f5bd336SMing Lin 56012f5b931SKeith Busch static void __blk_mq_free_request(struct request *rq) 56112f5b931SKeith Busch { 56212f5b931SKeith Busch struct request_queue *q = rq->q; 56312f5b931SKeith Busch struct blk_mq_ctx *ctx = rq->mq_ctx; 564ea4f995eSJens Axboe struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 56512f5b931SKeith Busch const int sched_tag = rq->internal_tag; 56612f5b931SKeith Busch 567a892c8d5SSatya Tangirala blk_crypto_free_request(rq); 568986d413bSBart Van Assche blk_pm_mark_last_busy(rq); 569ea4f995eSJens Axboe rq->mq_hctx = NULL; 57076647368SChristoph Hellwig if (rq->tag != BLK_MQ_NO_TAG) 571cae740a0SJohn Garry blk_mq_put_tag(hctx->tags, ctx, rq->tag); 57276647368SChristoph Hellwig if (sched_tag != BLK_MQ_NO_TAG) 573cae740a0SJohn Garry blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); 57412f5b931SKeith Busch blk_mq_sched_restart(hctx); 57512f5b931SKeith Busch blk_queue_exit(q); 57612f5b931SKeith Busch } 57712f5b931SKeith Busch 5786af54051SChristoph Hellwig void blk_mq_free_request(struct request *rq) 579320ae51fSJens Axboe { 580320ae51fSJens Axboe struct request_queue *q = rq->q; 581ea4f995eSJens Axboe struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 582320ae51fSJens Axboe 583e0d78afeSJens Axboe if (rq->rq_flags & RQF_ELVPRIV) { 5842ff0682dSJens Axboe struct elevator_queue *e = q->elevator; 5852ff0682dSJens Axboe 5862ff0682dSJens Axboe if (e->type->ops.finish_request) 587f9cd4bfeSJens Axboe e->type->ops.finish_request(rq); 5886af54051SChristoph Hellwig if (rq->elv.icq) { 5896af54051SChristoph Hellwig put_io_context(rq->elv.icq->ioc); 5906af54051SChristoph Hellwig rq->elv.icq = NULL; 5916af54051SChristoph Hellwig } 5926af54051SChristoph Hellwig } 5936af54051SChristoph Hellwig 594e8064021SChristoph Hellwig if (rq->rq_flags & RQF_MQ_INFLIGHT) 595bccf5e26SJohn Garry __blk_mq_dec_active_requests(hctx); 59687760e5eSJens Axboe 5977beb2f84SJens Axboe if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) 598d152c682SChristoph Hellwig laptop_io_completion(q->disk->bdi); 5997beb2f84SJens Axboe 600a7905043SJosef Bacik rq_qos_done(q, rq); 6010d2602caSJens Axboe 60212f5b931SKeith Busch WRITE_ONCE(rq->state, MQ_RQ_IDLE); 60312f5b931SKeith Busch if (refcount_dec_and_test(&rq->ref)) 60412f5b931SKeith Busch __blk_mq_free_request(rq); 605320ae51fSJens Axboe } 6061a3b595aSJens Axboe EXPORT_SYMBOL_GPL(blk_mq_free_request); 607320ae51fSJens Axboe 60847c122e3SJens Axboe void 
blk_mq_free_plug_rqs(struct blk_plug *plug) 60947c122e3SJens Axboe { 61047c122e3SJens Axboe struct request *rq; 61147c122e3SJens Axboe 612013a7f95SJens Axboe while ((rq = rq_list_pop(&plug->cached_rq)) != NULL) { 61347c122e3SJens Axboe percpu_ref_get(&rq->q->q_usage_counter); 61447c122e3SJens Axboe blk_mq_free_request(rq); 61547c122e3SJens Axboe } 61647c122e3SJens Axboe } 61747c122e3SJens Axboe 6189be3e06fSJens Axboe static void req_bio_endio(struct request *rq, struct bio *bio, 6199be3e06fSJens Axboe unsigned int nbytes, blk_status_t error) 6209be3e06fSJens Axboe { 6219be3e06fSJens Axboe if (error) 6229be3e06fSJens Axboe bio->bi_status = error; 6239be3e06fSJens Axboe 6249be3e06fSJens Axboe if (unlikely(rq->rq_flags & RQF_QUIET)) 6259be3e06fSJens Axboe bio_set_flag(bio, BIO_QUIET); 6269be3e06fSJens Axboe 6279be3e06fSJens Axboe bio_advance(bio, nbytes); 6289be3e06fSJens Axboe 6299be3e06fSJens Axboe if (req_op(rq) == REQ_OP_ZONE_APPEND && error == BLK_STS_OK) { 6309be3e06fSJens Axboe /* 6319be3e06fSJens Axboe * Partial zone append completions cannot be supported as the 6329be3e06fSJens Axboe * BIO fragments may end up not being written sequentially. 6339be3e06fSJens Axboe */ 6349be3e06fSJens Axboe if (bio->bi_iter.bi_size) 6359be3e06fSJens Axboe bio->bi_status = BLK_STS_IOERR; 6369be3e06fSJens Axboe else 6379be3e06fSJens Axboe bio->bi_iter.bi_sector = rq->__sector; 6389be3e06fSJens Axboe } 6399be3e06fSJens Axboe 6409be3e06fSJens Axboe /* don't actually finish bio if it's part of flush sequence */ 6419be3e06fSJens Axboe if (bio->bi_iter.bi_size == 0 && !(rq->rq_flags & RQF_FLUSH_SEQ)) 6429be3e06fSJens Axboe bio_endio(bio); 6439be3e06fSJens Axboe } 6449be3e06fSJens Axboe 6459be3e06fSJens Axboe static void blk_account_io_completion(struct request *req, unsigned int bytes) 6469be3e06fSJens Axboe { 6479be3e06fSJens Axboe if (req->part && blk_do_io_stat(req)) { 6489be3e06fSJens Axboe const int sgrp = op_stat_group(req_op(req)); 6499be3e06fSJens Axboe 6509be3e06fSJens Axboe part_stat_lock(); 6519be3e06fSJens Axboe part_stat_add(req->part, sectors[sgrp], bytes >> 9); 6529be3e06fSJens Axboe part_stat_unlock(); 6539be3e06fSJens Axboe } 6549be3e06fSJens Axboe } 6559be3e06fSJens Axboe 6569be3e06fSJens Axboe /** 6579be3e06fSJens Axboe * blk_update_request - Complete multiple bytes without completing the request 6589be3e06fSJens Axboe * @req: the request being processed 6599be3e06fSJens Axboe * @error: block status code 6609be3e06fSJens Axboe * @nr_bytes: number of bytes to complete for @req 6619be3e06fSJens Axboe * 6629be3e06fSJens Axboe * Description: 6639be3e06fSJens Axboe * Ends I/O on a number of bytes attached to @req, but doesn't complete 6649be3e06fSJens Axboe * the request structure even if @req doesn't have leftover. 6659be3e06fSJens Axboe * If @req has leftover, sets it up for the next range of segments. 6669be3e06fSJens Axboe * 6679be3e06fSJens Axboe * Passing the result of blk_rq_bytes() as @nr_bytes guarantees 6689be3e06fSJens Axboe * %false return from this function. 6699be3e06fSJens Axboe * 6709be3e06fSJens Axboe * Note: 6719be3e06fSJens Axboe * The RQF_SPECIAL_PAYLOAD flag is ignored on purpose in this function 6729be3e06fSJens Axboe * except in the consistency check at the end of this function. 
6739be3e06fSJens Axboe * 6749be3e06fSJens Axboe * Return: 6759be3e06fSJens Axboe * %false - this request doesn't have any more data 6769be3e06fSJens Axboe * %true - this request has more data 6779be3e06fSJens Axboe **/ 6789be3e06fSJens Axboe bool blk_update_request(struct request *req, blk_status_t error, 6799be3e06fSJens Axboe unsigned int nr_bytes) 6809be3e06fSJens Axboe { 6819be3e06fSJens Axboe int total_bytes; 6829be3e06fSJens Axboe 683*8a7d267bSChristoph Hellwig trace_block_rq_complete(req, error, nr_bytes); 6849be3e06fSJens Axboe 6859be3e06fSJens Axboe if (!req->bio) 6869be3e06fSJens Axboe return false; 6879be3e06fSJens Axboe 6889be3e06fSJens Axboe #ifdef CONFIG_BLK_DEV_INTEGRITY 6899be3e06fSJens Axboe if (blk_integrity_rq(req) && req_op(req) == REQ_OP_READ && 6909be3e06fSJens Axboe error == BLK_STS_OK) 6919be3e06fSJens Axboe req->q->integrity.profile->complete_fn(req, nr_bytes); 6929be3e06fSJens Axboe #endif 6939be3e06fSJens Axboe 6949be3e06fSJens Axboe if (unlikely(error && !blk_rq_is_passthrough(req) && 6959be3e06fSJens Axboe !(req->rq_flags & RQF_QUIET))) 6969be3e06fSJens Axboe blk_print_req_error(req, error); 6979be3e06fSJens Axboe 6989be3e06fSJens Axboe blk_account_io_completion(req, nr_bytes); 6999be3e06fSJens Axboe 7009be3e06fSJens Axboe total_bytes = 0; 7019be3e06fSJens Axboe while (req->bio) { 7029be3e06fSJens Axboe struct bio *bio = req->bio; 7039be3e06fSJens Axboe unsigned bio_bytes = min(bio->bi_iter.bi_size, nr_bytes); 7049be3e06fSJens Axboe 7059be3e06fSJens Axboe if (bio_bytes == bio->bi_iter.bi_size) 7069be3e06fSJens Axboe req->bio = bio->bi_next; 7079be3e06fSJens Axboe 7089be3e06fSJens Axboe /* Completion has already been traced */ 7099be3e06fSJens Axboe bio_clear_flag(bio, BIO_TRACE_COMPLETION); 7109be3e06fSJens Axboe req_bio_endio(req, bio, bio_bytes, error); 7119be3e06fSJens Axboe 7129be3e06fSJens Axboe total_bytes += bio_bytes; 7139be3e06fSJens Axboe nr_bytes -= bio_bytes; 7149be3e06fSJens Axboe 7159be3e06fSJens Axboe if (!nr_bytes) 7169be3e06fSJens Axboe break; 7179be3e06fSJens Axboe } 7189be3e06fSJens Axboe 7199be3e06fSJens Axboe /* 7209be3e06fSJens Axboe * completely done 7219be3e06fSJens Axboe */ 7229be3e06fSJens Axboe if (!req->bio) { 7239be3e06fSJens Axboe /* 7249be3e06fSJens Axboe * Reset counters so that the request stacking driver 7259be3e06fSJens Axboe * can find how many bytes remain in the request 7269be3e06fSJens Axboe * later. 7279be3e06fSJens Axboe */ 7289be3e06fSJens Axboe req->__data_len = 0; 7299be3e06fSJens Axboe return false; 7309be3e06fSJens Axboe } 7319be3e06fSJens Axboe 7329be3e06fSJens Axboe req->__data_len -= total_bytes; 7339be3e06fSJens Axboe 7349be3e06fSJens Axboe /* update sector only for requests with clear definition of sector */ 7359be3e06fSJens Axboe if (!blk_rq_is_passthrough(req)) 7369be3e06fSJens Axboe req->__sector += total_bytes >> 9; 7379be3e06fSJens Axboe 7389be3e06fSJens Axboe /* mixed attributes always follow the first bio */ 7399be3e06fSJens Axboe if (req->rq_flags & RQF_MIXED_MERGE) { 7409be3e06fSJens Axboe req->cmd_flags &= ~REQ_FAILFAST_MASK; 7419be3e06fSJens Axboe req->cmd_flags |= req->bio->bi_opf & REQ_FAILFAST_MASK; 7429be3e06fSJens Axboe } 7439be3e06fSJens Axboe 7449be3e06fSJens Axboe if (!(req->rq_flags & RQF_SPECIAL_PAYLOAD)) { 7459be3e06fSJens Axboe /* 7469be3e06fSJens Axboe * If total number of sectors is less than the first segment 7479be3e06fSJens Axboe * size, something has gone terribly wrong. 
7489be3e06fSJens Axboe */ 7499be3e06fSJens Axboe if (blk_rq_bytes(req) < blk_rq_cur_bytes(req)) { 7509be3e06fSJens Axboe blk_dump_rq_flags(req, "request botched"); 7519be3e06fSJens Axboe req->__data_len = blk_rq_cur_bytes(req); 7529be3e06fSJens Axboe } 7539be3e06fSJens Axboe 7549be3e06fSJens Axboe /* recalculate the number of segments */ 7559be3e06fSJens Axboe req->nr_phys_segments = blk_recalc_rq_segments(req); 7569be3e06fSJens Axboe } 7579be3e06fSJens Axboe 7589be3e06fSJens Axboe return true; 7599be3e06fSJens Axboe } 7609be3e06fSJens Axboe EXPORT_SYMBOL_GPL(blk_update_request); 7619be3e06fSJens Axboe 762f794f335SJens Axboe static inline void __blk_mq_end_request_acct(struct request *rq, u64 now) 763320ae51fSJens Axboe { 7644bc6339aSOmar Sandoval if (rq->rq_flags & RQF_STATS) { 7654bc6339aSOmar Sandoval blk_mq_poll_stats_start(rq->q); 766522a7775SOmar Sandoval blk_stat_add(rq, now); 7674bc6339aSOmar Sandoval } 7684bc6339aSOmar Sandoval 769ed88660aSOmar Sandoval blk_mq_sched_completed_request(rq, now); 770522a7775SOmar Sandoval blk_account_io_done(rq, now); 7718971a3b7SPavel Begunkov } 7720d11e6acSMing Lei 773f794f335SJens Axboe inline void __blk_mq_end_request(struct request *rq, blk_status_t error) 774f794f335SJens Axboe { 775f794f335SJens Axboe if (blk_mq_need_time_stamp(rq)) 776f794f335SJens Axboe __blk_mq_end_request_acct(rq, ktime_get_ns()); 777f794f335SJens Axboe 77891b63639SChristoph Hellwig if (rq->end_io) { 779a7905043SJosef Bacik rq_qos_done(rq->q, rq); 780320ae51fSJens Axboe rq->end_io(rq, error); 78191b63639SChristoph Hellwig } else { 782320ae51fSJens Axboe blk_mq_free_request(rq); 783320ae51fSJens Axboe } 78491b63639SChristoph Hellwig } 785c8a446adSChristoph Hellwig EXPORT_SYMBOL(__blk_mq_end_request); 78663151a44SChristoph Hellwig 7872a842acaSChristoph Hellwig void blk_mq_end_request(struct request *rq, blk_status_t error) 78863151a44SChristoph Hellwig { 78963151a44SChristoph Hellwig if (blk_update_request(rq, error, blk_rq_bytes(rq))) 79063151a44SChristoph Hellwig BUG(); 791c8a446adSChristoph Hellwig __blk_mq_end_request(rq, error); 79263151a44SChristoph Hellwig } 793c8a446adSChristoph Hellwig EXPORT_SYMBOL(blk_mq_end_request); 794320ae51fSJens Axboe 795f794f335SJens Axboe #define TAG_COMP_BATCH 32 796f794f335SJens Axboe 797f794f335SJens Axboe static inline void blk_mq_flush_tag_batch(struct blk_mq_hw_ctx *hctx, 798f794f335SJens Axboe int *tag_array, int nr_tags) 799f794f335SJens Axboe { 800f794f335SJens Axboe struct request_queue *q = hctx->queue; 801f794f335SJens Axboe 802f794f335SJens Axboe blk_mq_put_tags(hctx->tags, tag_array, nr_tags); 803f794f335SJens Axboe percpu_ref_put_many(&q->q_usage_counter, nr_tags); 804f794f335SJens Axboe } 805f794f335SJens Axboe 806f794f335SJens Axboe void blk_mq_end_request_batch(struct io_comp_batch *iob) 807f794f335SJens Axboe { 808f794f335SJens Axboe int tags[TAG_COMP_BATCH], nr_tags = 0; 809f794f335SJens Axboe struct blk_mq_hw_ctx *last_hctx = NULL; 810f794f335SJens Axboe struct request *rq; 811f794f335SJens Axboe u64 now = 0; 812f794f335SJens Axboe 813f794f335SJens Axboe if (iob->need_ts) 814f794f335SJens Axboe now = ktime_get_ns(); 815f794f335SJens Axboe 816f794f335SJens Axboe while ((rq = rq_list_pop(&iob->req_list)) != NULL) { 817f794f335SJens Axboe prefetch(rq->bio); 818f794f335SJens Axboe prefetch(rq->rq_next); 819f794f335SJens Axboe 820f794f335SJens Axboe blk_update_request(rq, BLK_STS_OK, blk_rq_bytes(rq)); 821f794f335SJens Axboe if (iob->need_ts) 822f794f335SJens Axboe __blk_mq_end_request_acct(rq, now); 823f794f335SJens Axboe 
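		/*
		 * The rest of this loop iteration retires the request: mark it
		 * idle and drop what is normally the last reference.  If some
		 * other holder still owns a reference (e.g. the timeout
		 * iterator, which drops it via blk_mq_put_rq_ref()), freeing
		 * happens there instead.  Otherwise do the crypto, runtime-PM
		 * and rq_qos bookkeeping here and stash the driver tag; tags
		 * and queue usage references are released in batches of up to
		 * TAG_COMP_BATCH, flushing early whenever the batch fills up
		 * or the next request belongs to a different hctx.
		 */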
824f794f335SJens Axboe WRITE_ONCE(rq->state, MQ_RQ_IDLE); 825f794f335SJens Axboe if (!refcount_dec_and_test(&rq->ref)) 826f794f335SJens Axboe continue; 827f794f335SJens Axboe 828f794f335SJens Axboe blk_crypto_free_request(rq); 829f794f335SJens Axboe blk_pm_mark_last_busy(rq); 830f794f335SJens Axboe rq_qos_done(rq->q, rq); 831f794f335SJens Axboe 832f794f335SJens Axboe if (nr_tags == TAG_COMP_BATCH || 833f794f335SJens Axboe (last_hctx && last_hctx != rq->mq_hctx)) { 834f794f335SJens Axboe blk_mq_flush_tag_batch(last_hctx, tags, nr_tags); 835f794f335SJens Axboe nr_tags = 0; 836f794f335SJens Axboe } 837f794f335SJens Axboe tags[nr_tags++] = rq->tag; 838f794f335SJens Axboe last_hctx = rq->mq_hctx; 839f794f335SJens Axboe } 840f794f335SJens Axboe 841f794f335SJens Axboe if (nr_tags) 842f794f335SJens Axboe blk_mq_flush_tag_batch(last_hctx, tags, nr_tags); 843f794f335SJens Axboe } 844f794f335SJens Axboe EXPORT_SYMBOL_GPL(blk_mq_end_request_batch); 845f794f335SJens Axboe 846f9ab4918SSebastian Andrzej Siewior static void blk_complete_reqs(struct llist_head *list) 847c3077b5dSChristoph Hellwig { 848f9ab4918SSebastian Andrzej Siewior struct llist_node *entry = llist_reverse_order(llist_del_all(list)); 849f9ab4918SSebastian Andrzej Siewior struct request *rq, *next; 850c3077b5dSChristoph Hellwig 851f9ab4918SSebastian Andrzej Siewior llist_for_each_entry_safe(rq, next, entry, ipi_list) 852c3077b5dSChristoph Hellwig rq->q->mq_ops->complete(rq); 853c3077b5dSChristoph Hellwig } 854c3077b5dSChristoph Hellwig 855f9ab4918SSebastian Andrzej Siewior static __latent_entropy void blk_done_softirq(struct softirq_action *h) 856115243f5SChristoph Hellwig { 857f9ab4918SSebastian Andrzej Siewior blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); 858c3077b5dSChristoph Hellwig } 859c3077b5dSChristoph Hellwig 860c3077b5dSChristoph Hellwig static int blk_softirq_cpu_dead(unsigned int cpu) 861c3077b5dSChristoph Hellwig { 862f9ab4918SSebastian Andrzej Siewior blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); 863c3077b5dSChristoph Hellwig return 0; 864c3077b5dSChristoph Hellwig } 865c3077b5dSChristoph Hellwig 86630a91cb4SChristoph Hellwig static void __blk_mq_complete_request_remote(void *data) 867320ae51fSJens Axboe { 868f9ab4918SSebastian Andrzej Siewior __raise_softirq_irqoff(BLOCK_SOFTIRQ); 86936e76539SMing Lei } 87036e76539SMing Lei 87196339526SChristoph Hellwig static inline bool blk_mq_complete_need_ipi(struct request *rq) 87296339526SChristoph Hellwig { 87396339526SChristoph Hellwig int cpu = raw_smp_processor_id(); 87496339526SChristoph Hellwig 87596339526SChristoph Hellwig if (!IS_ENABLED(CONFIG_SMP) || 87696339526SChristoph Hellwig !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) 87796339526SChristoph Hellwig return false; 87871425189SSebastian Andrzej Siewior /* 87971425189SSebastian Andrzej Siewior * With force threaded interrupts enabled, raising softirq from an SMP 88071425189SSebastian Andrzej Siewior * function call will always result in waking the ksoftirqd thread. 88171425189SSebastian Andrzej Siewior * This is probably worse than completing the request on a different 88271425189SSebastian Andrzej Siewior * cache domain. 88371425189SSebastian Andrzej Siewior */ 88491cc470eSTanner Love if (force_irqthreads()) 88571425189SSebastian Andrzej Siewior return false; 88696339526SChristoph Hellwig 88796339526SChristoph Hellwig /* same CPU or cache domain? 
Complete locally */
88896339526SChristoph Hellwig 	if (cpu == rq->mq_ctx->cpu ||
88996339526SChristoph Hellwig 	    (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) &&
89096339526SChristoph Hellwig 	     cpus_share_cache(cpu, rq->mq_ctx->cpu)))
89196339526SChristoph Hellwig 		return false;
89296339526SChristoph Hellwig 
89396339526SChristoph Hellwig 	/* don't try to IPI to an offline CPU */
89496339526SChristoph Hellwig 	return cpu_online(rq->mq_ctx->cpu);
89596339526SChristoph Hellwig }
89696339526SChristoph Hellwig 
897f9ab4918SSebastian Andrzej Siewior static void blk_mq_complete_send_ipi(struct request *rq)
898f9ab4918SSebastian Andrzej Siewior {
899f9ab4918SSebastian Andrzej Siewior 	struct llist_head *list;
900f9ab4918SSebastian Andrzej Siewior 	unsigned int cpu;
901f9ab4918SSebastian Andrzej Siewior 
902f9ab4918SSebastian Andrzej Siewior 	cpu = rq->mq_ctx->cpu;
903f9ab4918SSebastian Andrzej Siewior 	list = &per_cpu(blk_cpu_done, cpu);
904f9ab4918SSebastian Andrzej Siewior 	if (llist_add(&rq->ipi_list, list)) {
905f9ab4918SSebastian Andrzej Siewior 		INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq);
906f9ab4918SSebastian Andrzej Siewior 		smp_call_function_single_async(cpu, &rq->csd);
907f9ab4918SSebastian Andrzej Siewior 	}
908f9ab4918SSebastian Andrzej Siewior }
909f9ab4918SSebastian Andrzej Siewior 
910f9ab4918SSebastian Andrzej Siewior static void blk_mq_raise_softirq(struct request *rq)
911f9ab4918SSebastian Andrzej Siewior {
912f9ab4918SSebastian Andrzej Siewior 	struct llist_head *list;
913f9ab4918SSebastian Andrzej Siewior 
914f9ab4918SSebastian Andrzej Siewior 	preempt_disable();
915f9ab4918SSebastian Andrzej Siewior 	list = this_cpu_ptr(&blk_cpu_done);
916f9ab4918SSebastian Andrzej Siewior 	if (llist_add(&rq->ipi_list, list))
917f9ab4918SSebastian Andrzej Siewior 		raise_softirq(BLOCK_SOFTIRQ);
918f9ab4918SSebastian Andrzej Siewior 	preempt_enable();
919f9ab4918SSebastian Andrzej Siewior }
920f9ab4918SSebastian Andrzej Siewior 
92140d09b53SChristoph Hellwig bool blk_mq_complete_request_remote(struct request *rq)
92240d09b53SChristoph Hellwig {
92340d09b53SChristoph Hellwig 	WRITE_ONCE(rq->state, MQ_RQ_COMPLETE);
92440d09b53SChristoph Hellwig 
9254ab32bf3SJens Axboe 	/*
9264ab32bf3SJens Axboe 	 * For a polled request, always complete locally; it's pointless
9274ab32bf3SJens Axboe 	 * to redirect the completion.
9284ab32bf3SJens Axboe 	 */
9296ce913feSChristoph Hellwig 	if (rq->cmd_flags & REQ_POLLED)
93040d09b53SChristoph Hellwig 		return false;
931320ae51fSJens Axboe 
93240d09b53SChristoph Hellwig 	if (blk_mq_complete_need_ipi(rq)) {
933f9ab4918SSebastian Andrzej Siewior 		blk_mq_complete_send_ipi(rq);
934f9ab4918SSebastian Andrzej Siewior 		return true;
9353d6efbf6SChristoph Hellwig 	}
93640d09b53SChristoph Hellwig 
937f9ab4918SSebastian Andrzej Siewior 	if (rq->q->nr_hw_queues == 1) {
938f9ab4918SSebastian Andrzej Siewior 		blk_mq_raise_softirq(rq);
93940d09b53SChristoph Hellwig 		return true;
940320ae51fSJens Axboe 	}
941f9ab4918SSebastian Andrzej Siewior 	return false;
942f9ab4918SSebastian Andrzej Siewior }
94340d09b53SChristoph Hellwig EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote);
94440d09b53SChristoph Hellwig 
945320ae51fSJens Axboe /**
94615f73f5bSChristoph Hellwig  * blk_mq_complete_request - end I/O on a request
94715f73f5bSChristoph Hellwig  * @rq: the request being processed
948320ae51fSJens Axboe  *
94915f73f5bSChristoph Hellwig  * Description:
95015f73f5bSChristoph Hellwig  *	Complete a request by scheduling the ->complete_rq operation.
95115f73f5bSChristoph Hellwig **/ 95215f73f5bSChristoph Hellwig void blk_mq_complete_request(struct request *rq) 953320ae51fSJens Axboe { 95440d09b53SChristoph Hellwig if (!blk_mq_complete_request_remote(rq)) 95596339526SChristoph Hellwig rq->q->mq_ops->complete(rq); 956320ae51fSJens Axboe } 95715f73f5bSChristoph Hellwig EXPORT_SYMBOL(blk_mq_complete_request); 95830a91cb4SChristoph Hellwig 95904ced159SJens Axboe static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) 960b7435db8SBart Van Assche __releases(hctx->srcu) 96104ced159SJens Axboe { 96204ced159SJens Axboe if (!(hctx->flags & BLK_MQ_F_BLOCKING)) 96304ced159SJens Axboe rcu_read_unlock(); 96404ced159SJens Axboe else 96505707b64STejun Heo srcu_read_unlock(hctx->srcu, srcu_idx); 96604ced159SJens Axboe } 96704ced159SJens Axboe 96804ced159SJens Axboe static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) 969b7435db8SBart Van Assche __acquires(hctx->srcu) 97004ced159SJens Axboe { 97108b5a6e2SJens Axboe if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 97208b5a6e2SJens Axboe /* shut up gcc false positive */ 97308b5a6e2SJens Axboe *srcu_idx = 0; 97404ced159SJens Axboe rcu_read_lock(); 97508b5a6e2SJens Axboe } else 97605707b64STejun Heo *srcu_idx = srcu_read_lock(hctx->srcu); 97704ced159SJens Axboe } 97804ced159SJens Axboe 97930a91cb4SChristoph Hellwig /** 980105663f7SAndré Almeida * blk_mq_start_request - Start processing a request 981105663f7SAndré Almeida * @rq: Pointer to request to be started 982105663f7SAndré Almeida * 983105663f7SAndré Almeida * Function used by device drivers to notify the block layer that a request 984105663f7SAndré Almeida * is going to be processed now, so blk layer can do proper initializations 985105663f7SAndré Almeida * such as starting the timeout timer. 986105663f7SAndré Almeida */ 987e2490073SChristoph Hellwig void blk_mq_start_request(struct request *rq) 988320ae51fSJens Axboe { 989320ae51fSJens Axboe struct request_queue *q = rq->q; 990320ae51fSJens Axboe 991a54895faSChristoph Hellwig trace_block_rq_issue(rq); 992320ae51fSJens Axboe 993cf43e6beSJens Axboe if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { 99400067077SJens Axboe u64 start_time; 99500067077SJens Axboe #ifdef CONFIG_BLK_CGROUP 99600067077SJens Axboe if (rq->bio) 99700067077SJens Axboe start_time = bio_issue_time(&rq->bio->bi_issue); 99800067077SJens Axboe else 99900067077SJens Axboe #endif 100000067077SJens Axboe start_time = ktime_get_ns(); 100100067077SJens Axboe rq->io_start_time_ns = start_time; 10023d244306SHou Tao rq->stats_sectors = blk_rq_sectors(rq); 1003cf43e6beSJens Axboe rq->rq_flags |= RQF_STATS; 1004a7905043SJosef Bacik rq_qos_issue(q, rq); 1005cf43e6beSJens Axboe } 1006cf43e6beSJens Axboe 10071d9bd516STejun Heo WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); 1008538b7534SJens Axboe 1009538b7534SJens Axboe blk_add_timer(rq); 101012f5b931SKeith Busch WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); 101149f5baa5SChristoph Hellwig 101254d4e6abSMax Gurtovoy #ifdef CONFIG_BLK_DEV_INTEGRITY 101354d4e6abSMax Gurtovoy if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) 101454d4e6abSMax Gurtovoy q->integrity.profile->prepare_fn(rq); 101554d4e6abSMax Gurtovoy #endif 10163e08773cSChristoph Hellwig if (rq->bio && rq->bio->bi_opf & REQ_POLLED) 10173e08773cSChristoph Hellwig WRITE_ONCE(rq->bio->bi_cookie, blk_rq_to_qc(rq)); 1018320ae51fSJens Axboe } 1019e2490073SChristoph Hellwig EXPORT_SYMBOL(blk_mq_start_request); 1020320ae51fSJens Axboe 1021ed0791b2SChristoph Hellwig static void __blk_mq_requeue_request(struct request *rq) 
1022320ae51fSJens Axboe { 1023320ae51fSJens Axboe struct request_queue *q = rq->q; 1024320ae51fSJens Axboe 1025923218f6SMing Lei blk_mq_put_driver_tag(rq); 1026923218f6SMing Lei 1027a54895faSChristoph Hellwig trace_block_rq_requeue(rq); 1028a7905043SJosef Bacik rq_qos_requeue(q, rq); 102949f5baa5SChristoph Hellwig 103012f5b931SKeith Busch if (blk_mq_request_started(rq)) { 103112f5b931SKeith Busch WRITE_ONCE(rq->state, MQ_RQ_IDLE); 1032da661267SChristoph Hellwig rq->rq_flags &= ~RQF_TIMED_OUT; 1033320ae51fSJens Axboe } 1034e2490073SChristoph Hellwig } 1035320ae51fSJens Axboe 10362b053acaSBart Van Assche void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) 1037ed0791b2SChristoph Hellwig { 1038ed0791b2SChristoph Hellwig __blk_mq_requeue_request(rq); 1039ed0791b2SChristoph Hellwig 1040105976f5SMing Lei /* this request will be re-inserted to io scheduler queue */ 1041105976f5SMing Lei blk_mq_sched_requeue_request(rq); 1042105976f5SMing Lei 10437d692330SJens Axboe BUG_ON(!list_empty(&rq->queuelist)); 10442b053acaSBart Van Assche blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); 1045ed0791b2SChristoph Hellwig } 1046ed0791b2SChristoph Hellwig EXPORT_SYMBOL(blk_mq_requeue_request); 1047ed0791b2SChristoph Hellwig 10486fca6a61SChristoph Hellwig static void blk_mq_requeue_work(struct work_struct *work) 10496fca6a61SChristoph Hellwig { 10506fca6a61SChristoph Hellwig struct request_queue *q = 10512849450aSMike Snitzer container_of(work, struct request_queue, requeue_work.work); 10526fca6a61SChristoph Hellwig LIST_HEAD(rq_list); 10536fca6a61SChristoph Hellwig struct request *rq, *next; 10546fca6a61SChristoph Hellwig 105518e9781dSJens Axboe spin_lock_irq(&q->requeue_lock); 10566fca6a61SChristoph Hellwig list_splice_init(&q->requeue_list, &rq_list); 105718e9781dSJens Axboe spin_unlock_irq(&q->requeue_lock); 10586fca6a61SChristoph Hellwig 10596fca6a61SChristoph Hellwig list_for_each_entry_safe(rq, next, &rq_list, queuelist) { 1060aef1897cSJianchao Wang if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) 10616fca6a61SChristoph Hellwig continue; 10626fca6a61SChristoph Hellwig 1063e8064021SChristoph Hellwig rq->rq_flags &= ~RQF_SOFTBARRIER; 10646fca6a61SChristoph Hellwig list_del_init(&rq->queuelist); 1065aef1897cSJianchao Wang /* 1066aef1897cSJianchao Wang * If RQF_DONTPREP, rq has contained some driver specific 1067aef1897cSJianchao Wang * data, so insert it to hctx dispatch list to avoid any 1068aef1897cSJianchao Wang * merge. 
1069aef1897cSJianchao Wang */ 1070aef1897cSJianchao Wang if (rq->rq_flags & RQF_DONTPREP) 107101e99aecSMing Lei blk_mq_request_bypass_insert(rq, false, false); 1072aef1897cSJianchao Wang else 10739e97d295SMike Snitzer blk_mq_sched_insert_request(rq, true, false, false); 10746fca6a61SChristoph Hellwig } 10756fca6a61SChristoph Hellwig 10766fca6a61SChristoph Hellwig while (!list_empty(&rq_list)) { 10776fca6a61SChristoph Hellwig rq = list_entry(rq_list.next, struct request, queuelist); 10786fca6a61SChristoph Hellwig list_del_init(&rq->queuelist); 10799e97d295SMike Snitzer blk_mq_sched_insert_request(rq, false, false, false); 10806fca6a61SChristoph Hellwig } 10816fca6a61SChristoph Hellwig 108252d7f1b5SBart Van Assche blk_mq_run_hw_queues(q, false); 10836fca6a61SChristoph Hellwig } 10846fca6a61SChristoph Hellwig 10852b053acaSBart Van Assche void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, 10862b053acaSBart Van Assche bool kick_requeue_list) 10876fca6a61SChristoph Hellwig { 10886fca6a61SChristoph Hellwig struct request_queue *q = rq->q; 10896fca6a61SChristoph Hellwig unsigned long flags; 10906fca6a61SChristoph Hellwig 10916fca6a61SChristoph Hellwig /* 10926fca6a61SChristoph Hellwig * We abuse this flag that is otherwise used by the I/O scheduler to 1093ff821d27SJens Axboe * request head insertion from the workqueue. 10946fca6a61SChristoph Hellwig */ 1095e8064021SChristoph Hellwig BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); 10966fca6a61SChristoph Hellwig 10976fca6a61SChristoph Hellwig spin_lock_irqsave(&q->requeue_lock, flags); 10986fca6a61SChristoph Hellwig if (at_head) { 1099e8064021SChristoph Hellwig rq->rq_flags |= RQF_SOFTBARRIER; 11006fca6a61SChristoph Hellwig list_add(&rq->queuelist, &q->requeue_list); 11016fca6a61SChristoph Hellwig } else { 11026fca6a61SChristoph Hellwig list_add_tail(&rq->queuelist, &q->requeue_list); 11036fca6a61SChristoph Hellwig } 11046fca6a61SChristoph Hellwig spin_unlock_irqrestore(&q->requeue_lock, flags); 11052b053acaSBart Van Assche 11062b053acaSBart Van Assche if (kick_requeue_list) 11072b053acaSBart Van Assche blk_mq_kick_requeue_list(q); 11086fca6a61SChristoph Hellwig } 11096fca6a61SChristoph Hellwig 11106fca6a61SChristoph Hellwig void blk_mq_kick_requeue_list(struct request_queue *q) 11116fca6a61SChristoph Hellwig { 1112ae943d20SBart Van Assche kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); 11136fca6a61SChristoph Hellwig } 11146fca6a61SChristoph Hellwig EXPORT_SYMBOL(blk_mq_kick_requeue_list); 11156fca6a61SChristoph Hellwig 11162849450aSMike Snitzer void blk_mq_delay_kick_requeue_list(struct request_queue *q, 11172849450aSMike Snitzer unsigned long msecs) 11182849450aSMike Snitzer { 1119d4acf365SBart Van Assche kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 11202849450aSMike Snitzer msecs_to_jiffies(msecs)); 11212849450aSMike Snitzer } 11222849450aSMike Snitzer EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); 11232849450aSMike Snitzer 11240e62f51fSJens Axboe struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 11250e62f51fSJens Axboe { 112688c7b2b7SJens Axboe if (tag < tags->nr_tags) { 112788c7b2b7SJens Axboe prefetch(tags->rqs[tag]); 11280048b483SMing Lei return tags->rqs[tag]; 112988c7b2b7SJens Axboe } 11304ee86babSHannes Reinecke 11314ee86babSHannes Reinecke return NULL; 113224d2f903SChristoph Hellwig } 113324d2f903SChristoph Hellwig EXPORT_SYMBOL(blk_mq_tag_to_rq); 113424d2f903SChristoph Hellwig 11353c94d83cSJens Axboe static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct 
request *rq, 1136ae879912SJens Axboe void *priv, bool reserved) 1137ae879912SJens Axboe { 1138ae879912SJens Axboe /* 113905a4fed6SMing Lei * If we find a request that isn't idle and the queue matches, 11403c94d83cSJens Axboe * we know the queue is busy. Return false to stop the iteration. 1141ae879912SJens Axboe */ 114205a4fed6SMing Lei if (blk_mq_request_started(rq) && rq->q == hctx->queue) { 1143ae879912SJens Axboe bool *busy = priv; 1144ae879912SJens Axboe 1145ae879912SJens Axboe *busy = true; 1146ae879912SJens Axboe return false; 1147ae879912SJens Axboe } 1148ae879912SJens Axboe 1149ae879912SJens Axboe return true; 1150ae879912SJens Axboe } 1151ae879912SJens Axboe 11523c94d83cSJens Axboe bool blk_mq_queue_inflight(struct request_queue *q) 1153ae879912SJens Axboe { 1154ae879912SJens Axboe bool busy = false; 1155ae879912SJens Axboe 11563c94d83cSJens Axboe blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); 1157ae879912SJens Axboe return busy; 1158ae879912SJens Axboe } 11593c94d83cSJens Axboe EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); 1160ae879912SJens Axboe 1161358f70daSTejun Heo static void blk_mq_rq_timed_out(struct request *req, bool reserved) 1162320ae51fSJens Axboe { 1163da661267SChristoph Hellwig req->rq_flags |= RQF_TIMED_OUT; 1164d1210d5aSChristoph Hellwig if (req->q->mq_ops->timeout) { 1165d1210d5aSChristoph Hellwig enum blk_eh_timer_return ret; 116687ee7b11SJens Axboe 1167d1210d5aSChristoph Hellwig ret = req->q->mq_ops->timeout(req, reserved); 1168d1210d5aSChristoph Hellwig if (ret == BLK_EH_DONE) 1169d1210d5aSChristoph Hellwig return; 1170d1210d5aSChristoph Hellwig WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); 117187ee7b11SJens Axboe } 1172d1210d5aSChristoph Hellwig 1173d1210d5aSChristoph Hellwig blk_add_timer(req); 117487ee7b11SJens Axboe } 117587ee7b11SJens Axboe 117612f5b931SKeith Busch static bool blk_mq_req_expired(struct request *rq, unsigned long *next) 117712f5b931SKeith Busch { 117812f5b931SKeith Busch unsigned long deadline; 117912f5b931SKeith Busch 118012f5b931SKeith Busch if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) 118112f5b931SKeith Busch return false; 1182da661267SChristoph Hellwig if (rq->rq_flags & RQF_TIMED_OUT) 1183da661267SChristoph Hellwig return false; 118412f5b931SKeith Busch 1185079076b3SChristoph Hellwig deadline = READ_ONCE(rq->deadline); 118612f5b931SKeith Busch if (time_after_eq(jiffies, deadline)) 118712f5b931SKeith Busch return true; 118812f5b931SKeith Busch 118912f5b931SKeith Busch if (*next == 0) 119012f5b931SKeith Busch *next = deadline; 119112f5b931SKeith Busch else if (time_after(*next, deadline)) 119212f5b931SKeith Busch *next = deadline; 119312f5b931SKeith Busch return false; 119412f5b931SKeith Busch } 119512f5b931SKeith Busch 11962e315dc0SMing Lei void blk_mq_put_rq_ref(struct request *rq) 11972e315dc0SMing Lei { 1198a9ed27a7SMing Lei if (is_flush_rq(rq)) 11992e315dc0SMing Lei rq->end_io(rq, 0); 12002e315dc0SMing Lei else if (refcount_dec_and_test(&rq->ref)) 12012e315dc0SMing Lei __blk_mq_free_request(rq); 12022e315dc0SMing Lei } 12032e315dc0SMing Lei 12047baa8572SJens Axboe static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 120581481eb4SChristoph Hellwig struct request *rq, void *priv, bool reserved) 1206320ae51fSJens Axboe { 120712f5b931SKeith Busch unsigned long *next = priv; 120881481eb4SChristoph Hellwig 120912f5b931SKeith Busch /* 1210c797b40cSMing Lei * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot 1211c797b40cSMing Lei * be reallocated underneath the timeout handler's processing, then 1212c797b40cSMing 
Lei * the expire check is reliable. If the request is not expired, then 1213c797b40cSMing Lei * it was completed and reallocated as a new request after returning 1214c797b40cSMing Lei * from blk_mq_check_expired(). 121512f5b931SKeith Busch */ 121612f5b931SKeith Busch if (blk_mq_req_expired(rq, next)) 12171d9bd516STejun Heo blk_mq_rq_timed_out(rq, reserved); 12187baa8572SJens Axboe return true; 12191d9bd516STejun Heo } 12201d9bd516STejun Heo 1221287922ebSChristoph Hellwig static void blk_mq_timeout_work(struct work_struct *work) 122281481eb4SChristoph Hellwig { 1223287922ebSChristoph Hellwig struct request_queue *q = 1224287922ebSChristoph Hellwig container_of(work, struct request_queue, timeout_work); 122512f5b931SKeith Busch unsigned long next = 0; 12261d9bd516STejun Heo struct blk_mq_hw_ctx *hctx; 122781481eb4SChristoph Hellwig int i; 1228320ae51fSJens Axboe 122971f79fb3SGabriel Krisman Bertazi /* A deadlock might occur if a request is stuck requiring a 123071f79fb3SGabriel Krisman Bertazi * timeout at the same time a queue freeze is waiting 123171f79fb3SGabriel Krisman Bertazi * completion, since the timeout code would not be able to 123271f79fb3SGabriel Krisman Bertazi * acquire the queue reference here. 123371f79fb3SGabriel Krisman Bertazi * 123471f79fb3SGabriel Krisman Bertazi * That's why we don't use blk_queue_enter here; instead, we use 123571f79fb3SGabriel Krisman Bertazi * percpu_ref_tryget directly, because we need to be able to 123671f79fb3SGabriel Krisman Bertazi * obtain a reference even in the short window between the queue 123771f79fb3SGabriel Krisman Bertazi * starting to freeze, by dropping the first reference in 12381671d522SMing Lei * blk_freeze_queue_start, and the moment the last request is 123971f79fb3SGabriel Krisman Bertazi * consumed, marked by the instant q_usage_counter reaches 124071f79fb3SGabriel Krisman Bertazi * zero. 124171f79fb3SGabriel Krisman Bertazi */ 124271f79fb3SGabriel Krisman Bertazi if (!percpu_ref_tryget(&q->q_usage_counter)) 1243287922ebSChristoph Hellwig return; 1244287922ebSChristoph Hellwig 124512f5b931SKeith Busch blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); 1246320ae51fSJens Axboe 124712f5b931SKeith Busch if (next != 0) { 124812f5b931SKeith Busch mod_timer(&q->timeout, next); 12490d2602caSJens Axboe } else { 1250fcd36c36SBart Van Assche /* 1251fcd36c36SBart Van Assche * Request timeouts are handled as a forward rolling timer. If 1252fcd36c36SBart Van Assche * we end up here it means that no requests are pending and 1253fcd36c36SBart Van Assche * also that no request has been pending for a while. Mark 1254fcd36c36SBart Van Assche * each hctx as idle. 
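 *
 * (Illustrative sketch, not part of the original comment: the timer is
 * "forward rolling" because blk_mq_check_expired() above only ever
 * records the soonest pending deadline, roughly
 *
 *	if (*next == 0 || time_after(*next, deadline))
 *		*next = deadline;
 *
 * so with in-flight deadlines of jiffies+30, jiffies+10 and jiffies+20
 * the iteration leaves next == jiffies+10 and the if-branch just above
 * re-arms the timer via mod_timer(&q->timeout, next). Only when no
 * request contributes a deadline does next stay 0 and we land here to
 * mark every hctx idle instead.)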
1255fcd36c36SBart Van Assche */ 1256f054b56cSMing Lei queue_for_each_hw_ctx(q, hctx, i) { 1257f054b56cSMing Lei /* the hctx may be unmapped, so check it here */ 1258f054b56cSMing Lei if (blk_mq_hw_queue_mapped(hctx)) 12590d2602caSJens Axboe blk_mq_tag_idle(hctx); 12600d2602caSJens Axboe } 1261320ae51fSJens Axboe } 1262287922ebSChristoph Hellwig blk_queue_exit(q); 1263f054b56cSMing Lei } 1264320ae51fSJens Axboe 126588459642SOmar Sandoval struct flush_busy_ctx_data { 126688459642SOmar Sandoval struct blk_mq_hw_ctx *hctx; 126788459642SOmar Sandoval struct list_head *list; 126888459642SOmar Sandoval }; 126988459642SOmar Sandoval 127088459642SOmar Sandoval static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) 127188459642SOmar Sandoval { 127288459642SOmar Sandoval struct flush_busy_ctx_data *flush_data = data; 127388459642SOmar Sandoval struct blk_mq_hw_ctx *hctx = flush_data->hctx; 127488459642SOmar Sandoval struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 1275c16d6b5aSMing Lei enum hctx_type type = hctx->type; 127688459642SOmar Sandoval 127788459642SOmar Sandoval spin_lock(&ctx->lock); 1278c16d6b5aSMing Lei list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); 1279e9a99a63SOmar Sandoval sbitmap_clear_bit(sb, bitnr); 128088459642SOmar Sandoval spin_unlock(&ctx->lock); 128188459642SOmar Sandoval return true; 128288459642SOmar Sandoval } 128388459642SOmar Sandoval 1284320ae51fSJens Axboe /* 12851429d7c9SJens Axboe * Process software queues that have been marked busy, splicing them 12861429d7c9SJens Axboe * to the for-dispatch 12871429d7c9SJens Axboe */ 12882c3ad667SJens Axboe void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 12891429d7c9SJens Axboe { 129088459642SOmar Sandoval struct flush_busy_ctx_data data = { 129188459642SOmar Sandoval .hctx = hctx, 129288459642SOmar Sandoval .list = list, 129388459642SOmar Sandoval }; 12941429d7c9SJens Axboe 129588459642SOmar Sandoval sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); 12961429d7c9SJens Axboe } 12972c3ad667SJens Axboe EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); 12981429d7c9SJens Axboe 1299b347689fSMing Lei struct dispatch_rq_data { 1300b347689fSMing Lei struct blk_mq_hw_ctx *hctx; 1301b347689fSMing Lei struct request *rq; 1302b347689fSMing Lei }; 1303b347689fSMing Lei 1304b347689fSMing Lei static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, 1305b347689fSMing Lei void *data) 1306b347689fSMing Lei { 1307b347689fSMing Lei struct dispatch_rq_data *dispatch_data = data; 1308b347689fSMing Lei struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; 1309b347689fSMing Lei struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 1310c16d6b5aSMing Lei enum hctx_type type = hctx->type; 1311b347689fSMing Lei 1312b347689fSMing Lei spin_lock(&ctx->lock); 1313c16d6b5aSMing Lei if (!list_empty(&ctx->rq_lists[type])) { 1314c16d6b5aSMing Lei dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); 1315b347689fSMing Lei list_del_init(&dispatch_data->rq->queuelist); 1316c16d6b5aSMing Lei if (list_empty(&ctx->rq_lists[type])) 1317b347689fSMing Lei sbitmap_clear_bit(sb, bitnr); 1318b347689fSMing Lei } 1319b347689fSMing Lei spin_unlock(&ctx->lock); 1320b347689fSMing Lei 1321b347689fSMing Lei return !dispatch_data->rq; 1322b347689fSMing Lei } 1323b347689fSMing Lei 1324b347689fSMing Lei struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, 1325b347689fSMing Lei struct blk_mq_ctx *start) 1326b347689fSMing Lei { 1327f31967f0SJens Axboe unsigned off = start ? 
start->index_hw[hctx->type] : 0; 1328b347689fSMing Lei struct dispatch_rq_data data = { 1329b347689fSMing Lei .hctx = hctx, 1330b347689fSMing Lei .rq = NULL, 1331b347689fSMing Lei }; 1332b347689fSMing Lei 1333b347689fSMing Lei __sbitmap_for_each_set(&hctx->ctx_map, off, 1334b347689fSMing Lei dispatch_rq_from_ctx, &data); 1335b347689fSMing Lei 1336b347689fSMing Lei return data.rq; 1337b347689fSMing Lei } 1338b347689fSMing Lei 1339570e9b73SMing Lei static bool __blk_mq_get_driver_tag(struct request *rq) 1340570e9b73SMing Lei { 1341ae0f1a73SJohn Garry struct sbitmap_queue *bt = &rq->mq_hctx->tags->bitmap_tags; 1342570e9b73SMing Lei unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; 1343570e9b73SMing Lei int tag; 1344570e9b73SMing Lei 1345568f2700SMing Lei blk_mq_tag_busy(rq->mq_hctx); 1346568f2700SMing Lei 1347570e9b73SMing Lei if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { 1348ae0f1a73SJohn Garry bt = &rq->mq_hctx->tags->breserved_tags; 1349570e9b73SMing Lei tag_offset = 0; 135028500850SMing Lei } else { 1351570e9b73SMing Lei if (!hctx_may_queue(rq->mq_hctx, bt)) 1352570e9b73SMing Lei return false; 135328500850SMing Lei } 135428500850SMing Lei 1355570e9b73SMing Lei tag = __sbitmap_queue_get(bt); 1356570e9b73SMing Lei if (tag == BLK_MQ_NO_TAG) 1357570e9b73SMing Lei return false; 1358570e9b73SMing Lei 1359570e9b73SMing Lei rq->tag = tag + tag_offset; 1360570e9b73SMing Lei return true; 1361570e9b73SMing Lei } 1362570e9b73SMing Lei 136361347154SJan Kara bool blk_mq_get_driver_tag(struct request *rq) 1364570e9b73SMing Lei { 1365568f2700SMing Lei struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 1366568f2700SMing Lei 1367568f2700SMing Lei if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) 1368568f2700SMing Lei return false; 1369568f2700SMing Lei 137051db1c37SMing Lei if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && 1371568f2700SMing Lei !(rq->rq_flags & RQF_MQ_INFLIGHT)) { 1372568f2700SMing Lei rq->rq_flags |= RQF_MQ_INFLIGHT; 1373bccf5e26SJohn Garry __blk_mq_inc_active_requests(hctx); 1374568f2700SMing Lei } 1375568f2700SMing Lei hctx->tags->rqs[rq->tag] = rq; 1376570e9b73SMing Lei return true; 1377570e9b73SMing Lei } 1378570e9b73SMing Lei 1379eb619fdbSJens Axboe static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, 1380eb619fdbSJens Axboe int flags, void *key) 1381da55f2ccSOmar Sandoval { 1382da55f2ccSOmar Sandoval struct blk_mq_hw_ctx *hctx; 1383da55f2ccSOmar Sandoval 1384da55f2ccSOmar Sandoval hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); 1385da55f2ccSOmar Sandoval 13865815839bSMing Lei spin_lock(&hctx->dispatch_wait_lock); 1387e8618575SJens Axboe if (!list_empty(&wait->entry)) { 1388e8618575SJens Axboe struct sbitmap_queue *sbq; 1389e8618575SJens Axboe 1390eb619fdbSJens Axboe list_del_init(&wait->entry); 1391ae0f1a73SJohn Garry sbq = &hctx->tags->bitmap_tags; 1392e8618575SJens Axboe atomic_dec(&sbq->ws_active); 1393e8618575SJens Axboe } 13945815839bSMing Lei spin_unlock(&hctx->dispatch_wait_lock); 13955815839bSMing Lei 1396da55f2ccSOmar Sandoval blk_mq_run_hw_queue(hctx, true); 1397da55f2ccSOmar Sandoval return 1; 1398da55f2ccSOmar Sandoval } 1399da55f2ccSOmar Sandoval 1400f906a6a0SJens Axboe /* 1401f906a6a0SJens Axboe * Mark us waiting for a tag. For shared tags, this involves hooking us into 1402ee3e4de5SBart Van Assche * the tag wakeups. For non-shared tags, we can simply mark us needing a 1403ee3e4de5SBart Van Assche * restart. 
For both cases, take care to check the condition again after 1404f906a6a0SJens Axboe * marking us as waiting. 1405f906a6a0SJens Axboe */ 14062278d69fSMing Lei static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, 1407eb619fdbSJens Axboe struct request *rq) 1408da55f2ccSOmar Sandoval { 1409ae0f1a73SJohn Garry struct sbitmap_queue *sbq = &hctx->tags->bitmap_tags; 14105815839bSMing Lei struct wait_queue_head *wq; 1411f906a6a0SJens Axboe wait_queue_entry_t *wait; 1412f906a6a0SJens Axboe bool ret; 1413da55f2ccSOmar Sandoval 141451db1c37SMing Lei if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { 1415684b7324SYufen Yu blk_mq_sched_mark_restart_hctx(hctx); 1416c27d53fbSBart Van Assche 1417c27d53fbSBart Van Assche /* 1418c27d53fbSBart Van Assche * It's possible that a tag was freed in the window between the 1419c27d53fbSBart Van Assche * allocation failure and adding the hardware queue to the wait 1420c27d53fbSBart Van Assche * queue. 1421c27d53fbSBart Van Assche * 1422c27d53fbSBart Van Assche * Don't clear RESTART here, someone else could have set it. 1423c27d53fbSBart Van Assche * At most this will cost an extra queue run. 1424c27d53fbSBart Van Assche */ 14258ab6bb9eSMing Lei return blk_mq_get_driver_tag(rq); 1426c27d53fbSBart Van Assche } 1427c27d53fbSBart Van Assche 14282278d69fSMing Lei wait = &hctx->dispatch_wait; 1429eb619fdbSJens Axboe if (!list_empty_careful(&wait->entry)) 1430da55f2ccSOmar Sandoval return false; 1431da55f2ccSOmar Sandoval 1432e8618575SJens Axboe wq = &bt_wait_ptr(sbq, hctx)->wait; 14335815839bSMing Lei 14345815839bSMing Lei spin_lock_irq(&wq->lock); 14355815839bSMing Lei spin_lock(&hctx->dispatch_wait_lock); 1436eb619fdbSJens Axboe if (!list_empty(&wait->entry)) { 14375815839bSMing Lei spin_unlock(&hctx->dispatch_wait_lock); 14385815839bSMing Lei spin_unlock_irq(&wq->lock); 1439eb619fdbSJens Axboe return false; 1440eb619fdbSJens Axboe } 1441eb619fdbSJens Axboe 1442e8618575SJens Axboe atomic_inc(&sbq->ws_active); 14435815839bSMing Lei wait->flags &= ~WQ_FLAG_EXCLUSIVE; 14445815839bSMing Lei __add_wait_queue(wq, wait); 1445da55f2ccSOmar Sandoval 1446da55f2ccSOmar Sandoval /* 1447eb619fdbSJens Axboe * It's possible that a tag was freed in the window between the 1448eb619fdbSJens Axboe * allocation failure and adding the hardware queue to the wait 1449eb619fdbSJens Axboe * queue. 1450da55f2ccSOmar Sandoval */ 14518ab6bb9eSMing Lei ret = blk_mq_get_driver_tag(rq); 1452f906a6a0SJens Axboe if (!ret) { 14535815839bSMing Lei spin_unlock(&hctx->dispatch_wait_lock); 14545815839bSMing Lei spin_unlock_irq(&wq->lock); 1455eb619fdbSJens Axboe return false; 1456eb619fdbSJens Axboe } 1457eb619fdbSJens Axboe 1458eb619fdbSJens Axboe /* 1459eb619fdbSJens Axboe * We got a tag, remove ourselves from the wait queue to ensure 1460eb619fdbSJens Axboe * someone else gets the wakeup. 
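 *
 * (Sketch, added for illustration: this function follows the usual
 * "queue first, then re-check" idiom to close the race with a tag
 * being freed concurrently:
 *
 *	__add_wait_queue(wq, wait);		// 1) become visible to wakers
 *	if (blk_mq_get_driver_tag(rq))		// 2) retry under the locks
 *		list_del_init(&wait->entry);	// 3) got one, give up the slot
 *
 * if step 2 still fails, any tag released after step 1 is guaranteed
 * to find us on the waitqueue and rerun the hardware queue through
 * blk_mq_dispatch_wake() above.)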
1461eb619fdbSJens Axboe */ 1462eb619fdbSJens Axboe list_del_init(&wait->entry); 1463e8618575SJens Axboe atomic_dec(&sbq->ws_active); 14645815839bSMing Lei spin_unlock(&hctx->dispatch_wait_lock); 14655815839bSMing Lei spin_unlock_irq(&wq->lock); 1466c27d53fbSBart Van Assche 1467da55f2ccSOmar Sandoval return true; 1468da55f2ccSOmar Sandoval } 1469da55f2ccSOmar Sandoval 14706e768717SMing Lei #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 14716e768717SMing Lei #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 14726e768717SMing Lei /* 14736e768717SMing Lei * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): 14746e768717SMing Lei * - EWMA is one simple way to compute running average value 14756e768717SMing Lei * - weight(7/8 and 1/8) is applied so that it can decrease exponentially 14766e768717SMing Lei * - take 4 as factor for avoiding to get too small(0) result, and this 14776e768717SMing Lei * factor doesn't matter because EWMA decreases exponentially 14786e768717SMing Lei */ 14796e768717SMing Lei static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) 14806e768717SMing Lei { 14816e768717SMing Lei unsigned int ewma; 14826e768717SMing Lei 14836e768717SMing Lei ewma = hctx->dispatch_busy; 14846e768717SMing Lei 14856e768717SMing Lei if (!ewma && !busy) 14866e768717SMing Lei return; 14876e768717SMing Lei 14886e768717SMing Lei ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; 14896e768717SMing Lei if (busy) 14906e768717SMing Lei ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; 14916e768717SMing Lei ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; 14926e768717SMing Lei 14936e768717SMing Lei hctx->dispatch_busy = ewma; 14946e768717SMing Lei } 14956e768717SMing Lei 149686ff7c2aSMing Lei #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ 149786ff7c2aSMing Lei 1498c92a4103SJohannes Thumshirn static void blk_mq_handle_dev_resource(struct request *rq, 1499c92a4103SJohannes Thumshirn struct list_head *list) 1500c92a4103SJohannes Thumshirn { 1501c92a4103SJohannes Thumshirn struct request *next = 1502c92a4103SJohannes Thumshirn list_first_entry_or_null(list, struct request, queuelist); 1503c92a4103SJohannes Thumshirn 1504c92a4103SJohannes Thumshirn /* 1505c92a4103SJohannes Thumshirn * If an I/O scheduler has been configured and we got a driver tag for 1506c92a4103SJohannes Thumshirn * the next request already, free it. 1507c92a4103SJohannes Thumshirn */ 1508c92a4103SJohannes Thumshirn if (next) 1509c92a4103SJohannes Thumshirn blk_mq_put_driver_tag(next); 1510c92a4103SJohannes Thumshirn 1511c92a4103SJohannes Thumshirn list_add(&rq->queuelist, list); 1512c92a4103SJohannes Thumshirn __blk_mq_requeue_request(rq); 1513c92a4103SJohannes Thumshirn } 1514c92a4103SJohannes Thumshirn 15150512a75bSKeith Busch static void blk_mq_handle_zone_resource(struct request *rq, 15160512a75bSKeith Busch struct list_head *zone_list) 15170512a75bSKeith Busch { 15180512a75bSKeith Busch /* 15190512a75bSKeith Busch * If we end up here it is because we cannot dispatch a request to a 15200512a75bSKeith Busch * specific zone due to LLD level zone-write locking or other zone 15210512a75bSKeith Busch * related resource not being available. In this case, set the request 15220512a75bSKeith Busch * aside in zone_list for retrying it later. 
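 *
 * (Worked example for the EWMA update in blk_mq_update_dispatch_busy()
 * above, added for illustration only: with WEIGHT == 8 and FACTOR == 4
 * the update boils down to
 *
 *	ewma = (ewma * 7 + (busy ? 16 : 0)) / 8;
 *
 * so starting from 0, three busy dispatches give 2, 3, then 4, and a
 * following run of non-busy dispatches decays that as 3, 2, 1, 0:
 * dispatch_busy climbs quickly under pressure and drains off
 * exponentially once ->queue_rq() stops reporting busy.)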
15230512a75bSKeith Busch */ 15240512a75bSKeith Busch list_add(&rq->queuelist, zone_list); 15250512a75bSKeith Busch __blk_mq_requeue_request(rq); 15260512a75bSKeith Busch } 15270512a75bSKeith Busch 152875383524SMing Lei enum prep_dispatch { 152975383524SMing Lei PREP_DISPATCH_OK, 153075383524SMing Lei PREP_DISPATCH_NO_TAG, 153175383524SMing Lei PREP_DISPATCH_NO_BUDGET, 153275383524SMing Lei }; 153375383524SMing Lei 153475383524SMing Lei static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, 153575383524SMing Lei bool need_budget) 1536f04c3df3SJens Axboe { 153775383524SMing Lei struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 15382a5a24aaSMing Lei int budget_token = -1; 1539f04c3df3SJens Axboe 15402a5a24aaSMing Lei if (need_budget) { 15412a5a24aaSMing Lei budget_token = blk_mq_get_dispatch_budget(rq->q); 15422a5a24aaSMing Lei if (budget_token < 0) { 15435fe56de7SJohn Garry blk_mq_put_driver_tag(rq); 154475383524SMing Lei return PREP_DISPATCH_NO_BUDGET; 15455fe56de7SJohn Garry } 15462a5a24aaSMing Lei blk_mq_set_rq_budget_token(rq, budget_token); 15472a5a24aaSMing Lei } 15480bca799bSMing Lei 15498ab6bb9eSMing Lei if (!blk_mq_get_driver_tag(rq)) { 15503c782d67SJens Axboe /* 1551da55f2ccSOmar Sandoval * The initial allocation attempt failed, so we need to 1552eb619fdbSJens Axboe * rerun the hardware queue when a tag is freed. The 1553eb619fdbSJens Axboe * waitqueue takes care of that. If the queue is run 1554eb619fdbSJens Axboe * before we add this entry back on the dispatch list, 1555eb619fdbSJens Axboe * we'll re-run it below. 15563c782d67SJens Axboe */ 15572278d69fSMing Lei if (!blk_mq_mark_tag_wait(hctx, rq)) { 1558f906a6a0SJens Axboe /* 15591fd40b5eSMing Lei * All budgets not got from this function will be put 15601fd40b5eSMing Lei * together during handling partial dispatch 1561f906a6a0SJens Axboe */ 15621fd40b5eSMing Lei if (need_budget) 15632a5a24aaSMing Lei blk_mq_put_dispatch_budget(rq->q, budget_token); 156475383524SMing Lei return PREP_DISPATCH_NO_TAG; 156575383524SMing Lei } 156675383524SMing Lei } 156775383524SMing Lei 156875383524SMing Lei return PREP_DISPATCH_OK; 156975383524SMing Lei } 157075383524SMing Lei 15711fd40b5eSMing Lei /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ 15721fd40b5eSMing Lei static void blk_mq_release_budgets(struct request_queue *q, 15732a5a24aaSMing Lei struct list_head *list) 15741fd40b5eSMing Lei { 15752a5a24aaSMing Lei struct request *rq; 15761fd40b5eSMing Lei 15772a5a24aaSMing Lei list_for_each_entry(rq, list, queuelist) { 15782a5a24aaSMing Lei int budget_token = blk_mq_get_rq_budget_token(rq); 15792a5a24aaSMing Lei 15802a5a24aaSMing Lei if (budget_token >= 0) 15812a5a24aaSMing Lei blk_mq_put_dispatch_budget(q, budget_token); 15822a5a24aaSMing Lei } 15831fd40b5eSMing Lei } 15841fd40b5eSMing Lei 15851429d7c9SJens Axboe /* 15861429d7c9SJens Axboe * Returns true if we did some work AND can potentially do more. 
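 *
 * (Illustrative note, not part of the original comment: each request
 * first goes through blk_mq_prep_dispatch_rq() above, which acquires
 * its resources in a fixed order and unwinds on failure, roughly
 *
 *	budget_token = blk_mq_get_dispatch_budget(q);	// NO_BUDGET if < 0
 *	if (!blk_mq_get_driver_tag(rq) &&
 *	    !blk_mq_mark_tag_wait(hctx, rq)) {
 *		blk_mq_put_dispatch_budget(q, budget_token);
 *		return PREP_DISPATCH_NO_TAG;
 *	}
 *
 * only requests that obtained both budget and tag reach ->queue_rq().)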
15871429d7c9SJens Axboe */ 1588445874e8SMing Lei bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, 15891fd40b5eSMing Lei unsigned int nr_budgets) 15901429d7c9SJens Axboe { 159175383524SMing Lei enum prep_dispatch prep; 1592445874e8SMing Lei struct request_queue *q = hctx->queue; 1593703fd1c0SJens Axboe struct request *rq, *nxt; 1594703fd1c0SJens Axboe int errors, queued; 1595703fd1c0SJens Axboe blk_status_t ret = BLK_STS_OK; 1596703fd1c0SJens Axboe LIST_HEAD(zone_list); 15971429d7c9SJens Axboe 15981429d7c9SJens Axboe if (list_empty(list)) 1599f04c3df3SJens Axboe return false; 1600f04c3df3SJens Axboe 1601f04c3df3SJens Axboe /* 1602f04c3df3SJens Axboe * Now process all the entries, sending them to the driver. 1603f04c3df3SJens Axboe */ 1604f04c3df3SJens Axboe errors = queued = 0; 1605f04c3df3SJens Axboe do { 1606f04c3df3SJens Axboe struct blk_mq_queue_data bd; 1607f04c3df3SJens Axboe 1608f04c3df3SJens Axboe rq = list_first_entry(list, struct request, queuelist); 1609f04c3df3SJens Axboe 1610445874e8SMing Lei WARN_ON_ONCE(hctx != rq->mq_hctx); 16111fd40b5eSMing Lei prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); 161275383524SMing Lei if (prep != PREP_DISPATCH_OK) 1613bd166ef1SJens Axboe break; 1614de148297SMing Lei 1615f04c3df3SJens Axboe list_del_init(&rq->queuelist); 1616f04c3df3SJens Axboe 1617f04c3df3SJens Axboe bd.rq = rq; 1618113285b4SJens Axboe 1619113285b4SJens Axboe /* 1620113285b4SJens Axboe * Flag last if we have no more requests, or if we have more 1621113285b4SJens Axboe * but can't assign a driver tag to it. 1622113285b4SJens Axboe */ 1623113285b4SJens Axboe if (list_empty(list)) 1624113285b4SJens Axboe bd.last = true; 1625113285b4SJens Axboe else { 1626113285b4SJens Axboe nxt = list_first_entry(list, struct request, queuelist); 16278ab6bb9eSMing Lei bd.last = !blk_mq_get_driver_tag(nxt); 1628113285b4SJens Axboe } 1629f04c3df3SJens Axboe 16301fd40b5eSMing Lei /* 16311fd40b5eSMing Lei * once the request is queued to lld, no need to cover the 16321fd40b5eSMing Lei * budget any more 16331fd40b5eSMing Lei */ 16341fd40b5eSMing Lei if (nr_budgets) 16351fd40b5eSMing Lei nr_budgets--; 1636f04c3df3SJens Axboe ret = q->mq_ops->queue_rq(hctx, &bd); 16377bf13729SMing Lei switch (ret) { 16387bf13729SMing Lei case BLK_STS_OK: 16397bf13729SMing Lei queued++; 1640f04c3df3SJens Axboe break; 16417bf13729SMing Lei case BLK_STS_RESOURCE: 16427bf13729SMing Lei case BLK_STS_DEV_RESOURCE: 16437bf13729SMing Lei blk_mq_handle_dev_resource(rq, list); 16447bf13729SMing Lei goto out; 16457bf13729SMing Lei case BLK_STS_ZONE_RESOURCE: 16460512a75bSKeith Busch /* 16470512a75bSKeith Busch * Move the request to zone_list and keep going through 16480512a75bSKeith Busch * the dispatch list to find more requests the drive can 16490512a75bSKeith Busch * accept. 16500512a75bSKeith Busch */ 16510512a75bSKeith Busch blk_mq_handle_zone_resource(rq, &zone_list); 16520512a75bSKeith Busch break; 16537bf13729SMing Lei default: 1654fc17b653SChristoph Hellwig errors++; 1655e21ee5a6SHannes Reinecke blk_mq_end_request(rq, ret); 1656fc17b653SChristoph Hellwig } 165781380ca1SOmar Sandoval } while (!list_empty(list)); 16587bf13729SMing Lei out: 16590512a75bSKeith Busch if (!list_empty(&zone_list)) 16600512a75bSKeith Busch list_splice_tail_init(&zone_list, list); 16610512a75bSKeith Busch 1662632bfb63Syangerkun /* If we didn't flush the entire list, we could have told the driver 1663632bfb63Syangerkun * there was more coming, but that turned out to be a lie. 
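 *
 * (Driver-side sketch of the contract being repaired here, added for
 * illustration; the callbacks are the real blk_mq_ops hooks but the
 * doorbell-batching driver is hypothetical:
 *
 *	->queue_rq(hctx, bd):  put bd->rq in the ring, and only ring the
 *	                       doorbell when bd->last is set;
 *	->commit_rqs(hctx):    ring the doorbell now.
 *
 * a request submitted with bd.last == false could otherwise sit in the
 * ring indefinitely after we stop early, which is why ->commit_rqs()
 * is called below whenever the list was not fully flushed.)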
1664632bfb63Syangerkun */ 1665632bfb63Syangerkun if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued) 1666632bfb63Syangerkun q->mq_ops->commit_rqs(hctx); 1667f04c3df3SJens Axboe /* 1668f04c3df3SJens Axboe * Any items that need requeuing? Stuff them into hctx->dispatch, 1669f04c3df3SJens Axboe * that is where we will continue on next queue run. 1670f04c3df3SJens Axboe */ 1671f04c3df3SJens Axboe if (!list_empty(list)) { 167286ff7c2aSMing Lei bool needs_restart; 167375383524SMing Lei /* For non-shared tags, the RESTART check will suffice */ 167475383524SMing Lei bool no_tag = prep == PREP_DISPATCH_NO_TAG && 167551db1c37SMing Lei (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); 167675383524SMing Lei bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; 167786ff7c2aSMing Lei 16782a5a24aaSMing Lei if (nr_budgets) 16792a5a24aaSMing Lei blk_mq_release_budgets(q, list); 1680f04c3df3SJens Axboe 1681f04c3df3SJens Axboe spin_lock(&hctx->lock); 168201e99aecSMing Lei list_splice_tail_init(list, &hctx->dispatch); 1683f04c3df3SJens Axboe spin_unlock(&hctx->lock); 1684f04c3df3SJens Axboe 1685f04c3df3SJens Axboe /* 1686d7d8535fSMing Lei * Order adding requests to hctx->dispatch and checking 1687d7d8535fSMing Lei * SCHED_RESTART flag. The pair of this smp_mb() is the one 1688d7d8535fSMing Lei * in blk_mq_sched_restart(). Avoid restart code path to 1689d7d8535fSMing Lei * miss the new added requests to hctx->dispatch, meantime 1690d7d8535fSMing Lei * SCHED_RESTART is observed here. 1691d7d8535fSMing Lei */ 1692d7d8535fSMing Lei smp_mb(); 1693d7d8535fSMing Lei 1694d7d8535fSMing Lei /* 1695710c785fSBart Van Assche * If SCHED_RESTART was set by the caller of this function and 1696710c785fSBart Van Assche * it is no longer set that means that it was cleared by another 1697710c785fSBart Van Assche * thread and hence that a queue rerun is needed. 1698f04c3df3SJens Axboe * 1699eb619fdbSJens Axboe * If 'no_tag' is set, that means that we failed getting 1700eb619fdbSJens Axboe * a driver tag with an I/O scheduler attached. If our dispatch 1701eb619fdbSJens Axboe * waitqueue is no longer active, ensure that we run the queue 1702eb619fdbSJens Axboe * AFTER adding our entries back to the list. 1703bd166ef1SJens Axboe * 1704710c785fSBart Van Assche * If no I/O scheduler has been configured it is possible that 1705710c785fSBart Van Assche * the hardware queue got stopped and restarted before requests 1706710c785fSBart Van Assche * were pushed back onto the dispatch list. Rerun the queue to 1707710c785fSBart Van Assche * avoid starvation. Notes: 1708710c785fSBart Van Assche * - blk_mq_run_hw_queue() checks whether or not a queue has 1709710c785fSBart Van Assche * been stopped before rerunning a queue. 1710710c785fSBart Van Assche * - Some but not all block drivers stop a queue before 1711fc17b653SChristoph Hellwig * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq 1712710c785fSBart Van Assche * and dm-rq. 171386ff7c2aSMing Lei * 171486ff7c2aSMing Lei * If driver returns BLK_STS_RESOURCE and SCHED_RESTART 171586ff7c2aSMing Lei * bit is set, run queue after a delay to avoid IO stalls 1716ab3cee37SDouglas Anderson * that could otherwise occur if the queue is idle. We'll do 1717ab3cee37SDouglas Anderson * similar if we couldn't get budget and SCHED_RESTART is set. 
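 *
 * (Illustration of the barrier pairing mentioned above, not part of
 * the original comment. Without the smp_mb(), this interleaving could
 * strand the spliced requests:
 *
 *	dispatch path (here)		restart path (completion side)
 *	splice list onto dispatch	clear SCHED_RESTART
 *	see SCHED_RESTART still set	find hctx->dispatch still empty
 *	skip the rerun			skip the rerun
 *
 * the paired barrier in blk_mq_sched_restart() guarantees that at
 * least one side observes the other's update and reruns the queue.)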
1718bd166ef1SJens Axboe */ 171986ff7c2aSMing Lei needs_restart = blk_mq_sched_needs_restart(hctx); 172086ff7c2aSMing Lei if (!needs_restart || 1721eb619fdbSJens Axboe (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) 1722f04c3df3SJens Axboe blk_mq_run_hw_queue(hctx, true); 1723ab3cee37SDouglas Anderson else if (needs_restart && (ret == BLK_STS_RESOURCE || 1724ab3cee37SDouglas Anderson no_budget_avail)) 172586ff7c2aSMing Lei blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); 17261f57f8d4SJens Axboe 17276e768717SMing Lei blk_mq_update_dispatch_busy(hctx, true); 17281f57f8d4SJens Axboe return false; 17296e768717SMing Lei } else 17306e768717SMing Lei blk_mq_update_dispatch_busy(hctx, false); 1731f04c3df3SJens Axboe 173293efe981SJens Axboe return (queued + errors) != 0; 1733f04c3df3SJens Axboe } 1734f04c3df3SJens Axboe 1735105663f7SAndré Almeida /** 1736105663f7SAndré Almeida * __blk_mq_run_hw_queue - Run a hardware queue. 1737105663f7SAndré Almeida * @hctx: Pointer to the hardware queue to run. 1738105663f7SAndré Almeida * 1739105663f7SAndré Almeida * Send pending requests to the hardware. 1740105663f7SAndré Almeida */ 17416a83e74dSBart Van Assche static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 17426a83e74dSBart Van Assche { 17436a83e74dSBart Van Assche int srcu_idx; 17446a83e74dSBart Van Assche 1745b7a71e66SJens Axboe /* 1746b7a71e66SJens Axboe * We can't run the queue inline with ints disabled. Ensure that 1747b7a71e66SJens Axboe * we catch bad users of this early. 1748b7a71e66SJens Axboe */ 1749b7a71e66SJens Axboe WARN_ON_ONCE(in_interrupt()); 1750b7a71e66SJens Axboe 175104ced159SJens Axboe might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); 1752bf4907c0SJens Axboe 175304ced159SJens Axboe hctx_lock(hctx, &srcu_idx); 17541f460b63SMing Lei blk_mq_sched_dispatch_requests(hctx); 175504ced159SJens Axboe hctx_unlock(hctx, srcu_idx); 17566a83e74dSBart Van Assche } 17576a83e74dSBart Van Assche 1758f82ddf19SMing Lei static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) 1759f82ddf19SMing Lei { 1760f82ddf19SMing Lei int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); 1761f82ddf19SMing Lei 1762f82ddf19SMing Lei if (cpu >= nr_cpu_ids) 1763f82ddf19SMing Lei cpu = cpumask_first(hctx->cpumask); 1764f82ddf19SMing Lei return cpu; 1765f82ddf19SMing Lei } 1766f82ddf19SMing Lei 1767506e931fSJens Axboe /* 1768506e931fSJens Axboe * It'd be great if the workqueue API had a way to pass 1769506e931fSJens Axboe * in a mask and had some smarts for more clever placement. 1770506e931fSJens Axboe * For now we just round-robin here, switching for every 1771506e931fSJens Axboe * BLK_MQ_CPU_WORK_BATCH queued items. 
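 *
 * (Example, added for illustration: with BLK_MQ_CPU_WORK_BATCH == 8
 * and hctx->cpumask covering CPUs 2, 3 and 6, the run_work item stays
 * on CPU 2 for eight queue runs, then cpumask_next_and() moves it to
 * 3, then 6, and blk_mq_first_mapped_cpu() wraps it back to 2. If the
 * selected CPU has meanwhile gone offline we punt to WORK_CPU_UNBOUND
 * for this run and force a re-selection on the next one.)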
1772506e931fSJens Axboe */ 1773506e931fSJens Axboe static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 1774506e931fSJens Axboe { 17757bed4595SMing Lei bool tried = false; 1776476f8c98SMing Lei int next_cpu = hctx->next_cpu; 17777bed4595SMing Lei 1778b657d7e6SChristoph Hellwig if (hctx->queue->nr_hw_queues == 1) 1779b657d7e6SChristoph Hellwig return WORK_CPU_UNBOUND; 1780506e931fSJens Axboe 1781506e931fSJens Axboe if (--hctx->next_cpu_batch <= 0) { 17827bed4595SMing Lei select_cpu: 1783476f8c98SMing Lei next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, 178420e4d813SChristoph Hellwig cpu_online_mask); 1785506e931fSJens Axboe if (next_cpu >= nr_cpu_ids) 1786f82ddf19SMing Lei next_cpu = blk_mq_first_mapped_cpu(hctx); 1787506e931fSJens Axboe hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1788506e931fSJens Axboe } 1789506e931fSJens Axboe 17907bed4595SMing Lei /* 17917bed4595SMing Lei * Do unbound schedule if we can't find a online CPU for this hctx, 17927bed4595SMing Lei * and it should only happen in the path of handling CPU DEAD. 17937bed4595SMing Lei */ 1794476f8c98SMing Lei if (!cpu_online(next_cpu)) { 17957bed4595SMing Lei if (!tried) { 17967bed4595SMing Lei tried = true; 17977bed4595SMing Lei goto select_cpu; 17987bed4595SMing Lei } 17997bed4595SMing Lei 18007bed4595SMing Lei /* 18017bed4595SMing Lei * Make sure to re-select CPU next time once after CPUs 18027bed4595SMing Lei * in hctx->cpumask become online again. 18037bed4595SMing Lei */ 1804476f8c98SMing Lei hctx->next_cpu = next_cpu; 18057bed4595SMing Lei hctx->next_cpu_batch = 1; 18067bed4595SMing Lei return WORK_CPU_UNBOUND; 18077bed4595SMing Lei } 1808476f8c98SMing Lei 1809476f8c98SMing Lei hctx->next_cpu = next_cpu; 1810476f8c98SMing Lei return next_cpu; 1811b657d7e6SChristoph Hellwig } 1812b657d7e6SChristoph Hellwig 1813105663f7SAndré Almeida /** 1814105663f7SAndré Almeida * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. 1815105663f7SAndré Almeida * @hctx: Pointer to the hardware queue to run. 1816105663f7SAndré Almeida * @async: If we want to run the queue asynchronously. 1817fa94ba8aSMinwoo Im * @msecs: Milliseconds of delay to wait before running the queue. 1818105663f7SAndré Almeida * 1819105663f7SAndré Almeida * If !@async, try to run the queue now. Else, run the queue asynchronously and 1820105663f7SAndré Almeida * with a delay of @msecs. 1821105663f7SAndré Almeida */ 18227587a5aeSBart Van Assche static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, 18237587a5aeSBart Van Assche unsigned long msecs) 1824320ae51fSJens Axboe { 18255435c023SBart Van Assche if (unlikely(blk_mq_hctx_stopped(hctx))) 1826320ae51fSJens Axboe return; 1827320ae51fSJens Axboe 18281b792f2fSJens Axboe if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { 18292a90d4aaSPaolo Bonzini int cpu = get_cpu(); 18302a90d4aaSPaolo Bonzini if (cpumask_test_cpu(cpu, hctx->cpumask)) { 1831320ae51fSJens Axboe __blk_mq_run_hw_queue(hctx); 18322a90d4aaSPaolo Bonzini put_cpu(); 1833398205b8SPaolo Bonzini return; 1834e4043dcfSJens Axboe } 1835398205b8SPaolo Bonzini 18362a90d4aaSPaolo Bonzini put_cpu(); 1837398205b8SPaolo Bonzini } 1838398205b8SPaolo Bonzini 1839ae943d20SBart Van Assche kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, 18407587a5aeSBart Van Assche msecs_to_jiffies(msecs)); 18417587a5aeSBart Van Assche } 18427587a5aeSBart Van Assche 1843105663f7SAndré Almeida /** 1844105663f7SAndré Almeida * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. 
1845105663f7SAndré Almeida * @hctx: Pointer to the hardware queue to run. 1846fa94ba8aSMinwoo Im * @msecs: Milliseconds of delay to wait before running the queue. 1847105663f7SAndré Almeida * 1848105663f7SAndré Almeida * Run a hardware queue asynchronously with a delay of @msecs. 1849105663f7SAndré Almeida */ 18507587a5aeSBart Van Assche void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 18517587a5aeSBart Van Assche { 18527587a5aeSBart Van Assche __blk_mq_delay_run_hw_queue(hctx, true, msecs); 18537587a5aeSBart Van Assche } 18547587a5aeSBart Van Assche EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); 18557587a5aeSBart Van Assche 1856105663f7SAndré Almeida /** 1857105663f7SAndré Almeida * blk_mq_run_hw_queue - Start to run a hardware queue. 1858105663f7SAndré Almeida * @hctx: Pointer to the hardware queue to run. 1859105663f7SAndré Almeida * @async: If we want to run the queue asynchronously. 1860105663f7SAndré Almeida * 1861105663f7SAndré Almeida * Check if the request queue is not in a quiesced state and if there are 1862105663f7SAndré Almeida * pending requests to be sent. If this is true, run the queue to send requests 1863105663f7SAndré Almeida * to hardware. 1864105663f7SAndré Almeida */ 1865626fb735SJohn Garry void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 18667587a5aeSBart Van Assche { 186724f5a90fSMing Lei int srcu_idx; 186824f5a90fSMing Lei bool need_run; 186924f5a90fSMing Lei 187024f5a90fSMing Lei /* 187124f5a90fSMing Lei * When queue is quiesced, we may be switching io scheduler, or 187224f5a90fSMing Lei * updating nr_hw_queues, or other things, and we can't run queue 187324f5a90fSMing Lei * any more, even __blk_mq_hctx_has_pending() can't be called safely. 187424f5a90fSMing Lei * 187524f5a90fSMing Lei * And queue will be rerun in blk_mq_unquiesce_queue() if it is 187624f5a90fSMing Lei * quiesced. 187724f5a90fSMing Lei */ 187804ced159SJens Axboe hctx_lock(hctx, &srcu_idx); 187924f5a90fSMing Lei need_run = !blk_queue_quiesced(hctx->queue) && 188024f5a90fSMing Lei blk_mq_hctx_has_pending(hctx); 188104ced159SJens Axboe hctx_unlock(hctx, srcu_idx); 188224f5a90fSMing Lei 1883626fb735SJohn Garry if (need_run) 18847587a5aeSBart Van Assche __blk_mq_delay_run_hw_queue(hctx, async, 0); 1885320ae51fSJens Axboe } 18865b727272SOmar Sandoval EXPORT_SYMBOL(blk_mq_run_hw_queue); 1887320ae51fSJens Axboe 1888b6e68ee8SJan Kara /* 1889b6e68ee8SJan Kara * Is the request queue handled by an IO scheduler that does not respect 1890b6e68ee8SJan Kara * hardware queues when dispatching? 1891b6e68ee8SJan Kara */ 1892b6e68ee8SJan Kara static bool blk_mq_has_sqsched(struct request_queue *q) 1893b6e68ee8SJan Kara { 1894b6e68ee8SJan Kara struct elevator_queue *e = q->elevator; 1895b6e68ee8SJan Kara 1896b6e68ee8SJan Kara if (e && e->type->ops.dispatch_request && 1897b6e68ee8SJan Kara !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE)) 1898b6e68ee8SJan Kara return true; 1899b6e68ee8SJan Kara return false; 1900b6e68ee8SJan Kara } 1901b6e68ee8SJan Kara 1902b6e68ee8SJan Kara /* 1903b6e68ee8SJan Kara * Return prefered queue to dispatch from (if any) for non-mq aware IO 1904b6e68ee8SJan Kara * scheduler. 
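 *
 * (Example, added for illustration: with a single-queue scheduler such
 * as BFQ, which at the time of this code does not advertise
 * ELEVATOR_F_MQ_AWARE, blk_mq_has_sqsched() above returns true and the
 * loops in blk_mq_run_hw_queues()/blk_mq_delay_run_hw_queues() below
 * use the hctx returned here as a filter:
 *
 *	if (!sq_hctx || sq_hctx == hctx ||
 *	    !list_empty_careful(&hctx->dispatch))
 *		blk_mq_run_hw_queue(hctx, async);
 *
 * so on a multi-queue device only one hardware queue pounds on the
 * scheduler lock instead of every CPU's.)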
1905b6e68ee8SJan Kara */ 1906b6e68ee8SJan Kara static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) 1907b6e68ee8SJan Kara { 1908b6e68ee8SJan Kara struct blk_mq_hw_ctx *hctx; 1909b6e68ee8SJan Kara 1910b6e68ee8SJan Kara /* 1911b6e68ee8SJan Kara * If the IO scheduler does not respect hardware queues when 1912b6e68ee8SJan Kara * dispatching, we just don't bother with multiple HW queues and 1913b6e68ee8SJan Kara * dispatch from hctx for the current CPU since running multiple queues 1914b6e68ee8SJan Kara * just causes lock contention inside the scheduler and pointless cache 1915b6e68ee8SJan Kara * bouncing. 1916b6e68ee8SJan Kara */ 1917b6e68ee8SJan Kara hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, 1918b6e68ee8SJan Kara raw_smp_processor_id()); 1919b6e68ee8SJan Kara if (!blk_mq_hctx_stopped(hctx)) 1920b6e68ee8SJan Kara return hctx; 1921b6e68ee8SJan Kara return NULL; 1922b6e68ee8SJan Kara } 1923b6e68ee8SJan Kara 1924105663f7SAndré Almeida /** 192524f7bb88SMauro Carvalho Chehab * blk_mq_run_hw_queues - Run all hardware queues in a request queue. 1926105663f7SAndré Almeida * @q: Pointer to the request queue to run. 1927105663f7SAndré Almeida * @async: If we want to run the queue asynchronously. 1928105663f7SAndré Almeida */ 1929b94ec296SMike Snitzer void blk_mq_run_hw_queues(struct request_queue *q, bool async) 1930320ae51fSJens Axboe { 1931b6e68ee8SJan Kara struct blk_mq_hw_ctx *hctx, *sq_hctx; 1932320ae51fSJens Axboe int i; 1933320ae51fSJens Axboe 1934b6e68ee8SJan Kara sq_hctx = NULL; 1935b6e68ee8SJan Kara if (blk_mq_has_sqsched(q)) 1936b6e68ee8SJan Kara sq_hctx = blk_mq_get_sq_hctx(q); 1937320ae51fSJens Axboe queue_for_each_hw_ctx(q, hctx, i) { 193879f720a7SJens Axboe if (blk_mq_hctx_stopped(hctx)) 1939320ae51fSJens Axboe continue; 1940b6e68ee8SJan Kara /* 1941b6e68ee8SJan Kara * Dispatch from this hctx either if there's no hctx preferred 1942b6e68ee8SJan Kara * by IO scheduler or if it has requests that bypass the 1943b6e68ee8SJan Kara * scheduler. 1944b6e68ee8SJan Kara */ 1945b6e68ee8SJan Kara if (!sq_hctx || sq_hctx == hctx || 1946b6e68ee8SJan Kara !list_empty_careful(&hctx->dispatch)) 1947b94ec296SMike Snitzer blk_mq_run_hw_queue(hctx, async); 1948320ae51fSJens Axboe } 1949320ae51fSJens Axboe } 1950b94ec296SMike Snitzer EXPORT_SYMBOL(blk_mq_run_hw_queues); 1951320ae51fSJens Axboe 1952fd001443SBart Van Assche /** 1953b9151e7bSDouglas Anderson * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. 1954b9151e7bSDouglas Anderson * @q: Pointer to the request queue to run. 1955fa94ba8aSMinwoo Im * @msecs: Milliseconds of delay to wait before running the queues. 1956b9151e7bSDouglas Anderson */ 1957b9151e7bSDouglas Anderson void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) 1958b9151e7bSDouglas Anderson { 1959b6e68ee8SJan Kara struct blk_mq_hw_ctx *hctx, *sq_hctx; 1960b9151e7bSDouglas Anderson int i; 1961b9151e7bSDouglas Anderson 1962b6e68ee8SJan Kara sq_hctx = NULL; 1963b6e68ee8SJan Kara if (blk_mq_has_sqsched(q)) 1964b6e68ee8SJan Kara sq_hctx = blk_mq_get_sq_hctx(q); 1965b9151e7bSDouglas Anderson queue_for_each_hw_ctx(q, hctx, i) { 1966b9151e7bSDouglas Anderson if (blk_mq_hctx_stopped(hctx)) 1967b9151e7bSDouglas Anderson continue; 1968b6e68ee8SJan Kara /* 1969b6e68ee8SJan Kara * Dispatch from this hctx either if there's no hctx preferred 1970b6e68ee8SJan Kara * by IO scheduler or if it has requests that bypass the 1971b6e68ee8SJan Kara * scheduler. 
1972b6e68ee8SJan Kara */ 1973b6e68ee8SJan Kara if (!sq_hctx || sq_hctx == hctx || 1974b6e68ee8SJan Kara !list_empty_careful(&hctx->dispatch)) 1975b9151e7bSDouglas Anderson blk_mq_delay_run_hw_queue(hctx, msecs); 1976b9151e7bSDouglas Anderson } 1977b9151e7bSDouglas Anderson } 1978b9151e7bSDouglas Anderson EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); 1979b9151e7bSDouglas Anderson 1980b9151e7bSDouglas Anderson /** 1981fd001443SBart Van Assche * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped 1982fd001443SBart Van Assche * @q: request queue. 1983fd001443SBart Van Assche * 1984fd001443SBart Van Assche * The caller is responsible for serializing this function against 1985fd001443SBart Van Assche * blk_mq_{start,stop}_hw_queue(). 1986fd001443SBart Van Assche */ 1987fd001443SBart Van Assche bool blk_mq_queue_stopped(struct request_queue *q) 1988fd001443SBart Van Assche { 1989fd001443SBart Van Assche struct blk_mq_hw_ctx *hctx; 1990fd001443SBart Van Assche int i; 1991fd001443SBart Van Assche 1992fd001443SBart Van Assche queue_for_each_hw_ctx(q, hctx, i) 1993fd001443SBart Van Assche if (blk_mq_hctx_stopped(hctx)) 1994fd001443SBart Van Assche return true; 1995fd001443SBart Van Assche 1996fd001443SBart Van Assche return false; 1997fd001443SBart Van Assche } 1998fd001443SBart Van Assche EXPORT_SYMBOL(blk_mq_queue_stopped); 1999fd001443SBart Van Assche 200039a70c76SMing Lei /* 200139a70c76SMing Lei * This function is often used for pausing .queue_rq() by driver when 200239a70c76SMing Lei * there isn't enough resource or some conditions aren't satisfied, and 20034d606219SBart Van Assche * BLK_STS_RESOURCE is usually returned. 200439a70c76SMing Lei * 200539a70c76SMing Lei * We do not guarantee that dispatch can be drained or blocked 200639a70c76SMing Lei * after blk_mq_stop_hw_queue() returns. Please use 200739a70c76SMing Lei * blk_mq_quiesce_queue() for that requirement. 200839a70c76SMing Lei */ 2009320ae51fSJens Axboe void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 2010320ae51fSJens Axboe { 2011641a9ed6SMing Lei cancel_delayed_work(&hctx->run_work); 2012641a9ed6SMing Lei 2013641a9ed6SMing Lei set_bit(BLK_MQ_S_STOPPED, &hctx->state); 2014320ae51fSJens Axboe } 2015320ae51fSJens Axboe EXPORT_SYMBOL(blk_mq_stop_hw_queue); 2016320ae51fSJens Axboe 201739a70c76SMing Lei /* 201839a70c76SMing Lei * This function is often used for pausing .queue_rq() by driver when 201939a70c76SMing Lei * there isn't enough resource or some conditions aren't satisfied, and 20204d606219SBart Van Assche * BLK_STS_RESOURCE is usually returned. 202139a70c76SMing Lei * 202239a70c76SMing Lei * We do not guarantee that dispatch can be drained or blocked 202339a70c76SMing Lei * after blk_mq_stop_hw_queues() returns. Please use 202439a70c76SMing Lei * blk_mq_quiesce_queue() for that requirement. 
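 *
 * (Hypothetical driver-side sketch, for illustration only:
 *
 *	// in ->queue_rq(), device ran out of ring space:
 *	blk_mq_stop_hw_queue(hctx);
 *	return BLK_STS_RESOURCE;
 *
 *	// later, from the driver's completion handler:
 *	blk_mq_start_stopped_hw_queues(q, true);
 *
 * note this only parks future queue runs; a ->queue_rq() already
 * executing on another CPU is not drained or blocked, which is exactly
 * why blk_mq_quiesce_queue() is recommended above for that case.)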
202539a70c76SMing Lei */ 20262719aa21SJens Axboe void blk_mq_stop_hw_queues(struct request_queue *q) 20272719aa21SJens Axboe { 2028641a9ed6SMing Lei struct blk_mq_hw_ctx *hctx; 2029641a9ed6SMing Lei int i; 2030641a9ed6SMing Lei 2031641a9ed6SMing Lei queue_for_each_hw_ctx(q, hctx, i) 2032641a9ed6SMing Lei blk_mq_stop_hw_queue(hctx); 2033280d45f6SChristoph Hellwig } 2034280d45f6SChristoph Hellwig EXPORT_SYMBOL(blk_mq_stop_hw_queues); 2035280d45f6SChristoph Hellwig 2036320ae51fSJens Axboe void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 2037320ae51fSJens Axboe { 2038320ae51fSJens Axboe clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 2039e4043dcfSJens Axboe 20400ffbce80SJens Axboe blk_mq_run_hw_queue(hctx, false); 2041320ae51fSJens Axboe } 2042320ae51fSJens Axboe EXPORT_SYMBOL(blk_mq_start_hw_queue); 2043320ae51fSJens Axboe 20442f268556SChristoph Hellwig void blk_mq_start_hw_queues(struct request_queue *q) 20452f268556SChristoph Hellwig { 20462f268556SChristoph Hellwig struct blk_mq_hw_ctx *hctx; 20472f268556SChristoph Hellwig int i; 20482f268556SChristoph Hellwig 20492f268556SChristoph Hellwig queue_for_each_hw_ctx(q, hctx, i) 20502f268556SChristoph Hellwig blk_mq_start_hw_queue(hctx); 20512f268556SChristoph Hellwig } 20522f268556SChristoph Hellwig EXPORT_SYMBOL(blk_mq_start_hw_queues); 20532f268556SChristoph Hellwig 2054ae911c5eSJens Axboe void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 2055ae911c5eSJens Axboe { 2056ae911c5eSJens Axboe if (!blk_mq_hctx_stopped(hctx)) 2057ae911c5eSJens Axboe return; 2058ae911c5eSJens Axboe 2059ae911c5eSJens Axboe clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 2060ae911c5eSJens Axboe blk_mq_run_hw_queue(hctx, async); 2061ae911c5eSJens Axboe } 2062ae911c5eSJens Axboe EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); 2063ae911c5eSJens Axboe 20641b4a3258SChristoph Hellwig void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 2065320ae51fSJens Axboe { 2066320ae51fSJens Axboe struct blk_mq_hw_ctx *hctx; 2067320ae51fSJens Axboe int i; 2068320ae51fSJens Axboe 2069ae911c5eSJens Axboe queue_for_each_hw_ctx(q, hctx, i) 2070ae911c5eSJens Axboe blk_mq_start_stopped_hw_queue(hctx, async); 2071320ae51fSJens Axboe } 2072320ae51fSJens Axboe EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 2073320ae51fSJens Axboe 207470f4db63SChristoph Hellwig static void blk_mq_run_work_fn(struct work_struct *work) 2075320ae51fSJens Axboe { 2076320ae51fSJens Axboe struct blk_mq_hw_ctx *hctx; 2077320ae51fSJens Axboe 20789f993737SJens Axboe hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); 207921c6e939SJens Axboe 208021c6e939SJens Axboe /* 208115fe8a90SMing Lei * If we are stopped, don't run the queue. 
208221c6e939SJens Axboe */ 20830841031aSYufen Yu if (blk_mq_hctx_stopped(hctx)) 20840196d6b4SJianchao Wang return; 2085e4043dcfSJens Axboe 2086320ae51fSJens Axboe __blk_mq_run_hw_queue(hctx); 2087320ae51fSJens Axboe } 2088320ae51fSJens Axboe 2089cfd0c552SMing Lei static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, 2090cfd0c552SMing Lei struct request *rq, 2091cfd0c552SMing Lei bool at_head) 2092320ae51fSJens Axboe { 2093e57690feSJens Axboe struct blk_mq_ctx *ctx = rq->mq_ctx; 2094c16d6b5aSMing Lei enum hctx_type type = hctx->type; 2095e57690feSJens Axboe 20967b607814SBart Van Assche lockdep_assert_held(&ctx->lock); 20977b607814SBart Van Assche 2098a54895faSChristoph Hellwig trace_block_rq_insert(rq); 209901b983c9SJens Axboe 210072a0a36eSChristoph Hellwig if (at_head) 2101c16d6b5aSMing Lei list_add(&rq->queuelist, &ctx->rq_lists[type]); 210272a0a36eSChristoph Hellwig else 2103c16d6b5aSMing Lei list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); 2104cfd0c552SMing Lei } 21054bb659b1SJens Axboe 21062c3ad667SJens Axboe void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 21072c3ad667SJens Axboe bool at_head) 2108cfd0c552SMing Lei { 2109cfd0c552SMing Lei struct blk_mq_ctx *ctx = rq->mq_ctx; 2110cfd0c552SMing Lei 21117b607814SBart Van Assche lockdep_assert_held(&ctx->lock); 21127b607814SBart Van Assche 2113e57690feSJens Axboe __blk_mq_insert_req_list(hctx, rq, at_head); 2114320ae51fSJens Axboe blk_mq_hctx_mark_pending(hctx, ctx); 2115320ae51fSJens Axboe } 2116320ae51fSJens Axboe 2117105663f7SAndré Almeida /** 2118105663f7SAndré Almeida * blk_mq_request_bypass_insert - Insert a request at dispatch list. 2119105663f7SAndré Almeida * @rq: Pointer to request to be inserted. 212026bfeb26SRandy Dunlap * @at_head: true if the request should be inserted at the head of the list. 2121105663f7SAndré Almeida * @run_queue: If we should run the hardware queue after inserting the request. 2122105663f7SAndré Almeida * 2123157f377bSJens Axboe * Should only be used carefully, when the caller knows we want to 2124157f377bSJens Axboe * bypass a potential IO scheduler on the target device. 
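 *
 * (Illustration, not part of the original kernel-doc: one typical user
 * is a request bounced back from direct issue, as in
 * blk_mq_try_issue_directly() further down:
 *
 *	if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE)
 *		blk_mq_request_bypass_insert(rq, false, true);
 *
 * requests parked on hctx->dispatch this way are picked up before the
 * elevator is asked for more work on the next queue run.)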
2125157f377bSJens Axboe */ 212601e99aecSMing Lei void blk_mq_request_bypass_insert(struct request *rq, bool at_head, 212701e99aecSMing Lei bool run_queue) 2128157f377bSJens Axboe { 2129ea4f995eSJens Axboe struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 2130157f377bSJens Axboe 2131157f377bSJens Axboe spin_lock(&hctx->lock); 213201e99aecSMing Lei if (at_head) 213301e99aecSMing Lei list_add(&rq->queuelist, &hctx->dispatch); 213401e99aecSMing Lei else 2135157f377bSJens Axboe list_add_tail(&rq->queuelist, &hctx->dispatch); 2136157f377bSJens Axboe spin_unlock(&hctx->lock); 2137157f377bSJens Axboe 2138b0850297SMing Lei if (run_queue) 2139157f377bSJens Axboe blk_mq_run_hw_queue(hctx, false); 2140157f377bSJens Axboe } 2141157f377bSJens Axboe 2142bd166ef1SJens Axboe void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 2143bd166ef1SJens Axboe struct list_head *list) 2144320ae51fSJens Axboe 2145320ae51fSJens Axboe { 21463f0cedc7SMing Lei struct request *rq; 2147c16d6b5aSMing Lei enum hctx_type type = hctx->type; 21483f0cedc7SMing Lei 2149320ae51fSJens Axboe /* 2150320ae51fSJens Axboe * preemption doesn't flush plug list, so it's possible ctx->cpu is 2151320ae51fSJens Axboe * offline now 2152320ae51fSJens Axboe */ 21533f0cedc7SMing Lei list_for_each_entry(rq, list, queuelist) { 2154e57690feSJens Axboe BUG_ON(rq->mq_ctx != ctx); 2155a54895faSChristoph Hellwig trace_block_rq_insert(rq); 2156320ae51fSJens Axboe } 21573f0cedc7SMing Lei 21583f0cedc7SMing Lei spin_lock(&ctx->lock); 2159c16d6b5aSMing Lei list_splice_tail_init(list, &ctx->rq_lists[type]); 2160cfd0c552SMing Lei blk_mq_hctx_mark_pending(hctx, ctx); 2161320ae51fSJens Axboe spin_unlock(&ctx->lock); 2162320ae51fSJens Axboe } 2163320ae51fSJens Axboe 21644f0f586bSSami Tolvanen static int plug_rq_cmp(void *priv, const struct list_head *a, 21654f0f586bSSami Tolvanen const struct list_head *b) 2166320ae51fSJens Axboe { 2167320ae51fSJens Axboe struct request *rqa = container_of(a, struct request, queuelist); 2168320ae51fSJens Axboe struct request *rqb = container_of(b, struct request, queuelist); 2169320ae51fSJens Axboe 21707d30a621SPavel Begunkov if (rqa->mq_ctx != rqb->mq_ctx) 21717d30a621SPavel Begunkov return rqa->mq_ctx > rqb->mq_ctx; 21727d30a621SPavel Begunkov if (rqa->mq_hctx != rqb->mq_hctx) 21737d30a621SPavel Begunkov return rqa->mq_hctx > rqb->mq_hctx; 21743110fc79SJens Axboe 21753110fc79SJens Axboe return blk_rq_pos(rqa) > blk_rq_pos(rqb); 2176320ae51fSJens Axboe } 2177320ae51fSJens Axboe 2178320ae51fSJens Axboe void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 2179320ae51fSJens Axboe { 2180320ae51fSJens Axboe LIST_HEAD(list); 2181320ae51fSJens Axboe 218295ed0c5bSPavel Begunkov if (list_empty(&plug->mq_list)) 218395ed0c5bSPavel Begunkov return; 2184320ae51fSJens Axboe list_splice_init(&plug->mq_list, &list); 2185320ae51fSJens Axboe 2186ce5b009cSJens Axboe if (plug->rq_count > 2 && plug->multiple_queues) 21873110fc79SJens Axboe list_sort(NULL, &list, plug_rq_cmp); 2188320ae51fSJens Axboe 2189bcc816dfSDongli Zhang plug->rq_count = 0; 2190bcc816dfSDongli Zhang 219195ed0c5bSPavel Begunkov do { 219295ed0c5bSPavel Begunkov struct list_head rq_list; 219395ed0c5bSPavel Begunkov struct request *rq, *head_rq = list_entry_rq(list.next); 219495ed0c5bSPavel Begunkov struct list_head *pos = &head_rq->queuelist; /* skip first */ 219595ed0c5bSPavel Begunkov struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx; 219695ed0c5bSPavel Begunkov struct blk_mq_ctx *this_ctx = head_rq->mq_ctx; 219795ed0c5bSPavel Begunkov 
unsigned int depth = 1; 2198320ae51fSJens Axboe 219995ed0c5bSPavel Begunkov list_for_each_continue(pos, &list) { 220095ed0c5bSPavel Begunkov rq = list_entry_rq(pos); 2201320ae51fSJens Axboe BUG_ON(!rq->q); 220295ed0c5bSPavel Begunkov if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) 220395ed0c5bSPavel Begunkov break; 2204320ae51fSJens Axboe depth++; 2205320ae51fSJens Axboe } 2206320ae51fSJens Axboe 220795ed0c5bSPavel Begunkov list_cut_before(&rq_list, &list, pos); 220895ed0c5bSPavel Begunkov trace_block_unplug(head_rq->q, depth, !from_schedule); 220967cae4c9SJens Axboe blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, 2210320ae51fSJens Axboe from_schedule); 221195ed0c5bSPavel Begunkov } while(!list_empty(&list)); 2212320ae51fSJens Axboe } 2213320ae51fSJens Axboe 221414ccb66bSChristoph Hellwig static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, 221514ccb66bSChristoph Hellwig unsigned int nr_segs) 2216320ae51fSJens Axboe { 221793f221aeSEric Biggers int err; 221893f221aeSEric Biggers 2219f924cddeSChristoph Hellwig if (bio->bi_opf & REQ_RAHEAD) 2220f924cddeSChristoph Hellwig rq->cmd_flags |= REQ_FAILFAST_MASK; 2221f924cddeSChristoph Hellwig 2222f924cddeSChristoph Hellwig rq->__sector = bio->bi_iter.bi_sector; 2223f924cddeSChristoph Hellwig rq->write_hint = bio->bi_write_hint; 222414ccb66bSChristoph Hellwig blk_rq_bio_prep(rq, bio, nr_segs); 222593f221aeSEric Biggers 222693f221aeSEric Biggers /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ 222793f221aeSEric Biggers err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); 222893f221aeSEric Biggers WARN_ON_ONCE(err); 22294b570521SJens Axboe 2230b5af37abSKonstantin Khlebnikov blk_account_io_start(rq); 2231320ae51fSJens Axboe } 2232320ae51fSJens Axboe 22330f95549cSMike Snitzer static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, 22343e08773cSChristoph Hellwig struct request *rq, bool last) 2235f984df1fSShaohua Li { 2236f984df1fSShaohua Li struct request_queue *q = rq->q; 2237f984df1fSShaohua Li struct blk_mq_queue_data bd = { 2238f984df1fSShaohua Li .rq = rq, 2239be94f058SJens Axboe .last = last, 2240f984df1fSShaohua Li }; 2241f06345adSJens Axboe blk_status_t ret; 22420f95549cSMike Snitzer 22430f95549cSMike Snitzer /* 22440f95549cSMike Snitzer * For OK queue, we are done. For error, caller may kill it. 22450f95549cSMike Snitzer * Any other error (busy), just add it to our list as we 22460f95549cSMike Snitzer * previously would have done. 
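 *
 * (Clarifying example, added for illustration: "busy" covers two
 * return values from ->queue_rq():
 *
 *	return BLK_STS_RESOURCE;	// generic shortage; blk-mq may need a
 *					// delayed rerun to make progress
 *	return BLK_STS_DEV_RESOURCE;	// device-specific shortage; the driver
 *					// guarantees the queue gets rerun once
 *					// the resource frees up
 *
 * both take the requeue branch below; the delayed-rerun logic in
 * blk_mq_dispatch_rq_list() is what treats them differently.)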
22470f95549cSMike Snitzer */ 22480f95549cSMike Snitzer ret = q->mq_ops->queue_rq(hctx, &bd); 22490f95549cSMike Snitzer switch (ret) { 22500f95549cSMike Snitzer case BLK_STS_OK: 22516ce3dd6eSMing Lei blk_mq_update_dispatch_busy(hctx, false); 22520f95549cSMike Snitzer break; 22530f95549cSMike Snitzer case BLK_STS_RESOURCE: 225486ff7c2aSMing Lei case BLK_STS_DEV_RESOURCE: 22556ce3dd6eSMing Lei blk_mq_update_dispatch_busy(hctx, true); 22560f95549cSMike Snitzer __blk_mq_requeue_request(rq); 22570f95549cSMike Snitzer break; 22580f95549cSMike Snitzer default: 22596ce3dd6eSMing Lei blk_mq_update_dispatch_busy(hctx, false); 22600f95549cSMike Snitzer break; 22610f95549cSMike Snitzer } 22620f95549cSMike Snitzer 22630f95549cSMike Snitzer return ret; 22640f95549cSMike Snitzer } 22650f95549cSMike Snitzer 2266fd9c40f6SBart Van Assche static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 22670f95549cSMike Snitzer struct request *rq, 2268fd9c40f6SBart Van Assche bool bypass_insert, bool last) 22690f95549cSMike Snitzer { 22700f95549cSMike Snitzer struct request_queue *q = rq->q; 2271d964f04aSMing Lei bool run_queue = true; 22722a5a24aaSMing Lei int budget_token; 2273d964f04aSMing Lei 227423d4ee19SMing Lei /* 2275fd9c40f6SBart Van Assche * RCU or SRCU read lock is needed before checking quiesced flag. 227623d4ee19SMing Lei * 2277fd9c40f6SBart Van Assche * When queue is stopped or quiesced, ignore 'bypass_insert' from 2278fd9c40f6SBart Van Assche * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, 2279fd9c40f6SBart Van Assche * and avoid driver to try to dispatch again. 228023d4ee19SMing Lei */ 2281fd9c40f6SBart Van Assche if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { 2282d964f04aSMing Lei run_queue = false; 2283fd9c40f6SBart Van Assche bypass_insert = false; 2284fd9c40f6SBart Van Assche goto insert; 2285d964f04aSMing Lei } 2286f984df1fSShaohua Li 22872ff0682dSJens Axboe if ((rq->rq_flags & RQF_ELV) && !bypass_insert) 2288fd9c40f6SBart Van Assche goto insert; 22892253efc8SBart Van Assche 22902a5a24aaSMing Lei budget_token = blk_mq_get_dispatch_budget(q); 22912a5a24aaSMing Lei if (budget_token < 0) 2292fd9c40f6SBart Van Assche goto insert; 2293bd166ef1SJens Axboe 22942a5a24aaSMing Lei blk_mq_set_rq_budget_token(rq, budget_token); 22952a5a24aaSMing Lei 22968ab6bb9eSMing Lei if (!blk_mq_get_driver_tag(rq)) { 22972a5a24aaSMing Lei blk_mq_put_dispatch_budget(q, budget_token); 2298fd9c40f6SBart Van Assche goto insert; 229988022d72SMing Lei } 2300de148297SMing Lei 23013e08773cSChristoph Hellwig return __blk_mq_issue_directly(hctx, rq, last); 2302fd9c40f6SBart Van Assche insert: 2303fd9c40f6SBart Van Assche if (bypass_insert) 2304fd9c40f6SBart Van Assche return BLK_STS_RESOURCE; 2305fd9c40f6SBart Van Assche 2306db03f88fSMing Lei blk_mq_sched_insert_request(rq, false, run_queue, false); 2307db03f88fSMing Lei 2308fd9c40f6SBart Van Assche return BLK_STS_OK; 23097f556a44SJianchao Wang } 2310fd9c40f6SBart Van Assche 2311105663f7SAndré Almeida /** 2312105663f7SAndré Almeida * blk_mq_try_issue_directly - Try to send a request directly to device driver. 2313105663f7SAndré Almeida * @hctx: Pointer of the associated hardware queue. 2314105663f7SAndré Almeida * @rq: Pointer to request to be sent. 2315105663f7SAndré Almeida * 2316105663f7SAndré Almeida * If the device has enough resources to accept a new request now, send the 2317105663f7SAndré Almeida * request directly to device driver. 
Else, insert at hctx->dispatch queue, so 2318105663f7SAndré Almeida * we can try send it another time in the future. Requests inserted at this 2319105663f7SAndré Almeida * queue have higher priority. 2320105663f7SAndré Almeida */ 2321fd9c40f6SBart Van Assche static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 23223e08773cSChristoph Hellwig struct request *rq) 2323fd9c40f6SBart Van Assche { 2324fd9c40f6SBart Van Assche blk_status_t ret; 2325fd9c40f6SBart Van Assche int srcu_idx; 2326fd9c40f6SBart Van Assche 2327fd9c40f6SBart Van Assche might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); 2328fd9c40f6SBart Van Assche 2329fd9c40f6SBart Van Assche hctx_lock(hctx, &srcu_idx); 2330fd9c40f6SBart Van Assche 23313e08773cSChristoph Hellwig ret = __blk_mq_try_issue_directly(hctx, rq, false, true); 2332fd9c40f6SBart Van Assche if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) 233301e99aecSMing Lei blk_mq_request_bypass_insert(rq, false, true); 2334fd9c40f6SBart Van Assche else if (ret != BLK_STS_OK) 23357f556a44SJianchao Wang blk_mq_end_request(rq, ret); 2336fd9c40f6SBart Van Assche 2337fd9c40f6SBart Van Assche hctx_unlock(hctx, srcu_idx); 23387f556a44SJianchao Wang } 23397f556a44SJianchao Wang 2340fd9c40f6SBart Van Assche blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) 2341fd9c40f6SBart Van Assche { 2342fd9c40f6SBart Van Assche blk_status_t ret; 2343fd9c40f6SBart Van Assche int srcu_idx; 2344fd9c40f6SBart Van Assche struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 2345fd9c40f6SBart Van Assche 2346fd9c40f6SBart Van Assche hctx_lock(hctx, &srcu_idx); 23473e08773cSChristoph Hellwig ret = __blk_mq_try_issue_directly(hctx, rq, true, last); 2348fd9c40f6SBart Van Assche hctx_unlock(hctx, srcu_idx); 2349fd9c40f6SBart Van Assche 23507f556a44SJianchao Wang return ret; 23515eb6126eSChristoph Hellwig } 23525eb6126eSChristoph Hellwig 23536ce3dd6eSMing Lei void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, 23546ce3dd6eSMing Lei struct list_head *list) 23556ce3dd6eSMing Lei { 2356536167d4SKeith Busch int queued = 0; 2357632bfb63Syangerkun int errors = 0; 2358536167d4SKeith Busch 23596ce3dd6eSMing Lei while (!list_empty(list)) { 2360fd9c40f6SBart Van Assche blk_status_t ret; 23616ce3dd6eSMing Lei struct request *rq = list_first_entry(list, struct request, 23626ce3dd6eSMing Lei queuelist); 23636ce3dd6eSMing Lei 23646ce3dd6eSMing Lei list_del_init(&rq->queuelist); 2365fd9c40f6SBart Van Assche ret = blk_mq_request_issue_directly(rq, list_empty(list)); 2366fd9c40f6SBart Van Assche if (ret != BLK_STS_OK) { 2367fd9c40f6SBart Van Assche if (ret == BLK_STS_RESOURCE || 2368fd9c40f6SBart Van Assche ret == BLK_STS_DEV_RESOURCE) { 236901e99aecSMing Lei blk_mq_request_bypass_insert(rq, false, 2370c616cbeeSJens Axboe list_empty(list)); 2371fd9c40f6SBart Van Assche break; 2372fd9c40f6SBart Van Assche } 2373fd9c40f6SBart Van Assche blk_mq_end_request(rq, ret); 2374632bfb63Syangerkun errors++; 2375536167d4SKeith Busch } else 2376536167d4SKeith Busch queued++; 23776ce3dd6eSMing Lei } 2378d666ba98SJens Axboe 2379d666ba98SJens Axboe /* 2380d666ba98SJens Axboe * If we didn't flush the entire list, we could have told 2381d666ba98SJens Axboe * the driver there was more coming, but that turned out to 2382d666ba98SJens Axboe * be a lie. 
2383d666ba98SJens Axboe */ 2384632bfb63Syangerkun if ((!list_empty(list) || errors) && 2385632bfb63Syangerkun hctx->queue->mq_ops->commit_rqs && queued) 2386d666ba98SJens Axboe hctx->queue->mq_ops->commit_rqs(hctx); 23876ce3dd6eSMing Lei } 23886ce3dd6eSMing Lei 2389ce5b009cSJens Axboe static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) 2390ce5b009cSJens Axboe { 2391ce5b009cSJens Axboe list_add_tail(&rq->queuelist, &plug->mq_list); 2392ce5b009cSJens Axboe plug->rq_count++; 2393ce5b009cSJens Axboe if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { 2394ce5b009cSJens Axboe struct request *tmp; 2395ce5b009cSJens Axboe 2396ce5b009cSJens Axboe tmp = list_first_entry(&plug->mq_list, struct request, 2397ce5b009cSJens Axboe queuelist); 2398ce5b009cSJens Axboe if (tmp->q != rq->q) 2399ce5b009cSJens Axboe plug->multiple_queues = true; 2400ce5b009cSJens Axboe } 2401ce5b009cSJens Axboe } 2402ce5b009cSJens Axboe 24037f2a6a69SSong Liu /* 2404ba0ffdd8SJens Axboe * Allow 2x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple 24057f2a6a69SSong Liu * queues. This is important for md arrays to benefit from merging 24067f2a6a69SSong Liu * requests. 24077f2a6a69SSong Liu */ 24087f2a6a69SSong Liu static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) 24097f2a6a69SSong Liu { 24107f2a6a69SSong Liu if (plug->multiple_queues) 2411ba0ffdd8SJens Axboe return BLK_MAX_REQUEST_COUNT * 2; 24127f2a6a69SSong Liu return BLK_MAX_REQUEST_COUNT; 24137f2a6a69SSong Liu } 24147f2a6a69SSong Liu 2415105663f7SAndré Almeida /** 2416c62b37d9SChristoph Hellwig * blk_mq_submit_bio - Create and send a request to block device. 2417105663f7SAndré Almeida * @bio: Bio pointer. 2418105663f7SAndré Almeida * 2419105663f7SAndré Almeida * Builds up a request structure from @q and @bio and send to the device. The 2420105663f7SAndré Almeida * request may not be queued directly to hardware if: 2421105663f7SAndré Almeida * * This request can be merged with another one 2422105663f7SAndré Almeida * * We want to place request at plug queue for possible future merging 2423105663f7SAndré Almeida * * There is an IO scheduler active at this queue 2424105663f7SAndré Almeida * 2425105663f7SAndré Almeida * It will not queue the request if there is an error with the bio, or at the 2426105663f7SAndré Almeida * request creation. 
2427105663f7SAndré Almeida */ 24283e08773cSChristoph Hellwig void blk_mq_submit_bio(struct bio *bio) 242907068d5bSJens Axboe { 2430ed6cddefSPavel Begunkov struct request_queue *q = bdev_get_queue(bio->bi_bdev); 2431ef295ecfSChristoph Hellwig const int is_sync = op_is_sync(bio->bi_opf); 2432f73f44ebSChristoph Hellwig const int is_flush_fua = op_is_flush(bio->bi_opf); 243307068d5bSJens Axboe struct request *rq; 2434f984df1fSShaohua Li struct blk_plug *plug; 24355b3f341fSShaohua Li struct request *same_queue_rq = NULL; 2436abd45c15SJens Axboe unsigned int nr_segs = 1; 2437a892c8d5SSatya Tangirala blk_status_t ret; 243807068d5bSJens Axboe 243907068d5bSJens Axboe blk_queue_bounce(q, &bio); 2440abd45c15SJens Axboe if (blk_may_split(q, bio)) 2441abd45c15SJens Axboe __blk_queue_split(q, &bio, &nr_segs); 2442f36ea50cSWen Xiong 2443e23947bdSDmitry Monakhov if (!bio_integrity_prep(bio)) 2444ac7c5675SChristoph Hellwig goto queue_exit; 244507068d5bSJens Axboe 244687c279e6SOmar Sandoval if (!is_flush_fua && !blk_queue_nomerges(q) && 244714ccb66bSChristoph Hellwig blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) 2448ac7c5675SChristoph Hellwig goto queue_exit; 2449f984df1fSShaohua Li 245014ccb66bSChristoph Hellwig if (blk_mq_sched_bio_merge(q, bio, nr_segs)) 2451ac7c5675SChristoph Hellwig goto queue_exit; 2452bd166ef1SJens Axboe 2453d5337560SChristoph Hellwig rq_qos_throttle(q, bio); 245487760e5eSJens Axboe 245547c122e3SJens Axboe plug = blk_mq_plug(q, bio); 245647c122e3SJens Axboe if (plug && plug->cached_rq) { 2457013a7f95SJens Axboe rq = rq_list_pop(&plug->cached_rq); 245847c122e3SJens Axboe INIT_LIST_HEAD(&rq->queuelist); 245947c122e3SJens Axboe } else { 24600f38d766SChristoph Hellwig struct blk_mq_alloc_data data = { 24610f38d766SChristoph Hellwig .q = q, 24620f38d766SChristoph Hellwig .nr_tags = 1, 24630f38d766SChristoph Hellwig .cmd_flags = bio->bi_opf, 24640f38d766SChristoph Hellwig }; 24650f38d766SChristoph Hellwig 246647c122e3SJens Axboe if (plug) { 246747c122e3SJens Axboe data.nr_tags = plug->nr_ios; 246847c122e3SJens Axboe plug->nr_ios = 1; 246947c122e3SJens Axboe data.cached_rq = &plug->cached_rq; 247047c122e3SJens Axboe } 2471b90cfaedSChristoph Hellwig rq = __blk_mq_alloc_requests(&data); 247287760e5eSJens Axboe if (unlikely(!rq)) { 2473c1c80384SJosef Bacik rq_qos_cleanup(q, bio); 24747b6620d7SJens Axboe if (bio->bi_opf & REQ_NOWAIT) 247503a07c92SGoldwyn Rodrigues bio_wouldblock_error(bio); 2476ac7c5675SChristoph Hellwig goto queue_exit; 247787760e5eSJens Axboe } 247847c122e3SJens Axboe } 247987760e5eSJens Axboe 2480e8a676d6SChristoph Hellwig trace_block_getrq(bio); 2481d6f1dda2SXiaoguang Wang 2482c1c80384SJosef Bacik rq_qos_track(q, rq, bio); 248307068d5bSJens Axboe 248414ccb66bSChristoph Hellwig blk_mq_bio_to_request(rq, bio, nr_segs); 2485923218f6SMing Lei 2486a892c8d5SSatya Tangirala ret = blk_crypto_init_request(rq); 2487a892c8d5SSatya Tangirala if (ret != BLK_STS_OK) { 2488a892c8d5SSatya Tangirala bio->bi_status = ret; 2489a892c8d5SSatya Tangirala bio_endio(bio); 2490a892c8d5SSatya Tangirala blk_mq_free_request(rq); 24913e08773cSChristoph Hellwig return; 2492a892c8d5SSatya Tangirala } 2493a892c8d5SSatya Tangirala 2494970d168dSBart Van Assche if (unlikely(is_flush_fua)) { 24954a60f360SJens Axboe struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 2496105663f7SAndré Almeida /* Bypass scheduler for flush requests */ 2497320ae51fSJens Axboe blk_insert_flush(rq); 24984a60f360SJens Axboe blk_mq_run_hw_queue(hctx, true); 249903f26d8fSMing Lei } else if (plug && (q->nr_hw_queues == 1 || 
2500079a2e3eSJohn Garry blk_mq_is_shared_tags(rq->mq_hctx->flags) || 250103f26d8fSMing Lei q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { 2502b2c5d16bSJens Axboe /* 2503b2c5d16bSJens Axboe * Use plugging if we have a ->commit_rqs() hook as well, as 2504b2c5d16bSJens Axboe * we know the driver uses bd->last in a smart fashion. 25053154df26SMing Lei * 25063154df26SMing Lei * Use normal plugging if this disk is slow HDD, as sequential 25073154df26SMing Lei * IO may benefit a lot from plug merging. 2508b2c5d16bSJens Axboe */ 25095f0ed774SJens Axboe unsigned int request_count = plug->rq_count; 2510600271d9SShaohua Li struct request *last = NULL; 2511600271d9SShaohua Li 2512676d0607SMing Lei if (!request_count) 2513320ae51fSJens Axboe trace_block_plug(q); 2514600271d9SShaohua Li else 2515600271d9SShaohua Li last = list_entry_rq(plug->mq_list.prev); 2516b094f89cSJens Axboe 25177f2a6a69SSong Liu if (request_count >= blk_plug_max_rq_count(plug) || (last && 2518600271d9SShaohua Li blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { 2519320ae51fSJens Axboe blk_flush_plug_list(plug, false); 2520320ae51fSJens Axboe trace_block_plug(q); 2521320ae51fSJens Axboe } 2522b094f89cSJens Axboe 2523ce5b009cSJens Axboe blk_add_rq_to_plug(plug, rq); 25242ff0682dSJens Axboe } else if (rq->rq_flags & RQF_ELV) { 2525105663f7SAndré Almeida /* Insert the request at the IO scheduler queue */ 2526a12de1d4SMing Lei blk_mq_sched_insert_request(rq, false, true, true); 25272299722cSChristoph Hellwig } else if (plug && !blk_queue_nomerges(q)) { 2528320ae51fSJens Axboe /* 2529320ae51fSJens Axboe * We do limited plugging. If the bio can be merged, do that. 2530320ae51fSJens Axboe * Otherwise the existing request in the plug list will be 2531320ae51fSJens Axboe * issued. So the plug list will have one request at most 25322299722cSChristoph Hellwig * The plug list might get flushed before this. If that happens, 25332299722cSChristoph Hellwig * the plug list is empty, and same_queue_rq is invalid. 2534320ae51fSJens Axboe */ 25352299722cSChristoph Hellwig if (list_empty(&plug->mq_list)) 25362299722cSChristoph Hellwig same_queue_rq = NULL; 25374711b573SJens Axboe if (same_queue_rq) { 25382299722cSChristoph Hellwig list_del_init(&same_queue_rq->queuelist); 25394711b573SJens Axboe plug->rq_count--; 25404711b573SJens Axboe } 2541ce5b009cSJens Axboe blk_add_rq_to_plug(plug, rq); 2542ff3b74b8SYufen Yu trace_block_plug(q); 25432299722cSChristoph Hellwig 2544dad7a3beSMing Lei if (same_queue_rq) { 2545ff3b74b8SYufen Yu trace_block_unplug(q, 1, true); 25460f38d766SChristoph Hellwig blk_mq_try_issue_directly(same_queue_rq->mq_hctx, 25473e08773cSChristoph Hellwig same_queue_rq); 2548dad7a3beSMing Lei } 2549a12de1d4SMing Lei } else if ((q->nr_hw_queues > 1 && is_sync) || 25500f38d766SChristoph Hellwig !rq->mq_hctx->dispatch_busy) { 2551105663f7SAndré Almeida /* 2552105663f7SAndré Almeida * There is no scheduler and we can try to send directly 2553105663f7SAndré Almeida * to the hardware. 2554105663f7SAndré Almeida */ 25553e08773cSChristoph Hellwig blk_mq_try_issue_directly(rq->mq_hctx, rq); 2556ab42f35dSMing Lei } else { 2557105663f7SAndré Almeida /* Default case. 
*/ 25588fa9f556Shuhai blk_mq_sched_insert_request(rq, false, true, true); 2559ab42f35dSMing Lei } 2560b00c53e8SJens Axboe 25613e08773cSChristoph Hellwig return; 2562ac7c5675SChristoph Hellwig queue_exit: 2563ac7c5675SChristoph Hellwig blk_queue_exit(q); 2564320ae51fSJens Axboe } 2565320ae51fSJens Axboe 2566bd63141dSMing Lei static size_t order_to_size(unsigned int order) 2567bd63141dSMing Lei { 2568bd63141dSMing Lei return (size_t)PAGE_SIZE << order; 2569bd63141dSMing Lei } 2570bd63141dSMing Lei 2571bd63141dSMing Lei /* called before freeing request pool in @tags */ 2572f32e4eafSJohn Garry static void blk_mq_clear_rq_mapping(struct blk_mq_tags *drv_tags, 2573f32e4eafSJohn Garry struct blk_mq_tags *tags) 2574bd63141dSMing Lei { 2575bd63141dSMing Lei struct page *page; 2576bd63141dSMing Lei unsigned long flags; 2577bd63141dSMing Lei 25784f245d5bSJohn Garry /* There is no need to clear a driver tags own mapping */ 25794f245d5bSJohn Garry if (drv_tags == tags) 25804f245d5bSJohn Garry return; 25814f245d5bSJohn Garry 2582bd63141dSMing Lei list_for_each_entry(page, &tags->page_list, lru) { 2583bd63141dSMing Lei unsigned long start = (unsigned long)page_address(page); 2584bd63141dSMing Lei unsigned long end = start + order_to_size(page->private); 2585bd63141dSMing Lei int i; 2586bd63141dSMing Lei 2587f32e4eafSJohn Garry for (i = 0; i < drv_tags->nr_tags; i++) { 2588bd63141dSMing Lei struct request *rq = drv_tags->rqs[i]; 2589bd63141dSMing Lei unsigned long rq_addr = (unsigned long)rq; 2590bd63141dSMing Lei 2591bd63141dSMing Lei if (rq_addr >= start && rq_addr < end) { 2592bd63141dSMing Lei WARN_ON_ONCE(refcount_read(&rq->ref) != 0); 2593bd63141dSMing Lei cmpxchg(&drv_tags->rqs[i], rq, NULL); 2594bd63141dSMing Lei } 2595bd63141dSMing Lei } 2596bd63141dSMing Lei } 2597bd63141dSMing Lei 2598bd63141dSMing Lei /* 2599bd63141dSMing Lei * Wait until all pending iteration is done. 2600bd63141dSMing Lei * 2601bd63141dSMing Lei * Request reference is cleared and it is guaranteed to be observed 2602bd63141dSMing Lei * after the ->lock is released. 
2603bd63141dSMing Lei */ 2604bd63141dSMing Lei spin_lock_irqsave(&drv_tags->lock, flags); 2605bd63141dSMing Lei spin_unlock_irqrestore(&drv_tags->lock, flags); 2606bd63141dSMing Lei } 2607bd63141dSMing Lei 2608cc71a6f4SJens Axboe void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 26092c3ad667SJens Axboe unsigned int hctx_idx) 2610320ae51fSJens Axboe { 2611f32e4eafSJohn Garry struct blk_mq_tags *drv_tags; 2612320ae51fSJens Axboe struct page *page; 2613320ae51fSJens Axboe 2614079a2e3eSJohn Garry if (blk_mq_is_shared_tags(set->flags)) 2615079a2e3eSJohn Garry drv_tags = set->shared_tags; 2616e155b0c2SJohn Garry else 2617f32e4eafSJohn Garry drv_tags = set->tags[hctx_idx]; 2618f32e4eafSJohn Garry 261965de57bbSJohn Garry if (tags->static_rqs && set->ops->exit_request) { 2620e9b267d9SChristoph Hellwig int i; 2621e9b267d9SChristoph Hellwig 262224d2f903SChristoph Hellwig for (i = 0; i < tags->nr_tags; i++) { 26232af8cbe3SJens Axboe struct request *rq = tags->static_rqs[i]; 26242af8cbe3SJens Axboe 26252af8cbe3SJens Axboe if (!rq) 2626e9b267d9SChristoph Hellwig continue; 2627d6296d39SChristoph Hellwig set->ops->exit_request(set, rq, hctx_idx); 26282af8cbe3SJens Axboe tags->static_rqs[i] = NULL; 2629e9b267d9SChristoph Hellwig } 2630e9b267d9SChristoph Hellwig } 2631e9b267d9SChristoph Hellwig 2632f32e4eafSJohn Garry blk_mq_clear_rq_mapping(drv_tags, tags); 2633bd63141dSMing Lei 263424d2f903SChristoph Hellwig while (!list_empty(&tags->page_list)) { 263524d2f903SChristoph Hellwig page = list_first_entry(&tags->page_list, struct page, lru); 26366753471cSDave Hansen list_del_init(&page->lru); 2637f75782e4SCatalin Marinas /* 2638f75782e4SCatalin Marinas * Remove kmemleak object previously allocated in 2639273938bfSRaul E Rangel * blk_mq_alloc_rqs(). 
2640f75782e4SCatalin Marinas */ 2641f75782e4SCatalin Marinas kmemleak_free(page_address(page)); 2642320ae51fSJens Axboe __free_pages(page, page->private); 2643320ae51fSJens Axboe } 2644cc71a6f4SJens Axboe } 2645320ae51fSJens Axboe 2646e155b0c2SJohn Garry void blk_mq_free_rq_map(struct blk_mq_tags *tags) 2647cc71a6f4SJens Axboe { 264824d2f903SChristoph Hellwig kfree(tags->rqs); 2649cc71a6f4SJens Axboe tags->rqs = NULL; 26502af8cbe3SJens Axboe kfree(tags->static_rqs); 26512af8cbe3SJens Axboe tags->static_rqs = NULL; 2652320ae51fSJens Axboe 2653e155b0c2SJohn Garry blk_mq_free_tags(tags); 2654320ae51fSJens Axboe } 2655320ae51fSJens Axboe 265663064be1SJohn Garry static struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, 2657cc71a6f4SJens Axboe unsigned int hctx_idx, 2658cc71a6f4SJens Axboe unsigned int nr_tags, 2659e155b0c2SJohn Garry unsigned int reserved_tags) 2660320ae51fSJens Axboe { 266124d2f903SChristoph Hellwig struct blk_mq_tags *tags; 266259f082e4SShaohua Li int node; 2663320ae51fSJens Axboe 26647d76f856SDongli Zhang node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); 266559f082e4SShaohua Li if (node == NUMA_NO_NODE) 266659f082e4SShaohua Li node = set->numa_node; 266759f082e4SShaohua Li 2668e155b0c2SJohn Garry tags = blk_mq_init_tags(nr_tags, reserved_tags, node, 2669e155b0c2SJohn Garry BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); 267024d2f903SChristoph Hellwig if (!tags) 267124d2f903SChristoph Hellwig return NULL; 2672320ae51fSJens Axboe 2673590b5b7dSKees Cook tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), 267436e1f3d1SGabriel Krisman Bertazi GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 267559f082e4SShaohua Li node); 267624d2f903SChristoph Hellwig if (!tags->rqs) { 2677e155b0c2SJohn Garry blk_mq_free_tags(tags); 267824d2f903SChristoph Hellwig return NULL; 267924d2f903SChristoph Hellwig } 2680320ae51fSJens Axboe 2681590b5b7dSKees Cook tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), 26822af8cbe3SJens Axboe GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 268359f082e4SShaohua Li node); 26842af8cbe3SJens Axboe if (!tags->static_rqs) { 26852af8cbe3SJens Axboe kfree(tags->rqs); 2686e155b0c2SJohn Garry blk_mq_free_tags(tags); 26872af8cbe3SJens Axboe return NULL; 26882af8cbe3SJens Axboe } 26892af8cbe3SJens Axboe 2690cc71a6f4SJens Axboe return tags; 2691cc71a6f4SJens Axboe } 2692cc71a6f4SJens Axboe 26931d9bd516STejun Heo static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, 26941d9bd516STejun Heo unsigned int hctx_idx, int node) 26951d9bd516STejun Heo { 26961d9bd516STejun Heo int ret; 26971d9bd516STejun Heo 26981d9bd516STejun Heo if (set->ops->init_request) { 26991d9bd516STejun Heo ret = set->ops->init_request(set, rq, hctx_idx, node); 27001d9bd516STejun Heo if (ret) 27011d9bd516STejun Heo return ret; 27021d9bd516STejun Heo } 27031d9bd516STejun Heo 270412f5b931SKeith Busch WRITE_ONCE(rq->state, MQ_RQ_IDLE); 27051d9bd516STejun Heo return 0; 27061d9bd516STejun Heo } 27071d9bd516STejun Heo 270863064be1SJohn Garry static int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, 270963064be1SJohn Garry struct blk_mq_tags *tags, 2710cc71a6f4SJens Axboe unsigned int hctx_idx, unsigned int depth) 2711cc71a6f4SJens Axboe { 2712cc71a6f4SJens Axboe unsigned int i, j, entries_per_page, max_order = 4; 2713cc71a6f4SJens Axboe size_t rq_size, left; 271459f082e4SShaohua Li int node; 271559f082e4SShaohua Li 27167d76f856SDongli Zhang node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); 271759f082e4SShaohua Li if (node == 
NUMA_NO_NODE) 271859f082e4SShaohua Li node = set->numa_node; 2719cc71a6f4SJens Axboe 2720cc71a6f4SJens Axboe INIT_LIST_HEAD(&tags->page_list); 2721cc71a6f4SJens Axboe 2722320ae51fSJens Axboe /* 2723320ae51fSJens Axboe * rq_size is the size of the request plus driver payload, rounded 2724320ae51fSJens Axboe * to the cacheline size 2725320ae51fSJens Axboe */ 272624d2f903SChristoph Hellwig rq_size = round_up(sizeof(struct request) + set->cmd_size, 2727320ae51fSJens Axboe cache_line_size()); 2728cc71a6f4SJens Axboe left = rq_size * depth; 2729320ae51fSJens Axboe 2730cc71a6f4SJens Axboe for (i = 0; i < depth; ) { 2731320ae51fSJens Axboe int this_order = max_order; 2732320ae51fSJens Axboe struct page *page; 2733320ae51fSJens Axboe int to_do; 2734320ae51fSJens Axboe void *p; 2735320ae51fSJens Axboe 2736b3a834b1SBartlomiej Zolnierkiewicz while (this_order && left < order_to_size(this_order - 1)) 2737320ae51fSJens Axboe this_order--; 2738320ae51fSJens Axboe 2739320ae51fSJens Axboe do { 274059f082e4SShaohua Li page = alloc_pages_node(node, 274136e1f3d1SGabriel Krisman Bertazi GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 274224d2f903SChristoph Hellwig this_order); 2743320ae51fSJens Axboe if (page) 2744320ae51fSJens Axboe break; 2745320ae51fSJens Axboe if (!this_order--) 2746320ae51fSJens Axboe break; 2747320ae51fSJens Axboe if (order_to_size(this_order) < rq_size) 2748320ae51fSJens Axboe break; 2749320ae51fSJens Axboe } while (1); 2750320ae51fSJens Axboe 2751320ae51fSJens Axboe if (!page) 275224d2f903SChristoph Hellwig goto fail; 2753320ae51fSJens Axboe 2754320ae51fSJens Axboe page->private = this_order; 275524d2f903SChristoph Hellwig list_add_tail(&page->lru, &tags->page_list); 2756320ae51fSJens Axboe 2757320ae51fSJens Axboe p = page_address(page); 2758f75782e4SCatalin Marinas /* 2759f75782e4SCatalin Marinas * Allow kmemleak to scan these pages as they contain pointers 2760f75782e4SCatalin Marinas * to additional allocations like via ops->init_request(). 
2761f75782e4SCatalin Marinas */ 276236e1f3d1SGabriel Krisman Bertazi kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); 2763320ae51fSJens Axboe entries_per_page = order_to_size(this_order) / rq_size; 2764cc71a6f4SJens Axboe to_do = min(entries_per_page, depth - i); 2765320ae51fSJens Axboe left -= to_do * rq_size; 2766320ae51fSJens Axboe for (j = 0; j < to_do; j++) { 27672af8cbe3SJens Axboe struct request *rq = p; 27682af8cbe3SJens Axboe 27692af8cbe3SJens Axboe tags->static_rqs[i] = rq; 27701d9bd516STejun Heo if (blk_mq_init_request(set, rq, hctx_idx, node)) { 27712af8cbe3SJens Axboe tags->static_rqs[i] = NULL; 277224d2f903SChristoph Hellwig goto fail; 2773e9b267d9SChristoph Hellwig } 2774e9b267d9SChristoph Hellwig 2775320ae51fSJens Axboe p += rq_size; 2776320ae51fSJens Axboe i++; 2777320ae51fSJens Axboe } 2778320ae51fSJens Axboe } 2779cc71a6f4SJens Axboe return 0; 2780320ae51fSJens Axboe 278124d2f903SChristoph Hellwig fail: 2782cc71a6f4SJens Axboe blk_mq_free_rqs(set, tags, hctx_idx); 2783cc71a6f4SJens Axboe return -ENOMEM; 2784320ae51fSJens Axboe } 2785320ae51fSJens Axboe 2786bf0beec0SMing Lei struct rq_iter_data { 2787bf0beec0SMing Lei struct blk_mq_hw_ctx *hctx; 2788bf0beec0SMing Lei bool has_rq; 2789bf0beec0SMing Lei }; 2790bf0beec0SMing Lei 2791bf0beec0SMing Lei static bool blk_mq_has_request(struct request *rq, void *data, bool reserved) 2792bf0beec0SMing Lei { 2793bf0beec0SMing Lei struct rq_iter_data *iter_data = data; 2794bf0beec0SMing Lei 2795bf0beec0SMing Lei if (rq->mq_hctx != iter_data->hctx) 2796bf0beec0SMing Lei return true; 2797bf0beec0SMing Lei iter_data->has_rq = true; 2798bf0beec0SMing Lei return false; 2799bf0beec0SMing Lei } 2800bf0beec0SMing Lei 2801bf0beec0SMing Lei static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) 2802bf0beec0SMing Lei { 2803bf0beec0SMing Lei struct blk_mq_tags *tags = hctx->sched_tags ? 2804bf0beec0SMing Lei hctx->sched_tags : hctx->tags; 2805bf0beec0SMing Lei struct rq_iter_data data = { 2806bf0beec0SMing Lei .hctx = hctx, 2807bf0beec0SMing Lei }; 2808bf0beec0SMing Lei 2809bf0beec0SMing Lei blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); 2810bf0beec0SMing Lei return data.has_rq; 2811bf0beec0SMing Lei } 2812bf0beec0SMing Lei 2813bf0beec0SMing Lei static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, 2814bf0beec0SMing Lei struct blk_mq_hw_ctx *hctx) 2815bf0beec0SMing Lei { 2816bf0beec0SMing Lei if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) 2817bf0beec0SMing Lei return false; 2818bf0beec0SMing Lei if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) 2819bf0beec0SMing Lei return false; 2820bf0beec0SMing Lei return true; 2821bf0beec0SMing Lei } 2822bf0beec0SMing Lei 2823bf0beec0SMing Lei static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) 2824bf0beec0SMing Lei { 2825bf0beec0SMing Lei struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, 2826bf0beec0SMing Lei struct blk_mq_hw_ctx, cpuhp_online); 2827bf0beec0SMing Lei 2828bf0beec0SMing Lei if (!cpumask_test_cpu(cpu, hctx->cpumask) || 2829bf0beec0SMing Lei !blk_mq_last_cpu_in_hctx(cpu, hctx)) 2830bf0beec0SMing Lei return 0; 2831bf0beec0SMing Lei 2832bf0beec0SMing Lei /* 2833bf0beec0SMing Lei * Prevent new request from being allocated on the current hctx. 2834bf0beec0SMing Lei * 2835bf0beec0SMing Lei * The smp_mb__after_atomic() Pairs with the implied barrier in 2836bf0beec0SMing Lei * test_and_set_bit_lock in sbitmap_get(). 
Ensures the inactive flag is 2837bf0beec0SMing Lei * seen once we return from the tag allocator. 2838bf0beec0SMing Lei */ 2839bf0beec0SMing Lei set_bit(BLK_MQ_S_INACTIVE, &hctx->state); 2840bf0beec0SMing Lei smp_mb__after_atomic(); 2841bf0beec0SMing Lei 2842bf0beec0SMing Lei /* 2843bf0beec0SMing Lei * Try to grab a reference to the queue and wait for any outstanding 2844bf0beec0SMing Lei * requests. If we could not grab a reference the queue has been 2845bf0beec0SMing Lei * frozen and there are no requests. 2846bf0beec0SMing Lei */ 2847bf0beec0SMing Lei if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { 2848bf0beec0SMing Lei while (blk_mq_hctx_has_requests(hctx)) 2849bf0beec0SMing Lei msleep(5); 2850bf0beec0SMing Lei percpu_ref_put(&hctx->queue->q_usage_counter); 2851bf0beec0SMing Lei } 2852bf0beec0SMing Lei 2853bf0beec0SMing Lei return 0; 2854bf0beec0SMing Lei } 2855bf0beec0SMing Lei 2856bf0beec0SMing Lei static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) 2857bf0beec0SMing Lei { 2858bf0beec0SMing Lei struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, 2859bf0beec0SMing Lei struct blk_mq_hw_ctx, cpuhp_online); 2860bf0beec0SMing Lei 2861bf0beec0SMing Lei if (cpumask_test_cpu(cpu, hctx->cpumask)) 2862bf0beec0SMing Lei clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); 2863bf0beec0SMing Lei return 0; 2864bf0beec0SMing Lei } 2865bf0beec0SMing Lei 2866e57690feSJens Axboe /* 2867e57690feSJens Axboe * 'cpu' is going away. splice any existing rq_list entries from this 2868e57690feSJens Axboe * software queue to the hw queue dispatch list, and ensure that it 2869e57690feSJens Axboe * gets run. 2870e57690feSJens Axboe */ 28719467f859SThomas Gleixner static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) 2872484b4061SJens Axboe { 28739467f859SThomas Gleixner struct blk_mq_hw_ctx *hctx; 2874484b4061SJens Axboe struct blk_mq_ctx *ctx; 2875484b4061SJens Axboe LIST_HEAD(tmp); 2876c16d6b5aSMing Lei enum hctx_type type; 2877484b4061SJens Axboe 28789467f859SThomas Gleixner hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 2879bf0beec0SMing Lei if (!cpumask_test_cpu(cpu, hctx->cpumask)) 2880bf0beec0SMing Lei return 0; 2881bf0beec0SMing Lei 2882e57690feSJens Axboe ctx = __blk_mq_get_ctx(hctx->queue, cpu); 2883c16d6b5aSMing Lei type = hctx->type; 2884484b4061SJens Axboe 2885484b4061SJens Axboe spin_lock(&ctx->lock); 2886c16d6b5aSMing Lei if (!list_empty(&ctx->rq_lists[type])) { 2887c16d6b5aSMing Lei list_splice_init(&ctx->rq_lists[type], &tmp); 2888484b4061SJens Axboe blk_mq_hctx_clear_pending(hctx, ctx); 2889484b4061SJens Axboe } 2890484b4061SJens Axboe spin_unlock(&ctx->lock); 2891484b4061SJens Axboe 2892484b4061SJens Axboe if (list_empty(&tmp)) 28939467f859SThomas Gleixner return 0; 2894484b4061SJens Axboe 2895e57690feSJens Axboe spin_lock(&hctx->lock); 2896e57690feSJens Axboe list_splice_tail_init(&tmp, &hctx->dispatch); 2897e57690feSJens Axboe spin_unlock(&hctx->lock); 2898484b4061SJens Axboe 2899484b4061SJens Axboe blk_mq_run_hw_queue(hctx, true); 29009467f859SThomas Gleixner return 0; 2901484b4061SJens Axboe } 2902484b4061SJens Axboe 29039467f859SThomas Gleixner static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 2904484b4061SJens Axboe { 2905bf0beec0SMing Lei if (!(hctx->flags & BLK_MQ_F_STACKING)) 2906bf0beec0SMing Lei cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, 2907bf0beec0SMing Lei &hctx->cpuhp_online); 29089467f859SThomas Gleixner cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, 29099467f859SThomas 
Gleixner &hctx->cpuhp_dead); 2910484b4061SJens Axboe } 2911484b4061SJens Axboe 2912364b6181SMing Lei /* 2913364b6181SMing Lei * Before freeing hw queue, clearing the flush request reference in 2914364b6181SMing Lei * tags->rqs[] for avoiding potential UAF. 2915364b6181SMing Lei */ 2916364b6181SMing Lei static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, 2917364b6181SMing Lei unsigned int queue_depth, struct request *flush_rq) 2918364b6181SMing Lei { 2919364b6181SMing Lei int i; 2920364b6181SMing Lei unsigned long flags; 2921364b6181SMing Lei 2922364b6181SMing Lei /* The hw queue may not be mapped yet */ 2923364b6181SMing Lei if (!tags) 2924364b6181SMing Lei return; 2925364b6181SMing Lei 2926364b6181SMing Lei WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); 2927364b6181SMing Lei 2928364b6181SMing Lei for (i = 0; i < queue_depth; i++) 2929364b6181SMing Lei cmpxchg(&tags->rqs[i], flush_rq, NULL); 2930364b6181SMing Lei 2931364b6181SMing Lei /* 2932364b6181SMing Lei * Wait until all pending iteration is done. 2933364b6181SMing Lei * 2934364b6181SMing Lei * Request reference is cleared and it is guaranteed to be observed 2935364b6181SMing Lei * after the ->lock is released. 2936364b6181SMing Lei */ 2937364b6181SMing Lei spin_lock_irqsave(&tags->lock, flags); 2938364b6181SMing Lei spin_unlock_irqrestore(&tags->lock, flags); 2939364b6181SMing Lei } 2940364b6181SMing Lei 2941c3b4afcaSMing Lei /* hctx->ctxs will be freed in queue's release handler */ 294208e98fc6SMing Lei static void blk_mq_exit_hctx(struct request_queue *q, 294308e98fc6SMing Lei struct blk_mq_tag_set *set, 294408e98fc6SMing Lei struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 294508e98fc6SMing Lei { 2946364b6181SMing Lei struct request *flush_rq = hctx->fq->flush_rq; 2947364b6181SMing Lei 29488ab0b7dcSMing Lei if (blk_mq_hw_queue_mapped(hctx)) 294908e98fc6SMing Lei blk_mq_tag_idle(hctx); 295008e98fc6SMing Lei 2951364b6181SMing Lei blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], 2952364b6181SMing Lei set->queue_depth, flush_rq); 2953f70ced09SMing Lei if (set->ops->exit_request) 2954364b6181SMing Lei set->ops->exit_request(set, flush_rq, hctx_idx); 2955f70ced09SMing Lei 295608e98fc6SMing Lei if (set->ops->exit_hctx) 295708e98fc6SMing Lei set->ops->exit_hctx(hctx, hctx_idx); 295808e98fc6SMing Lei 29599467f859SThomas Gleixner blk_mq_remove_cpuhp(hctx); 29602f8f1336SMing Lei 29612f8f1336SMing Lei spin_lock(&q->unused_hctx_lock); 29622f8f1336SMing Lei list_add(&hctx->hctx_list, &q->unused_hctx_list); 29632f8f1336SMing Lei spin_unlock(&q->unused_hctx_lock); 296408e98fc6SMing Lei } 296508e98fc6SMing Lei 2966624dbe47SMing Lei static void blk_mq_exit_hw_queues(struct request_queue *q, 2967624dbe47SMing Lei struct blk_mq_tag_set *set, int nr_queue) 2968624dbe47SMing Lei { 2969624dbe47SMing Lei struct blk_mq_hw_ctx *hctx; 2970624dbe47SMing Lei unsigned int i; 2971624dbe47SMing Lei 2972624dbe47SMing Lei queue_for_each_hw_ctx(q, hctx, i) { 2973624dbe47SMing Lei if (i == nr_queue) 2974624dbe47SMing Lei break; 2975477e19deSJianchao Wang blk_mq_debugfs_unregister_hctx(hctx); 297608e98fc6SMing Lei blk_mq_exit_hctx(q, set, hctx, i); 2977624dbe47SMing Lei } 2978624dbe47SMing Lei } 2979624dbe47SMing Lei 29807c6c5b7cSMing Lei static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) 29817c6c5b7cSMing Lei { 29827c6c5b7cSMing Lei int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); 29837c6c5b7cSMing Lei 29847c6c5b7cSMing Lei BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), 29857c6c5b7cSMing Lei __alignof__(struct 
blk_mq_hw_ctx)) != 29867c6c5b7cSMing Lei sizeof(struct blk_mq_hw_ctx)); 29877c6c5b7cSMing Lei 29887c6c5b7cSMing Lei if (tag_set->flags & BLK_MQ_F_BLOCKING) 29897c6c5b7cSMing Lei hw_ctx_size += sizeof(struct srcu_struct); 29907c6c5b7cSMing Lei 29917c6c5b7cSMing Lei return hw_ctx_size; 29927c6c5b7cSMing Lei } 29937c6c5b7cSMing Lei 299408e98fc6SMing Lei static int blk_mq_init_hctx(struct request_queue *q, 299508e98fc6SMing Lei struct blk_mq_tag_set *set, 299608e98fc6SMing Lei struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 2997320ae51fSJens Axboe { 29987c6c5b7cSMing Lei hctx->queue_num = hctx_idx; 2999320ae51fSJens Axboe 3000bf0beec0SMing Lei if (!(hctx->flags & BLK_MQ_F_STACKING)) 3001bf0beec0SMing Lei cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, 3002bf0beec0SMing Lei &hctx->cpuhp_online); 30037c6c5b7cSMing Lei cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); 30047c6c5b7cSMing Lei 30057c6c5b7cSMing Lei hctx->tags = set->tags[hctx_idx]; 30067c6c5b7cSMing Lei 30077c6c5b7cSMing Lei if (set->ops->init_hctx && 30087c6c5b7cSMing Lei set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 30097c6c5b7cSMing Lei goto unregister_cpu_notifier; 30107c6c5b7cSMing Lei 30117c6c5b7cSMing Lei if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, 30127c6c5b7cSMing Lei hctx->numa_node)) 30137c6c5b7cSMing Lei goto exit_hctx; 30147c6c5b7cSMing Lei return 0; 30157c6c5b7cSMing Lei 30167c6c5b7cSMing Lei exit_hctx: 30177c6c5b7cSMing Lei if (set->ops->exit_hctx) 30187c6c5b7cSMing Lei set->ops->exit_hctx(hctx, hctx_idx); 30197c6c5b7cSMing Lei unregister_cpu_notifier: 30207c6c5b7cSMing Lei blk_mq_remove_cpuhp(hctx); 30217c6c5b7cSMing Lei return -1; 30227c6c5b7cSMing Lei } 30237c6c5b7cSMing Lei 30247c6c5b7cSMing Lei static struct blk_mq_hw_ctx * 30257c6c5b7cSMing Lei blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, 30267c6c5b7cSMing Lei int node) 30277c6c5b7cSMing Lei { 30287c6c5b7cSMing Lei struct blk_mq_hw_ctx *hctx; 30297c6c5b7cSMing Lei gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; 30307c6c5b7cSMing Lei 30317c6c5b7cSMing Lei hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node); 30327c6c5b7cSMing Lei if (!hctx) 30337c6c5b7cSMing Lei goto fail_alloc_hctx; 30347c6c5b7cSMing Lei 30357c6c5b7cSMing Lei if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) 30367c6c5b7cSMing Lei goto free_hctx; 30377c6c5b7cSMing Lei 30387c6c5b7cSMing Lei atomic_set(&hctx->nr_active, 0); 3039320ae51fSJens Axboe if (node == NUMA_NO_NODE) 30407c6c5b7cSMing Lei node = set->numa_node; 30417c6c5b7cSMing Lei hctx->numa_node = node; 3042320ae51fSJens Axboe 30439f993737SJens Axboe INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 3044320ae51fSJens Axboe spin_lock_init(&hctx->lock); 3045320ae51fSJens Axboe INIT_LIST_HEAD(&hctx->dispatch); 3046320ae51fSJens Axboe hctx->queue = q; 304751db1c37SMing Lei hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; 3048320ae51fSJens Axboe 30492f8f1336SMing Lei INIT_LIST_HEAD(&hctx->hctx_list); 30502f8f1336SMing Lei 3051320ae51fSJens Axboe /* 3052a68aafa5SJens Axboe * Allocate space for all possible cpus to avoid allocation at 3053320ae51fSJens Axboe * runtime 3054320ae51fSJens Axboe */ 3055d904bfa7SJohannes Thumshirn hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), 30567c6c5b7cSMing Lei gfp, node); 3057320ae51fSJens Axboe if (!hctx->ctxs) 30587c6c5b7cSMing Lei goto free_cpumask; 3059320ae51fSJens Axboe 30605b202853SJianchao Wang if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), 3061c548e62bSMing Lei gfp, node, 
false, false)) 306208e98fc6SMing Lei goto free_ctxs; 3063320ae51fSJens Axboe hctx->nr_ctx = 0; 3064320ae51fSJens Axboe 30655815839bSMing Lei spin_lock_init(&hctx->dispatch_wait_lock); 3066eb619fdbSJens Axboe init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); 3067eb619fdbSJens Axboe INIT_LIST_HEAD(&hctx->dispatch_wait.entry); 3068eb619fdbSJens Axboe 3069754a1572SGuoqing Jiang hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); 3070f70ced09SMing Lei if (!hctx->fq) 30717c6c5b7cSMing Lei goto free_bitmap; 3072f70ced09SMing Lei 30736a83e74dSBart Van Assche if (hctx->flags & BLK_MQ_F_BLOCKING) 307405707b64STejun Heo init_srcu_struct(hctx->srcu); 30757c6c5b7cSMing Lei blk_mq_hctx_kobj_init(hctx); 30766a83e74dSBart Van Assche 30777c6c5b7cSMing Lei return hctx; 307808e98fc6SMing Lei 307908e98fc6SMing Lei free_bitmap: 308088459642SOmar Sandoval sbitmap_free(&hctx->ctx_map); 308108e98fc6SMing Lei free_ctxs: 308208e98fc6SMing Lei kfree(hctx->ctxs); 30837c6c5b7cSMing Lei free_cpumask: 30847c6c5b7cSMing Lei free_cpumask_var(hctx->cpumask); 30857c6c5b7cSMing Lei free_hctx: 30867c6c5b7cSMing Lei kfree(hctx); 30877c6c5b7cSMing Lei fail_alloc_hctx: 30887c6c5b7cSMing Lei return NULL; 308908e98fc6SMing Lei } 309008e98fc6SMing Lei 3091320ae51fSJens Axboe static void blk_mq_init_cpu_queues(struct request_queue *q, 3092320ae51fSJens Axboe unsigned int nr_hw_queues) 3093320ae51fSJens Axboe { 3094b3c661b1SJens Axboe struct blk_mq_tag_set *set = q->tag_set; 3095b3c661b1SJens Axboe unsigned int i, j; 3096320ae51fSJens Axboe 3097320ae51fSJens Axboe for_each_possible_cpu(i) { 3098320ae51fSJens Axboe struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 3099320ae51fSJens Axboe struct blk_mq_hw_ctx *hctx; 3100c16d6b5aSMing Lei int k; 3101320ae51fSJens Axboe 3102320ae51fSJens Axboe __ctx->cpu = i; 3103320ae51fSJens Axboe spin_lock_init(&__ctx->lock); 3104c16d6b5aSMing Lei for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) 3105c16d6b5aSMing Lei INIT_LIST_HEAD(&__ctx->rq_lists[k]); 3106c16d6b5aSMing Lei 3107320ae51fSJens Axboe __ctx->queue = q; 3108320ae51fSJens Axboe 3109320ae51fSJens Axboe /* 3110320ae51fSJens Axboe * Set local node, IFF we have more than one hw queue. 
If 3111320ae51fSJens Axboe * not, we remain on the home node of the device 3112320ae51fSJens Axboe */ 3113b3c661b1SJens Axboe for (j = 0; j < set->nr_maps; j++) { 3114b3c661b1SJens Axboe hctx = blk_mq_map_queue_type(q, j, i); 3115320ae51fSJens Axboe if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 3116576e85c5SXianting Tian hctx->numa_node = cpu_to_node(i); 3117320ae51fSJens Axboe } 3118320ae51fSJens Axboe } 3119b3c661b1SJens Axboe } 3120320ae51fSJens Axboe 312163064be1SJohn Garry struct blk_mq_tags *blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, 312263064be1SJohn Garry unsigned int hctx_idx, 312363064be1SJohn Garry unsigned int depth) 312463064be1SJohn Garry { 312563064be1SJohn Garry struct blk_mq_tags *tags; 312663064be1SJohn Garry int ret; 312763064be1SJohn Garry 3128e155b0c2SJohn Garry tags = blk_mq_alloc_rq_map(set, hctx_idx, depth, set->reserved_tags); 312963064be1SJohn Garry if (!tags) 313063064be1SJohn Garry return NULL; 313163064be1SJohn Garry 313263064be1SJohn Garry ret = blk_mq_alloc_rqs(set, tags, hctx_idx, depth); 313363064be1SJohn Garry if (ret) { 3134e155b0c2SJohn Garry blk_mq_free_rq_map(tags); 313563064be1SJohn Garry return NULL; 313663064be1SJohn Garry } 313763064be1SJohn Garry 313863064be1SJohn Garry return tags; 313963064be1SJohn Garry } 314063064be1SJohn Garry 314163064be1SJohn Garry static bool __blk_mq_alloc_map_and_rqs(struct blk_mq_tag_set *set, 314203b63b02SWeiping Zhang int hctx_idx) 3143cc71a6f4SJens Axboe { 3144079a2e3eSJohn Garry if (blk_mq_is_shared_tags(set->flags)) { 3145079a2e3eSJohn Garry set->tags[hctx_idx] = set->shared_tags; 3146e155b0c2SJohn Garry 3147e155b0c2SJohn Garry return true; 3148e155b0c2SJohn Garry } 3149e155b0c2SJohn Garry 315063064be1SJohn Garry set->tags[hctx_idx] = blk_mq_alloc_map_and_rqs(set, hctx_idx, 3151cc71a6f4SJens Axboe set->queue_depth); 3152cc71a6f4SJens Axboe 315363064be1SJohn Garry return set->tags[hctx_idx]; 3154cc71a6f4SJens Axboe } 3155cc71a6f4SJens Axboe 3156645db34eSJohn Garry void blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, 3157645db34eSJohn Garry struct blk_mq_tags *tags, 3158cc71a6f4SJens Axboe unsigned int hctx_idx) 3159cc71a6f4SJens Axboe { 3160645db34eSJohn Garry if (tags) { 3161645db34eSJohn Garry blk_mq_free_rqs(set, tags, hctx_idx); 3162e155b0c2SJohn Garry blk_mq_free_rq_map(tags); 3163cc71a6f4SJens Axboe } 3164bd166ef1SJens Axboe } 3165cc71a6f4SJens Axboe 3166e155b0c2SJohn Garry static void __blk_mq_free_map_and_rqs(struct blk_mq_tag_set *set, 3167e155b0c2SJohn Garry unsigned int hctx_idx) 3168e155b0c2SJohn Garry { 3169079a2e3eSJohn Garry if (!blk_mq_is_shared_tags(set->flags)) 3170e155b0c2SJohn Garry blk_mq_free_map_and_rqs(set, set->tags[hctx_idx], hctx_idx); 3171e155b0c2SJohn Garry 3172e155b0c2SJohn Garry set->tags[hctx_idx] = NULL; 3173e155b0c2SJohn Garry } 3174e155b0c2SJohn Garry 31754b855ad3SChristoph Hellwig static void blk_mq_map_swqueue(struct request_queue *q) 3176320ae51fSJens Axboe { 3177b3c661b1SJens Axboe unsigned int i, j, hctx_idx; 3178320ae51fSJens Axboe struct blk_mq_hw_ctx *hctx; 3179320ae51fSJens Axboe struct blk_mq_ctx *ctx; 31802a34c087SMing Lei struct blk_mq_tag_set *set = q->tag_set; 3181320ae51fSJens Axboe 3182320ae51fSJens Axboe queue_for_each_hw_ctx(q, hctx, i) { 3183e4043dcfSJens Axboe cpumask_clear(hctx->cpumask); 3184320ae51fSJens Axboe hctx->nr_ctx = 0; 3185d416c92cShuhai hctx->dispatch_from = NULL; 3186320ae51fSJens Axboe } 3187320ae51fSJens Axboe 3188320ae51fSJens Axboe /* 31894b855ad3SChristoph Hellwig * Map software to hardware queues. 
31904412efecSMing Lei * 31914412efecSMing Lei * If the cpu isn't present, the cpu is mapped to first hctx. 3192320ae51fSJens Axboe */ 319320e4d813SChristoph Hellwig for_each_possible_cpu(i) { 3194fd689871SMing Lei 3195fd689871SMing Lei ctx = per_cpu_ptr(q->queue_ctx, i); 3196fd689871SMing Lei for (j = 0; j < set->nr_maps; j++) { 3197fd689871SMing Lei if (!set->map[j].nr_queues) { 3198fd689871SMing Lei ctx->hctxs[j] = blk_mq_map_queue_type(q, 3199fd689871SMing Lei HCTX_TYPE_DEFAULT, i); 3200fd689871SMing Lei continue; 3201fd689871SMing Lei } 3202fd689871SMing Lei hctx_idx = set->map[j].mq_map[i]; 32034412efecSMing Lei /* unmapped hw queue can be remapped after CPU topo changed */ 32044412efecSMing Lei if (!set->tags[hctx_idx] && 320563064be1SJohn Garry !__blk_mq_alloc_map_and_rqs(set, hctx_idx)) { 32064412efecSMing Lei /* 32074412efecSMing Lei * If tags initialization fail for some hctx, 32084412efecSMing Lei * that hctx won't be brought online. In this 32094412efecSMing Lei * case, remap the current ctx to hctx[0] which 32104412efecSMing Lei * is guaranteed to always have tags allocated 32114412efecSMing Lei */ 3212fd689871SMing Lei set->map[j].mq_map[i] = 0; 3213bb94aea1SJianchao Wang } 3214e5edd5f2SMing Lei 3215b3c661b1SJens Axboe hctx = blk_mq_map_queue_type(q, j, i); 32168ccdf4a3SJianchao Wang ctx->hctxs[j] = hctx; 3217b3c661b1SJens Axboe /* 3218b3c661b1SJens Axboe * If the CPU is already set in the mask, then we've 3219b3c661b1SJens Axboe * mapped this one already. This can happen if 3220b3c661b1SJens Axboe * devices share queues across queue maps. 3221b3c661b1SJens Axboe */ 3222b3c661b1SJens Axboe if (cpumask_test_cpu(i, hctx->cpumask)) 3223b3c661b1SJens Axboe continue; 3224b3c661b1SJens Axboe 3225e4043dcfSJens Axboe cpumask_set_cpu(i, hctx->cpumask); 3226b3c661b1SJens Axboe hctx->type = j; 3227f31967f0SJens Axboe ctx->index_hw[hctx->type] = hctx->nr_ctx; 3228320ae51fSJens Axboe hctx->ctxs[hctx->nr_ctx++] = ctx; 3229f31967f0SJens Axboe 3230f31967f0SJens Axboe /* 3231f31967f0SJens Axboe * If the nr_ctx type overflows, we have exceeded the 3232f31967f0SJens Axboe * amount of sw queues we can support. 3233f31967f0SJens Axboe */ 3234f31967f0SJens Axboe BUG_ON(!hctx->nr_ctx); 3235320ae51fSJens Axboe } 3236bb94aea1SJianchao Wang 3237bb94aea1SJianchao Wang for (; j < HCTX_MAX_TYPES; j++) 3238bb94aea1SJianchao Wang ctx->hctxs[j] = blk_mq_map_queue_type(q, 3239bb94aea1SJianchao Wang HCTX_TYPE_DEFAULT, i); 3240b3c661b1SJens Axboe } 3241506e931fSJens Axboe 3242506e931fSJens Axboe queue_for_each_hw_ctx(q, hctx, i) { 32434412efecSMing Lei /* 32444412efecSMing Lei * If no software queues are mapped to this hardware queue, 32454412efecSMing Lei * disable it and free the request entries. 32464412efecSMing Lei */ 32474412efecSMing Lei if (!hctx->nr_ctx) { 32484412efecSMing Lei /* Never unmap queue 0. We need it as a 32494412efecSMing Lei * fallback in case of a new remap fails 32504412efecSMing Lei * allocation 32514412efecSMing Lei */ 3252e155b0c2SJohn Garry if (i) 3253e155b0c2SJohn Garry __blk_mq_free_map_and_rqs(set, i); 32544412efecSMing Lei 32554412efecSMing Lei hctx->tags = NULL; 32564412efecSMing Lei continue; 32574412efecSMing Lei } 3258484b4061SJens Axboe 32592a34c087SMing Lei hctx->tags = set->tags[i]; 32602a34c087SMing Lei WARN_ON(!hctx->tags); 32612a34c087SMing Lei 3262484b4061SJens Axboe /* 3263889fa31fSChong Yuan * Set the map size to the number of mapped software queues. 
3264889fa31fSChong Yuan * This is more accurate and more efficient than looping 3265889fa31fSChong Yuan * over all possibly mapped software queues. 3266889fa31fSChong Yuan */ 326788459642SOmar Sandoval sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); 3268889fa31fSChong Yuan 3269889fa31fSChong Yuan /* 3270484b4061SJens Axboe * Initialize batch roundrobin counts 3271484b4061SJens Axboe */ 3272f82ddf19SMing Lei hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); 3273506e931fSJens Axboe hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 3274506e931fSJens Axboe } 3275320ae51fSJens Axboe } 3276320ae51fSJens Axboe 32778e8320c9SJens Axboe /* 32788e8320c9SJens Axboe * Caller needs to ensure that we're either frozen/quiesced, or that 32798e8320c9SJens Axboe * the queue isn't live yet. 32808e8320c9SJens Axboe */ 32812404e607SJeff Moyer static void queue_set_hctx_shared(struct request_queue *q, bool shared) 32820d2602caSJens Axboe { 32830d2602caSJens Axboe struct blk_mq_hw_ctx *hctx; 32840d2602caSJens Axboe int i; 32850d2602caSJens Axboe 32860d2602caSJens Axboe queue_for_each_hw_ctx(q, hctx, i) { 3287454bb677SYu Kuai if (shared) { 328851db1c37SMing Lei hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; 3289454bb677SYu Kuai } else { 3290454bb677SYu Kuai blk_mq_tag_idle(hctx); 329151db1c37SMing Lei hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; 32920d2602caSJens Axboe } 32932404e607SJeff Moyer } 3294454bb677SYu Kuai } 32952404e607SJeff Moyer 3296655ac300SHannes Reinecke static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, 32978e8320c9SJens Axboe bool shared) 32982404e607SJeff Moyer { 32992404e607SJeff Moyer struct request_queue *q; 33002404e607SJeff Moyer 3301705cda97SBart Van Assche lockdep_assert_held(&set->tag_list_lock); 3302705cda97SBart Van Assche 33032404e607SJeff Moyer list_for_each_entry(q, &set->tag_list, tag_set_list) { 33042404e607SJeff Moyer blk_mq_freeze_queue(q); 33052404e607SJeff Moyer queue_set_hctx_shared(q, shared); 33060d2602caSJens Axboe blk_mq_unfreeze_queue(q); 33070d2602caSJens Axboe } 33080d2602caSJens Axboe } 33090d2602caSJens Axboe 33100d2602caSJens Axboe static void blk_mq_del_queue_tag_set(struct request_queue *q) 33110d2602caSJens Axboe { 33120d2602caSJens Axboe struct blk_mq_tag_set *set = q->tag_set; 33130d2602caSJens Axboe 33140d2602caSJens Axboe mutex_lock(&set->tag_list_lock); 331508c875cbSDaniel Wagner list_del(&q->tag_set_list); 33162404e607SJeff Moyer if (list_is_singular(&set->tag_list)) { 33172404e607SJeff Moyer /* just transitioned to unshared */ 331851db1c37SMing Lei set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; 33192404e607SJeff Moyer /* update existing queue */ 3320655ac300SHannes Reinecke blk_mq_update_tag_set_shared(set, false); 33212404e607SJeff Moyer } 33220d2602caSJens Axboe mutex_unlock(&set->tag_list_lock); 3323a347c7adSRoman Pen INIT_LIST_HEAD(&q->tag_set_list); 33240d2602caSJens Axboe } 33250d2602caSJens Axboe 33260d2602caSJens Axboe static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 33270d2602caSJens Axboe struct request_queue *q) 33280d2602caSJens Axboe { 33290d2602caSJens Axboe mutex_lock(&set->tag_list_lock); 33302404e607SJeff Moyer 3331ff821d27SJens Axboe /* 3332ff821d27SJens Axboe * Check to see if we're transitioning to shared (from 1 to 2 queues). 
3333ff821d27SJens Axboe */ 3334ff821d27SJens Axboe if (!list_empty(&set->tag_list) && 333551db1c37SMing Lei !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { 333651db1c37SMing Lei set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; 33372404e607SJeff Moyer /* update existing queue */ 3338655ac300SHannes Reinecke blk_mq_update_tag_set_shared(set, true); 33392404e607SJeff Moyer } 334051db1c37SMing Lei if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) 33412404e607SJeff Moyer queue_set_hctx_shared(q, true); 334208c875cbSDaniel Wagner list_add_tail(&q->tag_set_list, &set->tag_list); 33432404e607SJeff Moyer 33440d2602caSJens Axboe mutex_unlock(&set->tag_list_lock); 33450d2602caSJens Axboe } 33460d2602caSJens Axboe 33471db4909eSMing Lei /* All allocations will be freed in release handler of q->mq_kobj */ 33481db4909eSMing Lei static int blk_mq_alloc_ctxs(struct request_queue *q) 33491db4909eSMing Lei { 33501db4909eSMing Lei struct blk_mq_ctxs *ctxs; 33511db4909eSMing Lei int cpu; 33521db4909eSMing Lei 33531db4909eSMing Lei ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); 33541db4909eSMing Lei if (!ctxs) 33551db4909eSMing Lei return -ENOMEM; 33561db4909eSMing Lei 33571db4909eSMing Lei ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); 33581db4909eSMing Lei if (!ctxs->queue_ctx) 33591db4909eSMing Lei goto fail; 33601db4909eSMing Lei 33611db4909eSMing Lei for_each_possible_cpu(cpu) { 33621db4909eSMing Lei struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); 33631db4909eSMing Lei ctx->ctxs = ctxs; 33641db4909eSMing Lei } 33651db4909eSMing Lei 33661db4909eSMing Lei q->mq_kobj = &ctxs->kobj; 33671db4909eSMing Lei q->queue_ctx = ctxs->queue_ctx; 33681db4909eSMing Lei 33691db4909eSMing Lei return 0; 33701db4909eSMing Lei fail: 33711db4909eSMing Lei kfree(ctxs); 33721db4909eSMing Lei return -ENOMEM; 33731db4909eSMing Lei } 33741db4909eSMing Lei 3375e09aae7eSMing Lei /* 3376e09aae7eSMing Lei * It is the actual release handler for mq, but we do it from 3377e09aae7eSMing Lei * request queue's release handler for avoiding use-after-free 3378e09aae7eSMing Lei * and headache because q->mq_kobj shouldn't have been introduced, 3379e09aae7eSMing Lei * but we can't group ctx/kctx kobj without it. 3380e09aae7eSMing Lei */ 3381e09aae7eSMing Lei void blk_mq_release(struct request_queue *q) 3382e09aae7eSMing Lei { 33832f8f1336SMing Lei struct blk_mq_hw_ctx *hctx, *next; 33842f8f1336SMing Lei int i; 3385e09aae7eSMing Lei 33862f8f1336SMing Lei queue_for_each_hw_ctx(q, hctx, i) 33872f8f1336SMing Lei WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); 33882f8f1336SMing Lei 33892f8f1336SMing Lei /* all hctx are in .unused_hctx_list now */ 33902f8f1336SMing Lei list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { 33912f8f1336SMing Lei list_del_init(&hctx->hctx_list); 33926c8b232eSMing Lei kobject_put(&hctx->kobj); 3393c3b4afcaSMing Lei } 3394e09aae7eSMing Lei 3395e09aae7eSMing Lei kfree(q->queue_hw_ctx); 3396e09aae7eSMing Lei 33977ea5fe31SMing Lei /* 33987ea5fe31SMing Lei * release .mq_kobj and sw queue's kobject now because 33997ea5fe31SMing Lei * both share lifetime with request queue. 
34007ea5fe31SMing Lei */ 34017ea5fe31SMing Lei blk_mq_sysfs_deinit(q); 3402e09aae7eSMing Lei } 3403e09aae7eSMing Lei 34045ec780a6SChristoph Hellwig static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, 34052f227bb9SChristoph Hellwig void *queuedata) 3406320ae51fSJens Axboe { 340726a9750aSChristoph Hellwig struct request_queue *q; 340826a9750aSChristoph Hellwig int ret; 3409b62c21b7SMike Snitzer 341026a9750aSChristoph Hellwig q = blk_alloc_queue(set->numa_node); 341126a9750aSChristoph Hellwig if (!q) 3412b62c21b7SMike Snitzer return ERR_PTR(-ENOMEM); 341326a9750aSChristoph Hellwig q->queuedata = queuedata; 341426a9750aSChristoph Hellwig ret = blk_mq_init_allocated_queue(set, q); 341526a9750aSChristoph Hellwig if (ret) { 341626a9750aSChristoph Hellwig blk_cleanup_queue(q); 341726a9750aSChristoph Hellwig return ERR_PTR(ret); 341826a9750aSChristoph Hellwig } 3419b62c21b7SMike Snitzer return q; 3420b62c21b7SMike Snitzer } 34212f227bb9SChristoph Hellwig 34222f227bb9SChristoph Hellwig struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 34232f227bb9SChristoph Hellwig { 34242f227bb9SChristoph Hellwig return blk_mq_init_queue_data(set, NULL); 34252f227bb9SChristoph Hellwig } 3426b62c21b7SMike Snitzer EXPORT_SYMBOL(blk_mq_init_queue); 3427b62c21b7SMike Snitzer 34284dcc4874SChristoph Hellwig struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, 34294dcc4874SChristoph Hellwig struct lock_class_key *lkclass) 34309316a9edSJens Axboe { 34319316a9edSJens Axboe struct request_queue *q; 3432b461dfc4SChristoph Hellwig struct gendisk *disk; 34339316a9edSJens Axboe 3434b461dfc4SChristoph Hellwig q = blk_mq_init_queue_data(set, queuedata); 3435b461dfc4SChristoph Hellwig if (IS_ERR(q)) 3436b461dfc4SChristoph Hellwig return ERR_CAST(q); 34379316a9edSJens Axboe 34384a1fa41dSChristoph Hellwig disk = __alloc_disk_node(q, set->numa_node, lkclass); 3439b461dfc4SChristoph Hellwig if (!disk) { 3440b461dfc4SChristoph Hellwig blk_cleanup_queue(q); 3441b461dfc4SChristoph Hellwig return ERR_PTR(-ENOMEM); 34429316a9edSJens Axboe } 3443b461dfc4SChristoph Hellwig return disk; 34449316a9edSJens Axboe } 3445b461dfc4SChristoph Hellwig EXPORT_SYMBOL(__blk_mq_alloc_disk); 34469316a9edSJens Axboe 344734d11ffaSJianchao Wang static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( 344834d11ffaSJianchao Wang struct blk_mq_tag_set *set, struct request_queue *q, 344934d11ffaSJianchao Wang int hctx_idx, int node) 345034d11ffaSJianchao Wang { 34512f8f1336SMing Lei struct blk_mq_hw_ctx *hctx = NULL, *tmp; 345234d11ffaSJianchao Wang 34532f8f1336SMing Lei /* reuse dead hctx first */ 34542f8f1336SMing Lei spin_lock(&q->unused_hctx_lock); 34552f8f1336SMing Lei list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { 34562f8f1336SMing Lei if (tmp->numa_node == node) { 34572f8f1336SMing Lei hctx = tmp; 34582f8f1336SMing Lei break; 34592f8f1336SMing Lei } 34602f8f1336SMing Lei } 34612f8f1336SMing Lei if (hctx) 34622f8f1336SMing Lei list_del_init(&hctx->hctx_list); 34632f8f1336SMing Lei spin_unlock(&q->unused_hctx_lock); 34642f8f1336SMing Lei 34652f8f1336SMing Lei if (!hctx) 34667c6c5b7cSMing Lei hctx = blk_mq_alloc_hctx(q, set, node); 346734d11ffaSJianchao Wang if (!hctx) 34687c6c5b7cSMing Lei goto fail; 346934d11ffaSJianchao Wang 34707c6c5b7cSMing Lei if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) 34717c6c5b7cSMing Lei goto free_hctx; 347234d11ffaSJianchao Wang 347334d11ffaSJianchao Wang return hctx; 34747c6c5b7cSMing Lei 34757c6c5b7cSMing Lei free_hctx: 34767c6c5b7cSMing Lei 
kobject_put(&hctx->kobj); 34777c6c5b7cSMing Lei fail: 34787c6c5b7cSMing Lei return NULL; 347934d11ffaSJianchao Wang } 348034d11ffaSJianchao Wang 3481868f2f0bSKeith Busch static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 3482b62c21b7SMike Snitzer struct request_queue *q) 3483b62c21b7SMike Snitzer { 3484e01ad46dSJianchao Wang int i, j, end; 3485868f2f0bSKeith Busch struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; 3486320ae51fSJens Axboe 3487ac0d6b92SBart Van Assche if (q->nr_hw_queues < set->nr_hw_queues) { 3488ac0d6b92SBart Van Assche struct blk_mq_hw_ctx **new_hctxs; 3489ac0d6b92SBart Van Assche 3490ac0d6b92SBart Van Assche new_hctxs = kcalloc_node(set->nr_hw_queues, 3491ac0d6b92SBart Van Assche sizeof(*new_hctxs), GFP_KERNEL, 3492ac0d6b92SBart Van Assche set->numa_node); 3493ac0d6b92SBart Van Assche if (!new_hctxs) 3494ac0d6b92SBart Van Assche return; 3495ac0d6b92SBart Van Assche if (hctxs) 3496ac0d6b92SBart Van Assche memcpy(new_hctxs, hctxs, q->nr_hw_queues * 3497ac0d6b92SBart Van Assche sizeof(*hctxs)); 3498ac0d6b92SBart Van Assche q->queue_hw_ctx = new_hctxs; 3499ac0d6b92SBart Van Assche kfree(hctxs); 3500ac0d6b92SBart Van Assche hctxs = new_hctxs; 3501ac0d6b92SBart Van Assche } 3502ac0d6b92SBart Van Assche 3503fb350e0aSMing Lei /* protect against switching io scheduler */ 3504fb350e0aSMing Lei mutex_lock(&q->sysfs_lock); 350524d2f903SChristoph Hellwig for (i = 0; i < set->nr_hw_queues; i++) { 3506868f2f0bSKeith Busch int node; 350734d11ffaSJianchao Wang struct blk_mq_hw_ctx *hctx; 3508868f2f0bSKeith Busch 35097d76f856SDongli Zhang node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); 351034d11ffaSJianchao Wang /* 351134d11ffaSJianchao Wang * If the hw queue has been mapped to another numa node, 351234d11ffaSJianchao Wang * we need to realloc the hctx. If allocation fails, fallback 351334d11ffaSJianchao Wang * to use the previous one. 351434d11ffaSJianchao Wang */ 351534d11ffaSJianchao Wang if (hctxs[i] && (hctxs[i]->numa_node == node)) 351634d11ffaSJianchao Wang continue; 3517320ae51fSJens Axboe 351834d11ffaSJianchao Wang hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); 351934d11ffaSJianchao Wang if (hctx) { 35202f8f1336SMing Lei if (hctxs[i]) 352134d11ffaSJianchao Wang blk_mq_exit_hctx(q, set, hctxs[i], i); 352234d11ffaSJianchao Wang hctxs[i] = hctx; 352334d11ffaSJianchao Wang } else { 352434d11ffaSJianchao Wang if (hctxs[i]) 352534d11ffaSJianchao Wang pr_warn("Allocate new hctx on node %d fails,\ 352634d11ffaSJianchao Wang fallback to previous one on node %d\n", 352734d11ffaSJianchao Wang node, hctxs[i]->numa_node); 352834d11ffaSJianchao Wang else 3529868f2f0bSKeith Busch break; 3530868f2f0bSKeith Busch } 3531320ae51fSJens Axboe } 3532e01ad46dSJianchao Wang /* 3533e01ad46dSJianchao Wang * Increasing nr_hw_queues fails. Free the newly allocated 3534e01ad46dSJianchao Wang * hctxs and keep the previous q->nr_hw_queues. 
3535e01ad46dSJianchao Wang */ 3536e01ad46dSJianchao Wang if (i != set->nr_hw_queues) { 3537e01ad46dSJianchao Wang j = q->nr_hw_queues; 3538e01ad46dSJianchao Wang end = i; 3539e01ad46dSJianchao Wang } else { 3540e01ad46dSJianchao Wang j = i; 3541e01ad46dSJianchao Wang end = q->nr_hw_queues; 3542e01ad46dSJianchao Wang q->nr_hw_queues = set->nr_hw_queues; 3543e01ad46dSJianchao Wang } 354434d11ffaSJianchao Wang 3545e01ad46dSJianchao Wang for (; j < end; j++) { 3546868f2f0bSKeith Busch struct blk_mq_hw_ctx *hctx = hctxs[j]; 3547868f2f0bSKeith Busch 3548868f2f0bSKeith Busch if (hctx) { 3549e155b0c2SJohn Garry __blk_mq_free_map_and_rqs(set, j); 3550868f2f0bSKeith Busch blk_mq_exit_hctx(q, set, hctx, j); 3551868f2f0bSKeith Busch hctxs[j] = NULL; 3552868f2f0bSKeith Busch } 3553868f2f0bSKeith Busch } 3554fb350e0aSMing Lei mutex_unlock(&q->sysfs_lock); 3555868f2f0bSKeith Busch } 3556868f2f0bSKeith Busch 355726a9750aSChristoph Hellwig int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 355826a9750aSChristoph Hellwig struct request_queue *q) 3559868f2f0bSKeith Busch { 356066841672SMing Lei /* mark the queue as mq asap */ 356166841672SMing Lei q->mq_ops = set->ops; 356266841672SMing Lei 356334dbad5dSOmar Sandoval q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, 3564720b8cccSStephen Bates blk_mq_poll_stats_bkt, 3565720b8cccSStephen Bates BLK_MQ_POLL_STATS_BKTS, q); 356634dbad5dSOmar Sandoval if (!q->poll_cb) 356734dbad5dSOmar Sandoval goto err_exit; 356834dbad5dSOmar Sandoval 35691db4909eSMing Lei if (blk_mq_alloc_ctxs(q)) 357041de54c6SJes Sorensen goto err_poll; 3571868f2f0bSKeith Busch 3572737f98cfSMing Lei /* init q->mq_kobj and sw queues' kobjects */ 3573737f98cfSMing Lei blk_mq_sysfs_init(q); 3574737f98cfSMing Lei 35752f8f1336SMing Lei INIT_LIST_HEAD(&q->unused_hctx_list); 35762f8f1336SMing Lei spin_lock_init(&q->unused_hctx_lock); 35772f8f1336SMing Lei 3578868f2f0bSKeith Busch blk_mq_realloc_hw_ctxs(set, q); 3579868f2f0bSKeith Busch if (!q->nr_hw_queues) 3580868f2f0bSKeith Busch goto err_hctxs; 3581320ae51fSJens Axboe 3582287922ebSChristoph Hellwig INIT_WORK(&q->timeout_work, blk_mq_timeout_work); 3583e56f698bSMing Lei blk_queue_rq_timeout(q, set->timeout ? 
set->timeout : 30 * HZ); 3584320ae51fSJens Axboe 3585a8908939SJens Axboe q->tag_set = set; 3586320ae51fSJens Axboe 358794eddfbeSJens Axboe q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 3588cd19181bSMing Lei if (set->nr_maps > HCTX_TYPE_POLL && 3589cd19181bSMing Lei set->map[HCTX_TYPE_POLL].nr_queues) 35906544d229SChristoph Hellwig blk_queue_flag_set(QUEUE_FLAG_POLL, q); 3591320ae51fSJens Axboe 35922849450aSMike Snitzer INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); 35936fca6a61SChristoph Hellwig INIT_LIST_HEAD(&q->requeue_list); 35946fca6a61SChristoph Hellwig spin_lock_init(&q->requeue_lock); 35956fca6a61SChristoph Hellwig 3596eba71768SJens Axboe q->nr_requests = set->queue_depth; 3597eba71768SJens Axboe 359864f1c21eSJens Axboe /* 359964f1c21eSJens Axboe * Default to classic polling 360064f1c21eSJens Axboe */ 360129ece8b4SYufen Yu q->poll_nsec = BLK_MQ_POLL_CLASSIC; 360264f1c21eSJens Axboe 360324d2f903SChristoph Hellwig blk_mq_init_cpu_queues(q, set->nr_hw_queues); 36040d2602caSJens Axboe blk_mq_add_queue_tag_set(set, q); 36054b855ad3SChristoph Hellwig blk_mq_map_swqueue(q); 360626a9750aSChristoph Hellwig return 0; 360718741986SChristoph Hellwig 3608320ae51fSJens Axboe err_hctxs: 3609868f2f0bSKeith Busch kfree(q->queue_hw_ctx); 361073d9c8d4Szhengbin q->nr_hw_queues = 0; 36111db4909eSMing Lei blk_mq_sysfs_deinit(q); 361241de54c6SJes Sorensen err_poll: 361341de54c6SJes Sorensen blk_stat_free_callback(q->poll_cb); 361441de54c6SJes Sorensen q->poll_cb = NULL; 3615c7de5726SMing Lin err_exit: 3616c7de5726SMing Lin q->mq_ops = NULL; 361726a9750aSChristoph Hellwig return -ENOMEM; 3618320ae51fSJens Axboe } 3619b62c21b7SMike Snitzer EXPORT_SYMBOL(blk_mq_init_allocated_queue); 3620320ae51fSJens Axboe 3621c7e2d94bSMing Lei /* tags can _not_ be used after returning from blk_mq_exit_queue */ 3622c7e2d94bSMing Lei void blk_mq_exit_queue(struct request_queue *q) 3623320ae51fSJens Axboe { 3624624dbe47SMing Lei struct blk_mq_tag_set *set = q->tag_set; 3625320ae51fSJens Axboe 3626630ef623SBart Van Assche /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ 3627624dbe47SMing Lei blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 3628630ef623SBart Van Assche /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. 
*/ 3629630ef623SBart Van Assche blk_mq_del_queue_tag_set(q); 3630320ae51fSJens Axboe } 3631320ae51fSJens Axboe 3632a5164405SJens Axboe static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 3633a5164405SJens Axboe { 3634a5164405SJens Axboe int i; 3635a5164405SJens Axboe 3636079a2e3eSJohn Garry if (blk_mq_is_shared_tags(set->flags)) { 3637079a2e3eSJohn Garry set->shared_tags = blk_mq_alloc_map_and_rqs(set, 3638e155b0c2SJohn Garry BLK_MQ_NO_HCTX_IDX, 3639e155b0c2SJohn Garry set->queue_depth); 3640079a2e3eSJohn Garry if (!set->shared_tags) 3641e155b0c2SJohn Garry return -ENOMEM; 3642e155b0c2SJohn Garry } 3643e155b0c2SJohn Garry 36448229cca8SXianting Tian for (i = 0; i < set->nr_hw_queues; i++) { 364563064be1SJohn Garry if (!__blk_mq_alloc_map_and_rqs(set, i)) 3646a5164405SJens Axboe goto out_unwind; 36478229cca8SXianting Tian cond_resched(); 36488229cca8SXianting Tian } 3649a5164405SJens Axboe 3650a5164405SJens Axboe return 0; 3651a5164405SJens Axboe 3652a5164405SJens Axboe out_unwind: 3653e155b0c2SJohn Garry while (--i >= 0) 3654e155b0c2SJohn Garry __blk_mq_free_map_and_rqs(set, i); 3655e155b0c2SJohn Garry 3656079a2e3eSJohn Garry if (blk_mq_is_shared_tags(set->flags)) { 3657079a2e3eSJohn Garry blk_mq_free_map_and_rqs(set, set->shared_tags, 3658e155b0c2SJohn Garry BLK_MQ_NO_HCTX_IDX); 3659645db34eSJohn Garry } 3660a5164405SJens Axboe 3661a5164405SJens Axboe return -ENOMEM; 3662a5164405SJens Axboe } 3663a5164405SJens Axboe 3664a5164405SJens Axboe /* 3665a5164405SJens Axboe * Allocate the request maps associated with this tag_set. Note that this 3666a5164405SJens Axboe * may reduce the depth asked for, if memory is tight. set->queue_depth 3667a5164405SJens Axboe * will be updated to reflect the allocated depth. 3668a5164405SJens Axboe */ 366963064be1SJohn Garry static int blk_mq_alloc_set_map_and_rqs(struct blk_mq_tag_set *set) 3670a5164405SJens Axboe { 3671a5164405SJens Axboe unsigned int depth; 3672a5164405SJens Axboe int err; 3673a5164405SJens Axboe 3674a5164405SJens Axboe depth = set->queue_depth; 3675a5164405SJens Axboe do { 3676a5164405SJens Axboe err = __blk_mq_alloc_rq_maps(set); 3677a5164405SJens Axboe if (!err) 3678a5164405SJens Axboe break; 3679a5164405SJens Axboe 3680a5164405SJens Axboe set->queue_depth >>= 1; 3681a5164405SJens Axboe if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { 3682a5164405SJens Axboe err = -ENOMEM; 3683a5164405SJens Axboe break; 3684a5164405SJens Axboe } 3685a5164405SJens Axboe } while (set->queue_depth); 3686a5164405SJens Axboe 3687a5164405SJens Axboe if (!set->queue_depth || err) { 3688a5164405SJens Axboe pr_err("blk-mq: failed to allocate request map\n"); 3689a5164405SJens Axboe return -ENOMEM; 3690a5164405SJens Axboe } 3691a5164405SJens Axboe 3692a5164405SJens Axboe if (depth != set->queue_depth) 3693a5164405SJens Axboe pr_info("blk-mq: reduced tag depth (%u -> %u)\n", 3694a5164405SJens Axboe depth, set->queue_depth); 3695a5164405SJens Axboe 3696a5164405SJens Axboe return 0; 3697a5164405SJens Axboe } 3698a5164405SJens Axboe 3699ebe8bddbSOmar Sandoval static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) 3700ebe8bddbSOmar Sandoval { 37016e66b493SBart Van Assche /* 37026e66b493SBart Van Assche * blk_mq_map_queues() and multiple .map_queues() implementations 37036e66b493SBart Van Assche * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the 37046e66b493SBart Van Assche * number of hardware queues. 
37056e66b493SBart Van Assche */ 37066e66b493SBart Van Assche if (set->nr_maps == 1) 37076e66b493SBart Van Assche set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; 37086e66b493SBart Van Assche 370959388702SMing Lei if (set->ops->map_queues && !is_kdump_kernel()) { 3710b3c661b1SJens Axboe int i; 3711b3c661b1SJens Axboe 37127d4901a9SMing Lei /* 37137d4901a9SMing Lei * transport .map_queues is usually done in the following 37147d4901a9SMing Lei * way: 37157d4901a9SMing Lei * 37167d4901a9SMing Lei * for (queue = 0; queue < set->nr_hw_queues; queue++) { 37177d4901a9SMing Lei * mask = get_cpu_mask(queue) 37187d4901a9SMing Lei * for_each_cpu(cpu, mask) 3719b3c661b1SJens Axboe * set->map[x].mq_map[cpu] = queue; 37207d4901a9SMing Lei * } 37217d4901a9SMing Lei * 37227d4901a9SMing Lei * When we need to remap, the table has to be cleared for 37237d4901a9SMing Lei * killing stale mapping since one CPU may not be mapped 37247d4901a9SMing Lei * to any hw queue. 37257d4901a9SMing Lei */ 3726b3c661b1SJens Axboe for (i = 0; i < set->nr_maps; i++) 3727b3c661b1SJens Axboe blk_mq_clear_mq_map(&set->map[i]); 37287d4901a9SMing Lei 3729ebe8bddbSOmar Sandoval return set->ops->map_queues(set); 3730b3c661b1SJens Axboe } else { 3731b3c661b1SJens Axboe BUG_ON(set->nr_maps > 1); 37327d76f856SDongli Zhang return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 3733ebe8bddbSOmar Sandoval } 3734b3c661b1SJens Axboe } 3735ebe8bddbSOmar Sandoval 3736f7e76dbcSBart Van Assche static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, 3737f7e76dbcSBart Van Assche int cur_nr_hw_queues, int new_nr_hw_queues) 3738f7e76dbcSBart Van Assche { 3739f7e76dbcSBart Van Assche struct blk_mq_tags **new_tags; 3740f7e76dbcSBart Van Assche 3741f7e76dbcSBart Van Assche if (cur_nr_hw_queues >= new_nr_hw_queues) 3742f7e76dbcSBart Van Assche return 0; 3743f7e76dbcSBart Van Assche 3744f7e76dbcSBart Van Assche new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), 3745f7e76dbcSBart Van Assche GFP_KERNEL, set->numa_node); 3746f7e76dbcSBart Van Assche if (!new_tags) 3747f7e76dbcSBart Van Assche return -ENOMEM; 3748f7e76dbcSBart Van Assche 3749f7e76dbcSBart Van Assche if (set->tags) 3750f7e76dbcSBart Van Assche memcpy(new_tags, set->tags, cur_nr_hw_queues * 3751f7e76dbcSBart Van Assche sizeof(*set->tags)); 3752f7e76dbcSBart Van Assche kfree(set->tags); 3753f7e76dbcSBart Van Assche set->tags = new_tags; 3754f7e76dbcSBart Van Assche set->nr_hw_queues = new_nr_hw_queues; 3755f7e76dbcSBart Van Assche 3756f7e76dbcSBart Van Assche return 0; 3757f7e76dbcSBart Van Assche } 3758f7e76dbcSBart Van Assche 375991cdf265SMinwoo Im static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set, 376091cdf265SMinwoo Im int new_nr_hw_queues) 376191cdf265SMinwoo Im { 376291cdf265SMinwoo Im return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues); 376391cdf265SMinwoo Im } 376491cdf265SMinwoo Im 3765a4391c64SJens Axboe /* 3766a4391c64SJens Axboe * Alloc a tag set to be associated with one or more request queues. 3767a4391c64SJens Axboe * May fail with EINVAL for various error conditions. May adjust the 3768c018c84fSMinwoo Im * requested depth down, if it's too large. In that case, the set 3769a4391c64SJens Axboe * value will be stored in set->queue_depth. 
3770a4391c64SJens Axboe */ 377124d2f903SChristoph Hellwig int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 377224d2f903SChristoph Hellwig { 3773b3c661b1SJens Axboe int i, ret; 3774da695ba2SChristoph Hellwig 3775205fb5f5SBart Van Assche BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); 3776205fb5f5SBart Van Assche 377724d2f903SChristoph Hellwig if (!set->nr_hw_queues) 377824d2f903SChristoph Hellwig return -EINVAL; 3779a4391c64SJens Axboe if (!set->queue_depth) 378024d2f903SChristoph Hellwig return -EINVAL; 378124d2f903SChristoph Hellwig if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) 378224d2f903SChristoph Hellwig return -EINVAL; 378324d2f903SChristoph Hellwig 37847d7e0f90SChristoph Hellwig if (!set->ops->queue_rq) 378524d2f903SChristoph Hellwig return -EINVAL; 378624d2f903SChristoph Hellwig 3787de148297SMing Lei if (!set->ops->get_budget ^ !set->ops->put_budget) 3788de148297SMing Lei return -EINVAL; 3789de148297SMing Lei 3790a4391c64SJens Axboe if (set->queue_depth > BLK_MQ_MAX_DEPTH) { 3791a4391c64SJens Axboe pr_info("blk-mq: reduced tag depth to %u\n", 3792a4391c64SJens Axboe BLK_MQ_MAX_DEPTH); 3793a4391c64SJens Axboe set->queue_depth = BLK_MQ_MAX_DEPTH; 3794a4391c64SJens Axboe } 379524d2f903SChristoph Hellwig 3796b3c661b1SJens Axboe if (!set->nr_maps) 3797b3c661b1SJens Axboe set->nr_maps = 1; 3798b3c661b1SJens Axboe else if (set->nr_maps > HCTX_MAX_TYPES) 3799b3c661b1SJens Axboe return -EINVAL; 3800b3c661b1SJens Axboe 38016637fadfSShaohua Li /* 38026637fadfSShaohua Li * If a crashdump is active, then we are potentially in a very 38036637fadfSShaohua Li * memory constrained environment. Limit us to 1 queue and 38046637fadfSShaohua Li * 64 tags to prevent using too much memory. 38056637fadfSShaohua Li */ 38066637fadfSShaohua Li if (is_kdump_kernel()) { 38076637fadfSShaohua Li set->nr_hw_queues = 1; 380859388702SMing Lei set->nr_maps = 1; 38096637fadfSShaohua Li set->queue_depth = min(64U, set->queue_depth); 38106637fadfSShaohua Li } 3811868f2f0bSKeith Busch /* 3812392546aeSJens Axboe * There is no use for more h/w queues than cpus if we just have 3813392546aeSJens Axboe * a single map 3814868f2f0bSKeith Busch */ 3815392546aeSJens Axboe if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) 3816868f2f0bSKeith Busch set->nr_hw_queues = nr_cpu_ids; 38176637fadfSShaohua Li 381891cdf265SMinwoo Im if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) 3819a5164405SJens Axboe return -ENOMEM; 382024d2f903SChristoph Hellwig 3821da695ba2SChristoph Hellwig ret = -ENOMEM; 3822b3c661b1SJens Axboe for (i = 0; i < set->nr_maps; i++) { 3823b3c661b1SJens Axboe set->map[i].mq_map = kcalloc_node(nr_cpu_ids, 382407b35eb5SMing Lei sizeof(set->map[i].mq_map[0]), 3825da695ba2SChristoph Hellwig GFP_KERNEL, set->numa_node); 3826b3c661b1SJens Axboe if (!set->map[i].mq_map) 3827b3c661b1SJens Axboe goto out_free_mq_map; 382859388702SMing Lei set->map[i].nr_queues = is_kdump_kernel() ? 
1 : set->nr_hw_queues; 3829b3c661b1SJens Axboe } 3830bdd17e75SChristoph Hellwig 3831ebe8bddbSOmar Sandoval ret = blk_mq_update_queue_map(set); 3832da695ba2SChristoph Hellwig if (ret) 3833da695ba2SChristoph Hellwig goto out_free_mq_map; 3834da695ba2SChristoph Hellwig 383563064be1SJohn Garry ret = blk_mq_alloc_set_map_and_rqs(set); 3836da695ba2SChristoph Hellwig if (ret) 3837bdd17e75SChristoph Hellwig goto out_free_mq_map; 383824d2f903SChristoph Hellwig 38390d2602caSJens Axboe mutex_init(&set->tag_list_lock); 38400d2602caSJens Axboe INIT_LIST_HEAD(&set->tag_list); 38410d2602caSJens Axboe 384224d2f903SChristoph Hellwig return 0; 3843bdd17e75SChristoph Hellwig 3844bdd17e75SChristoph Hellwig out_free_mq_map: 3845b3c661b1SJens Axboe for (i = 0; i < set->nr_maps; i++) { 3846b3c661b1SJens Axboe kfree(set->map[i].mq_map); 3847b3c661b1SJens Axboe set->map[i].mq_map = NULL; 3848b3c661b1SJens Axboe } 38495676e7b6SRobert Elliott kfree(set->tags); 38505676e7b6SRobert Elliott set->tags = NULL; 3851da695ba2SChristoph Hellwig return ret; 385224d2f903SChristoph Hellwig } 385324d2f903SChristoph Hellwig EXPORT_SYMBOL(blk_mq_alloc_tag_set); 385424d2f903SChristoph Hellwig 3855cdb14e0fSChristoph Hellwig /* allocate and initialize a tagset for a simple single-queue device */ 3856cdb14e0fSChristoph Hellwig int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, 3857cdb14e0fSChristoph Hellwig const struct blk_mq_ops *ops, unsigned int queue_depth, 3858cdb14e0fSChristoph Hellwig unsigned int set_flags) 3859cdb14e0fSChristoph Hellwig { 3860cdb14e0fSChristoph Hellwig memset(set, 0, sizeof(*set)); 3861cdb14e0fSChristoph Hellwig set->ops = ops; 3862cdb14e0fSChristoph Hellwig set->nr_hw_queues = 1; 3863cdb14e0fSChristoph Hellwig set->nr_maps = 1; 3864cdb14e0fSChristoph Hellwig set->queue_depth = queue_depth; 3865cdb14e0fSChristoph Hellwig set->numa_node = NUMA_NO_NODE; 3866cdb14e0fSChristoph Hellwig set->flags = set_flags; 3867cdb14e0fSChristoph Hellwig return blk_mq_alloc_tag_set(set); 3868cdb14e0fSChristoph Hellwig } 3869cdb14e0fSChristoph Hellwig EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); 3870cdb14e0fSChristoph Hellwig 387124d2f903SChristoph Hellwig void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 387224d2f903SChristoph Hellwig { 3873b3c661b1SJens Axboe int i, j; 387424d2f903SChristoph Hellwig 3875e155b0c2SJohn Garry for (i = 0; i < set->nr_hw_queues; i++) 3876e155b0c2SJohn Garry __blk_mq_free_map_and_rqs(set, i); 3877484b4061SJens Axboe 3878079a2e3eSJohn Garry if (blk_mq_is_shared_tags(set->flags)) { 3879079a2e3eSJohn Garry blk_mq_free_map_and_rqs(set, set->shared_tags, 3880e155b0c2SJohn Garry BLK_MQ_NO_HCTX_IDX); 3881e155b0c2SJohn Garry } 388232bc15afSJohn Garry 3883b3c661b1SJens Axboe for (j = 0; j < set->nr_maps; j++) { 3884b3c661b1SJens Axboe kfree(set->map[j].mq_map); 3885b3c661b1SJens Axboe set->map[j].mq_map = NULL; 3886b3c661b1SJens Axboe } 3887bdd17e75SChristoph Hellwig 3888981bd189SMing Lei kfree(set->tags); 38895676e7b6SRobert Elliott set->tags = NULL; 389024d2f903SChristoph Hellwig } 389124d2f903SChristoph Hellwig EXPORT_SYMBOL(blk_mq_free_tag_set); 389224d2f903SChristoph Hellwig 3893e3a2b3f9SJens Axboe int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) 3894e3a2b3f9SJens Axboe { 3895e3a2b3f9SJens Axboe struct blk_mq_tag_set *set = q->tag_set; 3896e3a2b3f9SJens Axboe struct blk_mq_hw_ctx *hctx; 3897e3a2b3f9SJens Axboe int i, ret; 3898e3a2b3f9SJens Axboe 3899bd166ef1SJens Axboe if (!set) 3900e3a2b3f9SJens Axboe return -EINVAL; 3901e3a2b3f9SJens Axboe 3902e5fa8140SAleksei Zakharov 
if (q->nr_requests == nr) 3903e5fa8140SAleksei Zakharov return 0; 3904e5fa8140SAleksei Zakharov 390570f36b60SJens Axboe blk_mq_freeze_queue(q); 390624f5a90fSMing Lei blk_mq_quiesce_queue(q); 390770f36b60SJens Axboe 3908e3a2b3f9SJens Axboe ret = 0; 3909e3a2b3f9SJens Axboe queue_for_each_hw_ctx(q, hctx, i) { 3910e9137d4bSKeith Busch if (!hctx->tags) 3911e9137d4bSKeith Busch continue; 3912bd166ef1SJens Axboe /* 3913bd166ef1SJens Axboe * If we're using an MQ scheduler, just update the scheduler 3914bd166ef1SJens Axboe * queue depth. This is similar to what the old code would do. 3915bd166ef1SJens Axboe */ 3916f6adcef5SJohn Garry if (hctx->sched_tags) { 391770f36b60SJens Axboe ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, 391870f36b60SJens Axboe nr, true); 3919f6adcef5SJohn Garry } else { 3920f6adcef5SJohn Garry ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, 3921f6adcef5SJohn Garry false); 392270f36b60SJens Axboe } 3923e3a2b3f9SJens Axboe if (ret) 3924e3a2b3f9SJens Axboe break; 392577f1e0a5SJens Axboe if (q->elevator && q->elevator->type->ops.depth_updated) 392677f1e0a5SJens Axboe q->elevator->type->ops.depth_updated(hctx); 3927e3a2b3f9SJens Axboe } 3928d97e594cSJohn Garry if (!ret) { 3929e3a2b3f9SJens Axboe q->nr_requests = nr; 3930079a2e3eSJohn Garry if (blk_mq_is_shared_tags(set->flags)) { 39318fa04464SJohn Garry if (q->elevator) 3932079a2e3eSJohn Garry blk_mq_tag_update_sched_shared_tags(q); 39338fa04464SJohn Garry else 3934079a2e3eSJohn Garry blk_mq_tag_resize_shared_tags(set, nr); 39358fa04464SJohn Garry } 3936d97e594cSJohn Garry } 3937e3a2b3f9SJens Axboe 393824f5a90fSMing Lei blk_mq_unquiesce_queue(q); 393970f36b60SJens Axboe blk_mq_unfreeze_queue(q); 394070f36b60SJens Axboe 3941e3a2b3f9SJens Axboe return ret; 3942e3a2b3f9SJens Axboe } 3943e3a2b3f9SJens Axboe 3944d48ece20SJianchao Wang /* 3945d48ece20SJianchao Wang * request_queue and elevator_type pair. 3946d48ece20SJianchao Wang * It is just used by __blk_mq_update_nr_hw_queues to cache 3947d48ece20SJianchao Wang * the elevator_type associated with a request_queue. 
3948d48ece20SJianchao Wang */ 3949d48ece20SJianchao Wang struct blk_mq_qe_pair { 3950d48ece20SJianchao Wang struct list_head node; 3951d48ece20SJianchao Wang struct request_queue *q; 3952d48ece20SJianchao Wang struct elevator_type *type; 3953d48ece20SJianchao Wang }; 3954d48ece20SJianchao Wang 3955d48ece20SJianchao Wang /* 3956d48ece20SJianchao Wang * Cache the elevator_type in qe pair list and switch the 3957d48ece20SJianchao Wang * io scheduler to 'none' 3958d48ece20SJianchao Wang */ 3959d48ece20SJianchao Wang static bool blk_mq_elv_switch_none(struct list_head *head, 3960d48ece20SJianchao Wang struct request_queue *q) 3961d48ece20SJianchao Wang { 3962d48ece20SJianchao Wang struct blk_mq_qe_pair *qe; 3963d48ece20SJianchao Wang 3964d48ece20SJianchao Wang if (!q->elevator) 3965d48ece20SJianchao Wang return true; 3966d48ece20SJianchao Wang 3967d48ece20SJianchao Wang qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); 3968d48ece20SJianchao Wang if (!qe) 3969d48ece20SJianchao Wang return false; 3970d48ece20SJianchao Wang 3971d48ece20SJianchao Wang INIT_LIST_HEAD(&qe->node); 3972d48ece20SJianchao Wang qe->q = q; 3973d48ece20SJianchao Wang qe->type = q->elevator->type; 3974d48ece20SJianchao Wang list_add(&qe->node, head); 3975d48ece20SJianchao Wang 3976d48ece20SJianchao Wang mutex_lock(&q->sysfs_lock); 3977d48ece20SJianchao Wang /* 3978d48ece20SJianchao Wang * After elevator_switch_mq, the previous elevator_queue will be 3979d48ece20SJianchao Wang * released by elevator_release. The reference of the io scheduler 3980d48ece20SJianchao Wang * module get by elevator_get will also be put. So we need to get 3981d48ece20SJianchao Wang * a reference of the io scheduler module here to prevent it to be 3982d48ece20SJianchao Wang * removed. 3983d48ece20SJianchao Wang */ 3984d48ece20SJianchao Wang __module_get(qe->type->elevator_owner); 3985d48ece20SJianchao Wang elevator_switch_mq(q, NULL); 3986d48ece20SJianchao Wang mutex_unlock(&q->sysfs_lock); 3987d48ece20SJianchao Wang 3988d48ece20SJianchao Wang return true; 3989d48ece20SJianchao Wang } 3990d48ece20SJianchao Wang 3991d48ece20SJianchao Wang static void blk_mq_elv_switch_back(struct list_head *head, 3992d48ece20SJianchao Wang struct request_queue *q) 3993d48ece20SJianchao Wang { 3994d48ece20SJianchao Wang struct blk_mq_qe_pair *qe; 3995d48ece20SJianchao Wang struct elevator_type *t = NULL; 3996d48ece20SJianchao Wang 3997d48ece20SJianchao Wang list_for_each_entry(qe, head, node) 3998d48ece20SJianchao Wang if (qe->q == q) { 3999d48ece20SJianchao Wang t = qe->type; 4000d48ece20SJianchao Wang break; 4001d48ece20SJianchao Wang } 4002d48ece20SJianchao Wang 4003d48ece20SJianchao Wang if (!t) 4004d48ece20SJianchao Wang return; 4005d48ece20SJianchao Wang 4006d48ece20SJianchao Wang list_del(&qe->node); 4007d48ece20SJianchao Wang kfree(qe); 4008d48ece20SJianchao Wang 4009d48ece20SJianchao Wang mutex_lock(&q->sysfs_lock); 4010d48ece20SJianchao Wang elevator_switch_mq(q, t); 4011d48ece20SJianchao Wang mutex_unlock(&q->sysfs_lock); 4012d48ece20SJianchao Wang } 4013d48ece20SJianchao Wang 4014e4dc2b32SKeith Busch static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, 4015e4dc2b32SKeith Busch int nr_hw_queues) 4016868f2f0bSKeith Busch { 4017868f2f0bSKeith Busch struct request_queue *q; 4018d48ece20SJianchao Wang LIST_HEAD(head); 4019e01ad46dSJianchao Wang int prev_nr_hw_queues; 4020868f2f0bSKeith Busch 4021705cda97SBart Van Assche lockdep_assert_held(&set->tag_list_lock); 4022705cda97SBart Van Assche 4023392546aeSJens Axboe if 
(set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) 4024868f2f0bSKeith Busch nr_hw_queues = nr_cpu_ids; 4025fe35ec58SWeiping Zhang if (nr_hw_queues < 1) 4026fe35ec58SWeiping Zhang return; 4027fe35ec58SWeiping Zhang if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) 4028868f2f0bSKeith Busch return; 4029868f2f0bSKeith Busch 4030868f2f0bSKeith Busch list_for_each_entry(q, &set->tag_list, tag_set_list) 4031868f2f0bSKeith Busch blk_mq_freeze_queue(q); 4032d48ece20SJianchao Wang /* 4033d48ece20SJianchao Wang * Switch IO scheduler to 'none', cleaning up the data associated 4034d48ece20SJianchao Wang * with the previous scheduler. We will switch back once we are done 4035d48ece20SJianchao Wang * updating the new sw to hw queue mappings. 4036d48ece20SJianchao Wang */ 4037d48ece20SJianchao Wang list_for_each_entry(q, &set->tag_list, tag_set_list) 4038d48ece20SJianchao Wang if (!blk_mq_elv_switch_none(&head, q)) 4039d48ece20SJianchao Wang goto switch_back; 4040868f2f0bSKeith Busch 4041477e19deSJianchao Wang list_for_each_entry(q, &set->tag_list, tag_set_list) { 4042477e19deSJianchao Wang blk_mq_debugfs_unregister_hctxs(q); 4043477e19deSJianchao Wang blk_mq_sysfs_unregister(q); 4044477e19deSJianchao Wang } 4045477e19deSJianchao Wang 4046a2584e43SWeiping Zhang prev_nr_hw_queues = set->nr_hw_queues; 4047f7e76dbcSBart Van Assche if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < 4048f7e76dbcSBart Van Assche 0) 4049f7e76dbcSBart Van Assche goto reregister; 4050f7e76dbcSBart Van Assche 4051868f2f0bSKeith Busch set->nr_hw_queues = nr_hw_queues; 4052e01ad46dSJianchao Wang fallback: 4053aa880ad6SWeiping Zhang blk_mq_update_queue_map(set); 4054868f2f0bSKeith Busch list_for_each_entry(q, &set->tag_list, tag_set_list) { 4055868f2f0bSKeith Busch blk_mq_realloc_hw_ctxs(set, q); 4056e01ad46dSJianchao Wang if (q->nr_hw_queues != set->nr_hw_queues) { 4057e01ad46dSJianchao Wang pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", 4058e01ad46dSJianchao Wang nr_hw_queues, prev_nr_hw_queues); 4059e01ad46dSJianchao Wang set->nr_hw_queues = prev_nr_hw_queues; 40607d76f856SDongli Zhang blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 4061e01ad46dSJianchao Wang goto fallback; 4062e01ad46dSJianchao Wang } 4063477e19deSJianchao Wang blk_mq_map_swqueue(q); 4064477e19deSJianchao Wang } 4065477e19deSJianchao Wang 4066f7e76dbcSBart Van Assche reregister: 4067477e19deSJianchao Wang list_for_each_entry(q, &set->tag_list, tag_set_list) { 4068477e19deSJianchao Wang blk_mq_sysfs_register(q); 4069477e19deSJianchao Wang blk_mq_debugfs_register_hctxs(q); 4070868f2f0bSKeith Busch } 4071868f2f0bSKeith Busch 4072d48ece20SJianchao Wang switch_back: 4073d48ece20SJianchao Wang list_for_each_entry(q, &set->tag_list, tag_set_list) 4074d48ece20SJianchao Wang blk_mq_elv_switch_back(&head, q); 4075d48ece20SJianchao Wang 4076868f2f0bSKeith Busch list_for_each_entry(q, &set->tag_list, tag_set_list) 4077868f2f0bSKeith Busch blk_mq_unfreeze_queue(q); 4078868f2f0bSKeith Busch } 4079e4dc2b32SKeith Busch 4080e4dc2b32SKeith Busch void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) 4081e4dc2b32SKeith Busch { 4082e4dc2b32SKeith Busch mutex_lock(&set->tag_list_lock); 4083e4dc2b32SKeith Busch __blk_mq_update_nr_hw_queues(set, nr_hw_queues); 4084e4dc2b32SKeith Busch mutex_unlock(&set->tag_list_lock); 4085e4dc2b32SKeith Busch } 4086868f2f0bSKeith Busch EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); 4087868f2f0bSKeith Busch 408834dbad5dSOmar Sandoval /* Enable polling stats and return whether they 
were already enabled. */ 408934dbad5dSOmar Sandoval static bool blk_poll_stats_enable(struct request_queue *q) 409034dbad5dSOmar Sandoval { 409134dbad5dSOmar Sandoval if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || 40927dfdbc73SBart Van Assche blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q)) 409334dbad5dSOmar Sandoval return true; 409434dbad5dSOmar Sandoval blk_stat_add_callback(q, q->poll_cb); 409534dbad5dSOmar Sandoval return false; 409634dbad5dSOmar Sandoval } 409734dbad5dSOmar Sandoval 409834dbad5dSOmar Sandoval static void blk_mq_poll_stats_start(struct request_queue *q) 409934dbad5dSOmar Sandoval { 410034dbad5dSOmar Sandoval /* 410134dbad5dSOmar Sandoval * We don't arm the callback if polling stats are not enabled or the 410234dbad5dSOmar Sandoval * callback is already active. 410334dbad5dSOmar Sandoval */ 410434dbad5dSOmar Sandoval if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || 410534dbad5dSOmar Sandoval blk_stat_is_active(q->poll_cb)) 410634dbad5dSOmar Sandoval return; 410734dbad5dSOmar Sandoval 410834dbad5dSOmar Sandoval blk_stat_activate_msecs(q->poll_cb, 100); 410934dbad5dSOmar Sandoval } 411034dbad5dSOmar Sandoval 411134dbad5dSOmar Sandoval static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) 411234dbad5dSOmar Sandoval { 411334dbad5dSOmar Sandoval struct request_queue *q = cb->data; 4114720b8cccSStephen Bates int bucket; 411534dbad5dSOmar Sandoval 4116720b8cccSStephen Bates for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) { 4117720b8cccSStephen Bates if (cb->stat[bucket].nr_samples) 4118720b8cccSStephen Bates q->poll_stat[bucket] = cb->stat[bucket]; 4119720b8cccSStephen Bates } 412034dbad5dSOmar Sandoval } 412134dbad5dSOmar Sandoval 412264f1c21eSJens Axboe static unsigned long blk_mq_poll_nsecs(struct request_queue *q, 412364f1c21eSJens Axboe struct request *rq) 412464f1c21eSJens Axboe { 412564f1c21eSJens Axboe unsigned long ret = 0; 4126720b8cccSStephen Bates int bucket; 412764f1c21eSJens Axboe 412864f1c21eSJens Axboe /* 412964f1c21eSJens Axboe * If stats collection isn't on, don't sleep but turn it on for 413064f1c21eSJens Axboe * future users 413164f1c21eSJens Axboe */ 413234dbad5dSOmar Sandoval if (!blk_poll_stats_enable(q)) 413364f1c21eSJens Axboe return 0; 413464f1c21eSJens Axboe 413564f1c21eSJens Axboe /* 413664f1c21eSJens Axboe * As an optimistic guess, use half of the mean service time 413764f1c21eSJens Axboe * for this type of request. We can (and should) make this smarter. 413864f1c21eSJens Axboe * For instance, if the completion latencies are tight, we can 413964f1c21eSJens Axboe * get closer than just half the mean. This is especially 414064f1c21eSJens Axboe * important on devices where the completion latencies are longer 4141720b8cccSStephen Bates * than ~10 usec. We do use the stats for the relevant IO size 4142720b8cccSStephen Bates * if available which does lead to better estimates. 
414364f1c21eSJens Axboe */ 4144720b8cccSStephen Bates bucket = blk_mq_poll_stats_bkt(rq); 4145720b8cccSStephen Bates if (bucket < 0) 4146720b8cccSStephen Bates return ret; 4147720b8cccSStephen Bates 4148720b8cccSStephen Bates if (q->poll_stat[bucket].nr_samples) 4149720b8cccSStephen Bates ret = (q->poll_stat[bucket].mean + 1) / 2; 415064f1c21eSJens Axboe 415164f1c21eSJens Axboe return ret; 415264f1c21eSJens Axboe } 415364f1c21eSJens Axboe 4154c6699d6fSChristoph Hellwig static bool blk_mq_poll_hybrid(struct request_queue *q, blk_qc_t qc) 415506426adfSJens Axboe { 4156c6699d6fSChristoph Hellwig struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, qc); 4157c6699d6fSChristoph Hellwig struct request *rq = blk_qc_to_rq(hctx, qc); 415806426adfSJens Axboe struct hrtimer_sleeper hs; 415906426adfSJens Axboe enum hrtimer_mode mode; 416064f1c21eSJens Axboe unsigned int nsecs; 416106426adfSJens Axboe ktime_t kt; 416206426adfSJens Axboe 4163c6699d6fSChristoph Hellwig /* 4164c6699d6fSChristoph Hellwig * If a request has completed on queue that uses an I/O scheduler, we 4165c6699d6fSChristoph Hellwig * won't get back a request from blk_qc_to_rq. 4166c6699d6fSChristoph Hellwig */ 4167c6699d6fSChristoph Hellwig if (!rq || (rq->rq_flags & RQF_MQ_POLL_SLEPT)) 416864f1c21eSJens Axboe return false; 416964f1c21eSJens Axboe 417064f1c21eSJens Axboe /* 41711052b8acSJens Axboe * If we get here, hybrid polling is enabled. Hence poll_nsec can be: 417264f1c21eSJens Axboe * 417364f1c21eSJens Axboe * 0: use half of prev avg 417464f1c21eSJens Axboe * >0: use this specific value 417564f1c21eSJens Axboe */ 41761052b8acSJens Axboe if (q->poll_nsec > 0) 417764f1c21eSJens Axboe nsecs = q->poll_nsec; 417864f1c21eSJens Axboe else 4179cae740a0SJohn Garry nsecs = blk_mq_poll_nsecs(q, rq); 418064f1c21eSJens Axboe 418164f1c21eSJens Axboe if (!nsecs) 418206426adfSJens Axboe return false; 418306426adfSJens Axboe 418476a86f9dSJens Axboe rq->rq_flags |= RQF_MQ_POLL_SLEPT; 418506426adfSJens Axboe 418606426adfSJens Axboe /* 418706426adfSJens Axboe * This will be replaced with the stats tracking code, using 418806426adfSJens Axboe * 'avg_completion_time / 2' as the pre-sleep target. 418906426adfSJens Axboe */ 41908b0e1953SThomas Gleixner kt = nsecs; 419106426adfSJens Axboe 419206426adfSJens Axboe mode = HRTIMER_MODE_REL; 4193dbc1625fSSebastian Andrzej Siewior hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode); 419406426adfSJens Axboe hrtimer_set_expires(&hs.timer, kt); 419506426adfSJens Axboe 419606426adfSJens Axboe do { 41975a61c363STejun Heo if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) 419806426adfSJens Axboe break; 419906426adfSJens Axboe set_current_state(TASK_UNINTERRUPTIBLE); 42009dd8813eSThomas Gleixner hrtimer_sleeper_start_expires(&hs, mode); 420106426adfSJens Axboe if (hs.task) 420206426adfSJens Axboe io_schedule(); 420306426adfSJens Axboe hrtimer_cancel(&hs.timer); 420406426adfSJens Axboe mode = HRTIMER_MODE_ABS; 420506426adfSJens Axboe } while (hs.task && !signal_pending(current)); 420606426adfSJens Axboe 420706426adfSJens Axboe __set_current_state(TASK_RUNNING); 420806426adfSJens Axboe destroy_hrtimer_on_stack(&hs.timer); 4209c6699d6fSChristoph Hellwig 4210c6699d6fSChristoph Hellwig /* 4211c6699d6fSChristoph Hellwig * If we sleep, have the caller restart the poll loop to reset the 4212c6699d6fSChristoph Hellwig * state. Like for the other success return cases, the caller is 4213c6699d6fSChristoph Hellwig * responsible for checking if the IO completed. 
If the IO isn't 4214c6699d6fSChristoph Hellwig * complete, we'll get called again and will go straight to the busy 4215c6699d6fSChristoph Hellwig * poll loop. 4216c6699d6fSChristoph Hellwig */ 421706426adfSJens Axboe return true; 421806426adfSJens Axboe } 421906426adfSJens Axboe 4220c6699d6fSChristoph Hellwig static int blk_mq_poll_classic(struct request_queue *q, blk_qc_t cookie, 42215a72e899SJens Axboe struct io_comp_batch *iob, unsigned int flags) 4222bbd7bb70SJens Axboe { 4223c6699d6fSChristoph Hellwig struct blk_mq_hw_ctx *hctx = blk_qc_to_hctx(q, cookie); 4224c6699d6fSChristoph Hellwig long state = get_current_state(); 4225c6699d6fSChristoph Hellwig int ret; 42261052b8acSJens Axboe 4227c6699d6fSChristoph Hellwig do { 42285a72e899SJens Axboe ret = q->mq_ops->poll(hctx, iob); 4229c6699d6fSChristoph Hellwig if (ret > 0) { 4230c6699d6fSChristoph Hellwig __set_current_state(TASK_RUNNING); 4231c6699d6fSChristoph Hellwig return ret; 42321052b8acSJens Axboe } 42331052b8acSJens Axboe 4234c6699d6fSChristoph Hellwig if (signal_pending_state(state, current)) 4235c6699d6fSChristoph Hellwig __set_current_state(TASK_RUNNING); 4236c6699d6fSChristoph Hellwig if (task_is_running(current)) 4237c6699d6fSChristoph Hellwig return 1; 4238c6699d6fSChristoph Hellwig 4239ef99b2d3SChristoph Hellwig if (ret < 0 || (flags & BLK_POLL_ONESHOT)) 4240c6699d6fSChristoph Hellwig break; 4241c6699d6fSChristoph Hellwig cpu_relax(); 4242c6699d6fSChristoph Hellwig } while (!need_resched()); 4243c6699d6fSChristoph Hellwig 4244c6699d6fSChristoph Hellwig __set_current_state(TASK_RUNNING); 4245c6699d6fSChristoph Hellwig return 0; 42461052b8acSJens Axboe } 42471052b8acSJens Axboe 42485a72e899SJens Axboe int blk_mq_poll(struct request_queue *q, blk_qc_t cookie, struct io_comp_batch *iob, 42495a72e899SJens Axboe unsigned int flags) 42501052b8acSJens Axboe { 4251d729cf9aSChristoph Hellwig if (!(flags & BLK_POLL_NOSLEEP) && 4252d729cf9aSChristoph Hellwig q->poll_nsec != BLK_MQ_POLL_CLASSIC) { 4253c6699d6fSChristoph Hellwig if (blk_mq_poll_hybrid(q, cookie)) 425485f4d4b6SJens Axboe return 1; 4255bbd7bb70SJens Axboe } 42565a72e899SJens Axboe return blk_mq_poll_classic(q, cookie, iob, flags); 4257bbd7bb70SJens Axboe } 4258bbd7bb70SJens Axboe 42599cf2bab6SJens Axboe unsigned int blk_mq_rq_cpu(struct request *rq) 42609cf2bab6SJens Axboe { 42619cf2bab6SJens Axboe return rq->mq_ctx->cpu; 42629cf2bab6SJens Axboe } 42639cf2bab6SJens Axboe EXPORT_SYMBOL(blk_mq_rq_cpu); 42649cf2bab6SJens Axboe 4265320ae51fSJens Axboe static int __init blk_mq_init(void) 4266320ae51fSJens Axboe { 4267c3077b5dSChristoph Hellwig int i; 4268c3077b5dSChristoph Hellwig 4269c3077b5dSChristoph Hellwig for_each_possible_cpu(i) 4270f9ab4918SSebastian Andrzej Siewior init_llist_head(&per_cpu(blk_cpu_done, i)); 4271c3077b5dSChristoph Hellwig open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); 4272c3077b5dSChristoph Hellwig 4273c3077b5dSChristoph Hellwig cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, 4274c3077b5dSChristoph Hellwig "block/softirq:dead", NULL, 4275c3077b5dSChristoph Hellwig blk_softirq_cpu_dead); 42769467f859SThomas Gleixner cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 42779467f859SThomas Gleixner blk_mq_hctx_notify_dead); 4278bf0beec0SMing Lei cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", 4279bf0beec0SMing Lei blk_mq_hctx_notify_online, 4280bf0beec0SMing Lei blk_mq_hctx_notify_offline); 4281320ae51fSJens Axboe return 0; 4282320ae51fSJens Axboe } 4283320ae51fSJens Axboe subsys_initcall(blk_mq_init); 4284
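/*
 * Illustrative sketch, not part of the upstream file: roughly how a driver
 * consumes the tag-set and queue setup path above (blk_mq_alloc_sq_tag_set /
 * blk_mq_alloc_tag_set feeding blk_mq_init_allocated_queue).  All "my_*"
 * identifiers are hypothetical, error handling is trimmed to the essentials,
 * and it assumes the blk_mq_alloc_disk() helper from <linux/blk-mq.h> that is
 * available in kernels of this vintage.  Treat it as a sketch of the calling
 * convention, not a complete driver.
 */
#include <linux/blk-mq.h>
#include <linux/blkdev.h>
#include <linux/err.h>
#include <linux/module.h>

static blk_status_t my_queue_rq(struct blk_mq_hw_ctx *hctx,
				const struct blk_mq_queue_data *bd)
{
	struct request *rq = bd->rq;

	blk_mq_start_request(rq);
	/* A real driver would program the hardware here; complete inline. */
	blk_mq_end_request(rq, BLK_STS_OK);
	return BLK_STS_OK;
}

static const struct blk_mq_ops my_mq_ops = {
	.queue_rq	= my_queue_rq,
};

static struct blk_mq_tag_set my_tag_set;
static struct gendisk *my_disk;

static int __init my_driver_init(void)
{
	int ret;

	/*
	 * One hw queue, depth 128.  blk_mq_alloc_sq_tag_set() zeroes the set
	 * and forwards to blk_mq_alloc_tag_set(), which may shrink the depth
	 * if memory is tight (see blk_mq_alloc_set_map_and_rqs()).
	 */
	ret = blk_mq_alloc_sq_tag_set(&my_tag_set, &my_mq_ops, 128,
				      BLK_MQ_F_SHOULD_MERGE);
	if (ret)
		return ret;

	/*
	 * blk_mq_alloc_disk() allocates the gendisk and request_queue and
	 * initializes the queue via blk_mq_init_allocated_queue().
	 */
	my_disk = blk_mq_alloc_disk(&my_tag_set, NULL);
	if (IS_ERR(my_disk)) {
		blk_mq_free_tag_set(&my_tag_set);
		return PTR_ERR(my_disk);
	}

	/* Set my_disk->fops and capacity, then register with add_disk(). */
	return 0;
}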