1 // SPDX-License-Identifier: GPL-2.0 2 /* 3 * Block multiqueue core code 4 * 5 * Copyright (C) 2013-2014 Jens Axboe 6 * Copyright (C) 2013-2014 Christoph Hellwig 7 */ 8 #include <linux/kernel.h> 9 #include <linux/module.h> 10 #include <linux/backing-dev.h> 11 #include <linux/bio.h> 12 #include <linux/blkdev.h> 13 #include <linux/kmemleak.h> 14 #include <linux/mm.h> 15 #include <linux/init.h> 16 #include <linux/slab.h> 17 #include <linux/workqueue.h> 18 #include <linux/smp.h> 19 #include <linux/llist.h> 20 #include <linux/list_sort.h> 21 #include <linux/cpu.h> 22 #include <linux/cache.h> 23 #include <linux/sched/sysctl.h> 24 #include <linux/sched/topology.h> 25 #include <linux/sched/signal.h> 26 #include <linux/delay.h> 27 #include <linux/crash_dump.h> 28 #include <linux/prefetch.h> 29 #include <linux/blk-crypto.h> 30 31 #include <trace/events/block.h> 32 33 #include <linux/blk-mq.h> 34 #include <linux/t10-pi.h> 35 #include "blk.h" 36 #include "blk-mq.h" 37 #include "blk-mq-debugfs.h" 38 #include "blk-mq-tag.h" 39 #include "blk-pm.h" 40 #include "blk-stat.h" 41 #include "blk-mq-sched.h" 42 #include "blk-rq-qos.h" 43 44 static DEFINE_PER_CPU(struct llist_head, blk_cpu_done); 45 46 static void blk_mq_poll_stats_start(struct request_queue *q); 47 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); 48 49 static int blk_mq_poll_stats_bkt(const struct request *rq) 50 { 51 int ddir, sectors, bucket; 52 53 ddir = rq_data_dir(rq); 54 sectors = blk_rq_stats_sectors(rq); 55 56 bucket = ddir + 2 * ilog2(sectors); 57 58 if (bucket < 0) 59 return -1; 60 else if (bucket >= BLK_MQ_POLL_STATS_BKTS) 61 return ddir + BLK_MQ_POLL_STATS_BKTS - 2; 62 63 return bucket; 64 } 65 66 /* 67 * Check if any of the ctx, dispatch list or elevator 68 * have pending work in this hardware queue. 69 */ 70 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 71 { 72 return !list_empty_careful(&hctx->dispatch) || 73 sbitmap_any_bit_set(&hctx->ctx_map) || 74 blk_mq_sched_has_work(hctx); 75 } 76 77 /* 78 * Mark this ctx as having pending work in this hardware queue 79 */ 80 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 81 struct blk_mq_ctx *ctx) 82 { 83 const int bit = ctx->index_hw[hctx->type]; 84 85 if (!sbitmap_test_bit(&hctx->ctx_map, bit)) 86 sbitmap_set_bit(&hctx->ctx_map, bit); 87 } 88 89 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 90 struct blk_mq_ctx *ctx) 91 { 92 const int bit = ctx->index_hw[hctx->type]; 93 94 sbitmap_clear_bit(&hctx->ctx_map, bit); 95 } 96 97 struct mq_inflight { 98 struct block_device *part; 99 unsigned int inflight[2]; 100 }; 101 102 static bool blk_mq_check_inflight(struct blk_mq_hw_ctx *hctx, 103 struct request *rq, void *priv, 104 bool reserved) 105 { 106 struct mq_inflight *mi = priv; 107 108 if ((!mi->part->bd_partno || rq->part == mi->part) && 109 blk_mq_rq_state(rq) == MQ_RQ_IN_FLIGHT) 110 mi->inflight[rq_data_dir(rq)]++; 111 112 return true; 113 } 114 115 unsigned int blk_mq_in_flight(struct request_queue *q, 116 struct block_device *part) 117 { 118 struct mq_inflight mi = { .part = part }; 119 120 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); 121 122 return mi.inflight[0] + mi.inflight[1]; 123 } 124 125 void blk_mq_in_flight_rw(struct request_queue *q, struct block_device *part, 126 unsigned int inflight[2]) 127 { 128 struct mq_inflight mi = { .part = part }; 129 130 blk_mq_queue_tag_busy_iter(q, blk_mq_check_inflight, &mi); 131 inflight[0] = mi.inflight[0]; 132 inflight[1] = mi.inflight[1]; 133 } 134 135 void blk_freeze_queue_start(struct request_queue *q) 136 { 137 mutex_lock(&q->mq_freeze_lock); 138 if (++q->mq_freeze_depth == 1) { 139 percpu_ref_kill(&q->q_usage_counter); 140 mutex_unlock(&q->mq_freeze_lock); 141 if (queue_is_mq(q)) 142 blk_mq_run_hw_queues(q, false); 143 } else { 144 mutex_unlock(&q->mq_freeze_lock); 145 } 146 } 147 EXPORT_SYMBOL_GPL(blk_freeze_queue_start); 148 149 void blk_mq_freeze_queue_wait(struct request_queue *q) 150 { 151 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); 152 } 153 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); 154 155 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 156 unsigned long timeout) 157 { 158 return wait_event_timeout(q->mq_freeze_wq, 159 percpu_ref_is_zero(&q->q_usage_counter), 160 timeout); 161 } 162 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); 163 164 /* 165 * Guarantee no request is in use, so we can change any data structure of 166 * the queue afterward. 167 */ 168 void blk_freeze_queue(struct request_queue *q) 169 { 170 /* 171 * In the !blk_mq case we are only calling this to kill the 172 * q_usage_counter, otherwise this increases the freeze depth 173 * and waits for it to return to zero. For this reason there is 174 * no blk_unfreeze_queue(), and blk_freeze_queue() is not 175 * exported to drivers as the only user for unfreeze is blk_mq. 176 */ 177 blk_freeze_queue_start(q); 178 blk_mq_freeze_queue_wait(q); 179 } 180 181 void blk_mq_freeze_queue(struct request_queue *q) 182 { 183 /* 184 * ...just an alias to keep freeze and unfreeze actions balanced 185 * in the blk_mq_* namespace 186 */ 187 blk_freeze_queue(q); 188 } 189 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); 190 191 void __blk_mq_unfreeze_queue(struct request_queue *q, bool force_atomic) 192 { 193 mutex_lock(&q->mq_freeze_lock); 194 if (force_atomic) 195 q->q_usage_counter.data->force_atomic = true; 196 q->mq_freeze_depth--; 197 WARN_ON_ONCE(q->mq_freeze_depth < 0); 198 if (!q->mq_freeze_depth) { 199 percpu_ref_resurrect(&q->q_usage_counter); 200 wake_up_all(&q->mq_freeze_wq); 201 } 202 mutex_unlock(&q->mq_freeze_lock); 203 } 204 205 void blk_mq_unfreeze_queue(struct request_queue *q) 206 { 207 __blk_mq_unfreeze_queue(q, false); 208 } 209 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); 210 211 /* 212 * FIXME: replace the scsi_internal_device_*block_nowait() calls in the 213 * mpt3sas driver such that this function can be removed. 214 */ 215 void blk_mq_quiesce_queue_nowait(struct request_queue *q) 216 { 217 blk_queue_flag_set(QUEUE_FLAG_QUIESCED, q); 218 } 219 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue_nowait); 220 221 /** 222 * blk_mq_quiesce_queue() - wait until all ongoing dispatches have finished 223 * @q: request queue. 224 * 225 * Note: this function does not prevent that the struct request end_io() 226 * callback function is invoked. Once this function is returned, we make 227 * sure no dispatch can happen until the queue is unquiesced via 228 * blk_mq_unquiesce_queue(). 229 */ 230 void blk_mq_quiesce_queue(struct request_queue *q) 231 { 232 struct blk_mq_hw_ctx *hctx; 233 unsigned int i; 234 bool rcu = false; 235 236 blk_mq_quiesce_queue_nowait(q); 237 238 queue_for_each_hw_ctx(q, hctx, i) { 239 if (hctx->flags & BLK_MQ_F_BLOCKING) 240 synchronize_srcu(hctx->srcu); 241 else 242 rcu = true; 243 } 244 if (rcu) 245 synchronize_rcu(); 246 } 247 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); 248 249 /* 250 * blk_mq_unquiesce_queue() - counterpart of blk_mq_quiesce_queue() 251 * @q: request queue. 252 * 253 * This function recovers queue into the state before quiescing 254 * which is done by blk_mq_quiesce_queue. 255 */ 256 void blk_mq_unquiesce_queue(struct request_queue *q) 257 { 258 blk_queue_flag_clear(QUEUE_FLAG_QUIESCED, q); 259 260 /* dispatch requests which are inserted during quiescing */ 261 blk_mq_run_hw_queues(q, true); 262 } 263 EXPORT_SYMBOL_GPL(blk_mq_unquiesce_queue); 264 265 void blk_mq_wake_waiters(struct request_queue *q) 266 { 267 struct blk_mq_hw_ctx *hctx; 268 unsigned int i; 269 270 queue_for_each_hw_ctx(q, hctx, i) 271 if (blk_mq_hw_queue_mapped(hctx)) 272 blk_mq_tag_wakeup_all(hctx->tags, true); 273 } 274 275 /* 276 * Only need start/end time stamping if we have iostat or 277 * blk stats enabled, or using an IO scheduler. 278 */ 279 static inline bool blk_mq_need_time_stamp(struct request *rq) 280 { 281 return (rq->rq_flags & (RQF_IO_STAT | RQF_STATS)) || rq->q->elevator; 282 } 283 284 static struct request *blk_mq_rq_ctx_init(struct blk_mq_alloc_data *data, 285 unsigned int tag, u64 alloc_time_ns) 286 { 287 struct blk_mq_tags *tags = blk_mq_tags_from_data(data); 288 struct request *rq = tags->static_rqs[tag]; 289 290 if (data->q->elevator) { 291 rq->tag = BLK_MQ_NO_TAG; 292 rq->internal_tag = tag; 293 } else { 294 rq->tag = tag; 295 rq->internal_tag = BLK_MQ_NO_TAG; 296 } 297 298 /* csd/requeue_work/fifo_time is initialized before use */ 299 rq->q = data->q; 300 rq->mq_ctx = data->ctx; 301 rq->mq_hctx = data->hctx; 302 rq->rq_flags = 0; 303 rq->cmd_flags = data->cmd_flags; 304 if (data->flags & BLK_MQ_REQ_PM) 305 rq->rq_flags |= RQF_PM; 306 if (blk_queue_io_stat(data->q)) 307 rq->rq_flags |= RQF_IO_STAT; 308 INIT_LIST_HEAD(&rq->queuelist); 309 INIT_HLIST_NODE(&rq->hash); 310 RB_CLEAR_NODE(&rq->rb_node); 311 rq->rq_disk = NULL; 312 rq->part = NULL; 313 #ifdef CONFIG_BLK_RQ_ALLOC_TIME 314 rq->alloc_time_ns = alloc_time_ns; 315 #endif 316 if (blk_mq_need_time_stamp(rq)) 317 rq->start_time_ns = ktime_get_ns(); 318 else 319 rq->start_time_ns = 0; 320 rq->io_start_time_ns = 0; 321 rq->stats_sectors = 0; 322 rq->nr_phys_segments = 0; 323 #if defined(CONFIG_BLK_DEV_INTEGRITY) 324 rq->nr_integrity_segments = 0; 325 #endif 326 blk_crypto_rq_set_defaults(rq); 327 /* tag was already set */ 328 WRITE_ONCE(rq->deadline, 0); 329 330 rq->timeout = 0; 331 332 rq->end_io = NULL; 333 rq->end_io_data = NULL; 334 335 data->ctx->rq_dispatched[op_is_sync(data->cmd_flags)]++; 336 refcount_set(&rq->ref, 1); 337 338 if (!op_is_flush(data->cmd_flags)) { 339 struct elevator_queue *e = data->q->elevator; 340 341 rq->elv.icq = NULL; 342 if (e && e->type->ops.prepare_request) { 343 if (e->type->icq_cache) 344 blk_mq_sched_assign_ioc(rq); 345 346 e->type->ops.prepare_request(rq); 347 rq->rq_flags |= RQF_ELVPRIV; 348 } 349 } 350 351 data->hctx->queued++; 352 return rq; 353 } 354 355 static struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data) 356 { 357 struct request_queue *q = data->q; 358 struct elevator_queue *e = q->elevator; 359 u64 alloc_time_ns = 0; 360 unsigned int tag; 361 362 /* alloc_time includes depth and tag waits */ 363 if (blk_queue_rq_alloc_time(q)) 364 alloc_time_ns = ktime_get_ns(); 365 366 if (data->cmd_flags & REQ_NOWAIT) 367 data->flags |= BLK_MQ_REQ_NOWAIT; 368 369 if (e) { 370 /* 371 * Flush/passthrough requests are special and go directly to the 372 * dispatch list. Don't include reserved tags in the 373 * limiting, as it isn't useful. 374 */ 375 if (!op_is_flush(data->cmd_flags) && 376 !blk_op_is_passthrough(data->cmd_flags) && 377 e->type->ops.limit_depth && 378 !(data->flags & BLK_MQ_REQ_RESERVED)) 379 e->type->ops.limit_depth(data->cmd_flags, data); 380 } 381 382 retry: 383 data->ctx = blk_mq_get_ctx(q); 384 data->hctx = blk_mq_map_queue(q, data->cmd_flags, data->ctx); 385 if (!e) 386 blk_mq_tag_busy(data->hctx); 387 388 /* 389 * Waiting allocations only fail because of an inactive hctx. In that 390 * case just retry the hctx assignment and tag allocation as CPU hotplug 391 * should have migrated us to an online CPU by now. 392 */ 393 tag = blk_mq_get_tag(data); 394 if (tag == BLK_MQ_NO_TAG) { 395 if (data->flags & BLK_MQ_REQ_NOWAIT) 396 return NULL; 397 398 /* 399 * Give up the CPU and sleep for a random short time to ensure 400 * that thread using a realtime scheduling class are migrated 401 * off the CPU, and thus off the hctx that is going away. 402 */ 403 msleep(3); 404 goto retry; 405 } 406 return blk_mq_rq_ctx_init(data, tag, alloc_time_ns); 407 } 408 409 struct request *blk_mq_alloc_request(struct request_queue *q, unsigned int op, 410 blk_mq_req_flags_t flags) 411 { 412 struct blk_mq_alloc_data data = { 413 .q = q, 414 .flags = flags, 415 .cmd_flags = op, 416 }; 417 struct request *rq; 418 int ret; 419 420 ret = blk_queue_enter(q, flags); 421 if (ret) 422 return ERR_PTR(ret); 423 424 rq = __blk_mq_alloc_request(&data); 425 if (!rq) 426 goto out_queue_exit; 427 rq->__data_len = 0; 428 rq->__sector = (sector_t) -1; 429 rq->bio = rq->biotail = NULL; 430 return rq; 431 out_queue_exit: 432 blk_queue_exit(q); 433 return ERR_PTR(-EWOULDBLOCK); 434 } 435 EXPORT_SYMBOL(blk_mq_alloc_request); 436 437 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, 438 unsigned int op, blk_mq_req_flags_t flags, unsigned int hctx_idx) 439 { 440 struct blk_mq_alloc_data data = { 441 .q = q, 442 .flags = flags, 443 .cmd_flags = op, 444 }; 445 u64 alloc_time_ns = 0; 446 unsigned int cpu; 447 unsigned int tag; 448 int ret; 449 450 /* alloc_time includes depth and tag waits */ 451 if (blk_queue_rq_alloc_time(q)) 452 alloc_time_ns = ktime_get_ns(); 453 454 /* 455 * If the tag allocator sleeps we could get an allocation for a 456 * different hardware context. No need to complicate the low level 457 * allocator for this for the rare use case of a command tied to 458 * a specific queue. 459 */ 460 if (WARN_ON_ONCE(!(flags & (BLK_MQ_REQ_NOWAIT | BLK_MQ_REQ_RESERVED)))) 461 return ERR_PTR(-EINVAL); 462 463 if (hctx_idx >= q->nr_hw_queues) 464 return ERR_PTR(-EIO); 465 466 ret = blk_queue_enter(q, flags); 467 if (ret) 468 return ERR_PTR(ret); 469 470 /* 471 * Check if the hardware context is actually mapped to anything. 472 * If not tell the caller that it should skip this queue. 473 */ 474 ret = -EXDEV; 475 data.hctx = q->queue_hw_ctx[hctx_idx]; 476 if (!blk_mq_hw_queue_mapped(data.hctx)) 477 goto out_queue_exit; 478 cpu = cpumask_first_and(data.hctx->cpumask, cpu_online_mask); 479 data.ctx = __blk_mq_get_ctx(q, cpu); 480 481 if (!q->elevator) 482 blk_mq_tag_busy(data.hctx); 483 484 ret = -EWOULDBLOCK; 485 tag = blk_mq_get_tag(&data); 486 if (tag == BLK_MQ_NO_TAG) 487 goto out_queue_exit; 488 return blk_mq_rq_ctx_init(&data, tag, alloc_time_ns); 489 490 out_queue_exit: 491 blk_queue_exit(q); 492 return ERR_PTR(ret); 493 } 494 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); 495 496 static void __blk_mq_free_request(struct request *rq) 497 { 498 struct request_queue *q = rq->q; 499 struct blk_mq_ctx *ctx = rq->mq_ctx; 500 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 501 const int sched_tag = rq->internal_tag; 502 503 blk_crypto_free_request(rq); 504 blk_pm_mark_last_busy(rq); 505 rq->mq_hctx = NULL; 506 if (rq->tag != BLK_MQ_NO_TAG) 507 blk_mq_put_tag(hctx->tags, ctx, rq->tag); 508 if (sched_tag != BLK_MQ_NO_TAG) 509 blk_mq_put_tag(hctx->sched_tags, ctx, sched_tag); 510 blk_mq_sched_restart(hctx); 511 blk_queue_exit(q); 512 } 513 514 void blk_mq_free_request(struct request *rq) 515 { 516 struct request_queue *q = rq->q; 517 struct elevator_queue *e = q->elevator; 518 struct blk_mq_ctx *ctx = rq->mq_ctx; 519 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 520 521 if (rq->rq_flags & RQF_ELVPRIV) { 522 if (e && e->type->ops.finish_request) 523 e->type->ops.finish_request(rq); 524 if (rq->elv.icq) { 525 put_io_context(rq->elv.icq->ioc); 526 rq->elv.icq = NULL; 527 } 528 } 529 530 ctx->rq_completed[rq_is_sync(rq)]++; 531 if (rq->rq_flags & RQF_MQ_INFLIGHT) 532 __blk_mq_dec_active_requests(hctx); 533 534 if (unlikely(laptop_mode && !blk_rq_is_passthrough(rq))) 535 laptop_io_completion(q->disk->bdi); 536 537 rq_qos_done(q, rq); 538 539 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 540 if (refcount_dec_and_test(&rq->ref)) 541 __blk_mq_free_request(rq); 542 } 543 EXPORT_SYMBOL_GPL(blk_mq_free_request); 544 545 inline void __blk_mq_end_request(struct request *rq, blk_status_t error) 546 { 547 u64 now = 0; 548 549 if (blk_mq_need_time_stamp(rq)) 550 now = ktime_get_ns(); 551 552 if (rq->rq_flags & RQF_STATS) { 553 blk_mq_poll_stats_start(rq->q); 554 blk_stat_add(rq, now); 555 } 556 557 blk_mq_sched_completed_request(rq, now); 558 559 blk_account_io_done(rq, now); 560 561 if (rq->end_io) { 562 rq_qos_done(rq->q, rq); 563 rq->end_io(rq, error); 564 } else { 565 blk_mq_free_request(rq); 566 } 567 } 568 EXPORT_SYMBOL(__blk_mq_end_request); 569 570 void blk_mq_end_request(struct request *rq, blk_status_t error) 571 { 572 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 573 BUG(); 574 __blk_mq_end_request(rq, error); 575 } 576 EXPORT_SYMBOL(blk_mq_end_request); 577 578 static void blk_complete_reqs(struct llist_head *list) 579 { 580 struct llist_node *entry = llist_reverse_order(llist_del_all(list)); 581 struct request *rq, *next; 582 583 llist_for_each_entry_safe(rq, next, entry, ipi_list) 584 rq->q->mq_ops->complete(rq); 585 } 586 587 static __latent_entropy void blk_done_softirq(struct softirq_action *h) 588 { 589 blk_complete_reqs(this_cpu_ptr(&blk_cpu_done)); 590 } 591 592 static int blk_softirq_cpu_dead(unsigned int cpu) 593 { 594 blk_complete_reqs(&per_cpu(blk_cpu_done, cpu)); 595 return 0; 596 } 597 598 static void __blk_mq_complete_request_remote(void *data) 599 { 600 __raise_softirq_irqoff(BLOCK_SOFTIRQ); 601 } 602 603 static inline bool blk_mq_complete_need_ipi(struct request *rq) 604 { 605 int cpu = raw_smp_processor_id(); 606 607 if (!IS_ENABLED(CONFIG_SMP) || 608 !test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) 609 return false; 610 /* 611 * With force threaded interrupts enabled, raising softirq from an SMP 612 * function call will always result in waking the ksoftirqd thread. 613 * This is probably worse than completing the request on a different 614 * cache domain. 615 */ 616 if (force_irqthreads()) 617 return false; 618 619 /* same CPU or cache domain? Complete locally */ 620 if (cpu == rq->mq_ctx->cpu || 621 (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags) && 622 cpus_share_cache(cpu, rq->mq_ctx->cpu))) 623 return false; 624 625 /* don't try to IPI to an offline CPU */ 626 return cpu_online(rq->mq_ctx->cpu); 627 } 628 629 static void blk_mq_complete_send_ipi(struct request *rq) 630 { 631 struct llist_head *list; 632 unsigned int cpu; 633 634 cpu = rq->mq_ctx->cpu; 635 list = &per_cpu(blk_cpu_done, cpu); 636 if (llist_add(&rq->ipi_list, list)) { 637 INIT_CSD(&rq->csd, __blk_mq_complete_request_remote, rq); 638 smp_call_function_single_async(cpu, &rq->csd); 639 } 640 } 641 642 static void blk_mq_raise_softirq(struct request *rq) 643 { 644 struct llist_head *list; 645 646 preempt_disable(); 647 list = this_cpu_ptr(&blk_cpu_done); 648 if (llist_add(&rq->ipi_list, list)) 649 raise_softirq(BLOCK_SOFTIRQ); 650 preempt_enable(); 651 } 652 653 bool blk_mq_complete_request_remote(struct request *rq) 654 { 655 WRITE_ONCE(rq->state, MQ_RQ_COMPLETE); 656 657 /* 658 * For a polled request, always complete locallly, it's pointless 659 * to redirect the completion. 660 */ 661 if (rq->cmd_flags & REQ_HIPRI) 662 return false; 663 664 if (blk_mq_complete_need_ipi(rq)) { 665 blk_mq_complete_send_ipi(rq); 666 return true; 667 } 668 669 if (rq->q->nr_hw_queues == 1) { 670 blk_mq_raise_softirq(rq); 671 return true; 672 } 673 return false; 674 } 675 EXPORT_SYMBOL_GPL(blk_mq_complete_request_remote); 676 677 /** 678 * blk_mq_complete_request - end I/O on a request 679 * @rq: the request being processed 680 * 681 * Description: 682 * Complete a request by scheduling the ->complete_rq operation. 683 **/ 684 void blk_mq_complete_request(struct request *rq) 685 { 686 if (!blk_mq_complete_request_remote(rq)) 687 rq->q->mq_ops->complete(rq); 688 } 689 EXPORT_SYMBOL(blk_mq_complete_request); 690 691 static void hctx_unlock(struct blk_mq_hw_ctx *hctx, int srcu_idx) 692 __releases(hctx->srcu) 693 { 694 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) 695 rcu_read_unlock(); 696 else 697 srcu_read_unlock(hctx->srcu, srcu_idx); 698 } 699 700 static void hctx_lock(struct blk_mq_hw_ctx *hctx, int *srcu_idx) 701 __acquires(hctx->srcu) 702 { 703 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 704 /* shut up gcc false positive */ 705 *srcu_idx = 0; 706 rcu_read_lock(); 707 } else 708 *srcu_idx = srcu_read_lock(hctx->srcu); 709 } 710 711 /** 712 * blk_mq_start_request - Start processing a request 713 * @rq: Pointer to request to be started 714 * 715 * Function used by device drivers to notify the block layer that a request 716 * is going to be processed now, so blk layer can do proper initializations 717 * such as starting the timeout timer. 718 */ 719 void blk_mq_start_request(struct request *rq) 720 { 721 struct request_queue *q = rq->q; 722 723 trace_block_rq_issue(rq); 724 725 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { 726 rq->io_start_time_ns = ktime_get_ns(); 727 rq->stats_sectors = blk_rq_sectors(rq); 728 rq->rq_flags |= RQF_STATS; 729 rq_qos_issue(q, rq); 730 } 731 732 WARN_ON_ONCE(blk_mq_rq_state(rq) != MQ_RQ_IDLE); 733 734 blk_add_timer(rq); 735 WRITE_ONCE(rq->state, MQ_RQ_IN_FLIGHT); 736 737 #ifdef CONFIG_BLK_DEV_INTEGRITY 738 if (blk_integrity_rq(rq) && req_op(rq) == REQ_OP_WRITE) 739 q->integrity.profile->prepare_fn(rq); 740 #endif 741 } 742 EXPORT_SYMBOL(blk_mq_start_request); 743 744 static void __blk_mq_requeue_request(struct request *rq) 745 { 746 struct request_queue *q = rq->q; 747 748 blk_mq_put_driver_tag(rq); 749 750 trace_block_rq_requeue(rq); 751 rq_qos_requeue(q, rq); 752 753 if (blk_mq_request_started(rq)) { 754 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 755 rq->rq_flags &= ~RQF_TIMED_OUT; 756 } 757 } 758 759 void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list) 760 { 761 __blk_mq_requeue_request(rq); 762 763 /* this request will be re-inserted to io scheduler queue */ 764 blk_mq_sched_requeue_request(rq); 765 766 BUG_ON(!list_empty(&rq->queuelist)); 767 blk_mq_add_to_requeue_list(rq, true, kick_requeue_list); 768 } 769 EXPORT_SYMBOL(blk_mq_requeue_request); 770 771 static void blk_mq_requeue_work(struct work_struct *work) 772 { 773 struct request_queue *q = 774 container_of(work, struct request_queue, requeue_work.work); 775 LIST_HEAD(rq_list); 776 struct request *rq, *next; 777 778 spin_lock_irq(&q->requeue_lock); 779 list_splice_init(&q->requeue_list, &rq_list); 780 spin_unlock_irq(&q->requeue_lock); 781 782 list_for_each_entry_safe(rq, next, &rq_list, queuelist) { 783 if (!(rq->rq_flags & (RQF_SOFTBARRIER | RQF_DONTPREP))) 784 continue; 785 786 rq->rq_flags &= ~RQF_SOFTBARRIER; 787 list_del_init(&rq->queuelist); 788 /* 789 * If RQF_DONTPREP, rq has contained some driver specific 790 * data, so insert it to hctx dispatch list to avoid any 791 * merge. 792 */ 793 if (rq->rq_flags & RQF_DONTPREP) 794 blk_mq_request_bypass_insert(rq, false, false); 795 else 796 blk_mq_sched_insert_request(rq, true, false, false); 797 } 798 799 while (!list_empty(&rq_list)) { 800 rq = list_entry(rq_list.next, struct request, queuelist); 801 list_del_init(&rq->queuelist); 802 blk_mq_sched_insert_request(rq, false, false, false); 803 } 804 805 blk_mq_run_hw_queues(q, false); 806 } 807 808 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head, 809 bool kick_requeue_list) 810 { 811 struct request_queue *q = rq->q; 812 unsigned long flags; 813 814 /* 815 * We abuse this flag that is otherwise used by the I/O scheduler to 816 * request head insertion from the workqueue. 817 */ 818 BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); 819 820 spin_lock_irqsave(&q->requeue_lock, flags); 821 if (at_head) { 822 rq->rq_flags |= RQF_SOFTBARRIER; 823 list_add(&rq->queuelist, &q->requeue_list); 824 } else { 825 list_add_tail(&rq->queuelist, &q->requeue_list); 826 } 827 spin_unlock_irqrestore(&q->requeue_lock, flags); 828 829 if (kick_requeue_list) 830 blk_mq_kick_requeue_list(q); 831 } 832 833 void blk_mq_kick_requeue_list(struct request_queue *q) 834 { 835 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 0); 836 } 837 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 838 839 void blk_mq_delay_kick_requeue_list(struct request_queue *q, 840 unsigned long msecs) 841 { 842 kblockd_mod_delayed_work_on(WORK_CPU_UNBOUND, &q->requeue_work, 843 msecs_to_jiffies(msecs)); 844 } 845 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); 846 847 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 848 { 849 if (tag < tags->nr_tags) { 850 prefetch(tags->rqs[tag]); 851 return tags->rqs[tag]; 852 } 853 854 return NULL; 855 } 856 EXPORT_SYMBOL(blk_mq_tag_to_rq); 857 858 static bool blk_mq_rq_inflight(struct blk_mq_hw_ctx *hctx, struct request *rq, 859 void *priv, bool reserved) 860 { 861 /* 862 * If we find a request that isn't idle and the queue matches, 863 * we know the queue is busy. Return false to stop the iteration. 864 */ 865 if (blk_mq_request_started(rq) && rq->q == hctx->queue) { 866 bool *busy = priv; 867 868 *busy = true; 869 return false; 870 } 871 872 return true; 873 } 874 875 bool blk_mq_queue_inflight(struct request_queue *q) 876 { 877 bool busy = false; 878 879 blk_mq_queue_tag_busy_iter(q, blk_mq_rq_inflight, &busy); 880 return busy; 881 } 882 EXPORT_SYMBOL_GPL(blk_mq_queue_inflight); 883 884 static void blk_mq_rq_timed_out(struct request *req, bool reserved) 885 { 886 req->rq_flags |= RQF_TIMED_OUT; 887 if (req->q->mq_ops->timeout) { 888 enum blk_eh_timer_return ret; 889 890 ret = req->q->mq_ops->timeout(req, reserved); 891 if (ret == BLK_EH_DONE) 892 return; 893 WARN_ON_ONCE(ret != BLK_EH_RESET_TIMER); 894 } 895 896 blk_add_timer(req); 897 } 898 899 static bool blk_mq_req_expired(struct request *rq, unsigned long *next) 900 { 901 unsigned long deadline; 902 903 if (blk_mq_rq_state(rq) != MQ_RQ_IN_FLIGHT) 904 return false; 905 if (rq->rq_flags & RQF_TIMED_OUT) 906 return false; 907 908 deadline = READ_ONCE(rq->deadline); 909 if (time_after_eq(jiffies, deadline)) 910 return true; 911 912 if (*next == 0) 913 *next = deadline; 914 else if (time_after(*next, deadline)) 915 *next = deadline; 916 return false; 917 } 918 919 void blk_mq_put_rq_ref(struct request *rq) 920 { 921 if (is_flush_rq(rq)) 922 rq->end_io(rq, 0); 923 else if (refcount_dec_and_test(&rq->ref)) 924 __blk_mq_free_request(rq); 925 } 926 927 static bool blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 928 struct request *rq, void *priv, bool reserved) 929 { 930 unsigned long *next = priv; 931 932 /* 933 * blk_mq_queue_tag_busy_iter() has locked the request, so it cannot 934 * be reallocated underneath the timeout handler's processing, then 935 * the expire check is reliable. If the request is not expired, then 936 * it was completed and reallocated as a new request after returning 937 * from blk_mq_check_expired(). 938 */ 939 if (blk_mq_req_expired(rq, next)) 940 blk_mq_rq_timed_out(rq, reserved); 941 return true; 942 } 943 944 static void blk_mq_timeout_work(struct work_struct *work) 945 { 946 struct request_queue *q = 947 container_of(work, struct request_queue, timeout_work); 948 unsigned long next = 0; 949 struct blk_mq_hw_ctx *hctx; 950 int i; 951 952 /* A deadlock might occur if a request is stuck requiring a 953 * timeout at the same time a queue freeze is waiting 954 * completion, since the timeout code would not be able to 955 * acquire the queue reference here. 956 * 957 * That's why we don't use blk_queue_enter here; instead, we use 958 * percpu_ref_tryget directly, because we need to be able to 959 * obtain a reference even in the short window between the queue 960 * starting to freeze, by dropping the first reference in 961 * blk_freeze_queue_start, and the moment the last request is 962 * consumed, marked by the instant q_usage_counter reaches 963 * zero. 964 */ 965 if (!percpu_ref_tryget(&q->q_usage_counter)) 966 return; 967 968 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &next); 969 970 if (next != 0) { 971 mod_timer(&q->timeout, next); 972 } else { 973 /* 974 * Request timeouts are handled as a forward rolling timer. If 975 * we end up here it means that no requests are pending and 976 * also that no request has been pending for a while. Mark 977 * each hctx as idle. 978 */ 979 queue_for_each_hw_ctx(q, hctx, i) { 980 /* the hctx may be unmapped, so check it here */ 981 if (blk_mq_hw_queue_mapped(hctx)) 982 blk_mq_tag_idle(hctx); 983 } 984 } 985 blk_queue_exit(q); 986 } 987 988 struct flush_busy_ctx_data { 989 struct blk_mq_hw_ctx *hctx; 990 struct list_head *list; 991 }; 992 993 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) 994 { 995 struct flush_busy_ctx_data *flush_data = data; 996 struct blk_mq_hw_ctx *hctx = flush_data->hctx; 997 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 998 enum hctx_type type = hctx->type; 999 1000 spin_lock(&ctx->lock); 1001 list_splice_tail_init(&ctx->rq_lists[type], flush_data->list); 1002 sbitmap_clear_bit(sb, bitnr); 1003 spin_unlock(&ctx->lock); 1004 return true; 1005 } 1006 1007 /* 1008 * Process software queues that have been marked busy, splicing them 1009 * to the for-dispatch 1010 */ 1011 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 1012 { 1013 struct flush_busy_ctx_data data = { 1014 .hctx = hctx, 1015 .list = list, 1016 }; 1017 1018 sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); 1019 } 1020 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); 1021 1022 struct dispatch_rq_data { 1023 struct blk_mq_hw_ctx *hctx; 1024 struct request *rq; 1025 }; 1026 1027 static bool dispatch_rq_from_ctx(struct sbitmap *sb, unsigned int bitnr, 1028 void *data) 1029 { 1030 struct dispatch_rq_data *dispatch_data = data; 1031 struct blk_mq_hw_ctx *hctx = dispatch_data->hctx; 1032 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 1033 enum hctx_type type = hctx->type; 1034 1035 spin_lock(&ctx->lock); 1036 if (!list_empty(&ctx->rq_lists[type])) { 1037 dispatch_data->rq = list_entry_rq(ctx->rq_lists[type].next); 1038 list_del_init(&dispatch_data->rq->queuelist); 1039 if (list_empty(&ctx->rq_lists[type])) 1040 sbitmap_clear_bit(sb, bitnr); 1041 } 1042 spin_unlock(&ctx->lock); 1043 1044 return !dispatch_data->rq; 1045 } 1046 1047 struct request *blk_mq_dequeue_from_ctx(struct blk_mq_hw_ctx *hctx, 1048 struct blk_mq_ctx *start) 1049 { 1050 unsigned off = start ? start->index_hw[hctx->type] : 0; 1051 struct dispatch_rq_data data = { 1052 .hctx = hctx, 1053 .rq = NULL, 1054 }; 1055 1056 __sbitmap_for_each_set(&hctx->ctx_map, off, 1057 dispatch_rq_from_ctx, &data); 1058 1059 return data.rq; 1060 } 1061 1062 static inline unsigned int queued_to_index(unsigned int queued) 1063 { 1064 if (!queued) 1065 return 0; 1066 1067 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); 1068 } 1069 1070 static bool __blk_mq_get_driver_tag(struct request *rq) 1071 { 1072 struct sbitmap_queue *bt = rq->mq_hctx->tags->bitmap_tags; 1073 unsigned int tag_offset = rq->mq_hctx->tags->nr_reserved_tags; 1074 int tag; 1075 1076 blk_mq_tag_busy(rq->mq_hctx); 1077 1078 if (blk_mq_tag_is_reserved(rq->mq_hctx->sched_tags, rq->internal_tag)) { 1079 bt = rq->mq_hctx->tags->breserved_tags; 1080 tag_offset = 0; 1081 } else { 1082 if (!hctx_may_queue(rq->mq_hctx, bt)) 1083 return false; 1084 } 1085 1086 tag = __sbitmap_queue_get(bt); 1087 if (tag == BLK_MQ_NO_TAG) 1088 return false; 1089 1090 rq->tag = tag + tag_offset; 1091 return true; 1092 } 1093 1094 bool blk_mq_get_driver_tag(struct request *rq) 1095 { 1096 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 1097 1098 if (rq->tag == BLK_MQ_NO_TAG && !__blk_mq_get_driver_tag(rq)) 1099 return false; 1100 1101 if ((hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED) && 1102 !(rq->rq_flags & RQF_MQ_INFLIGHT)) { 1103 rq->rq_flags |= RQF_MQ_INFLIGHT; 1104 __blk_mq_inc_active_requests(hctx); 1105 } 1106 hctx->tags->rqs[rq->tag] = rq; 1107 return true; 1108 } 1109 1110 static int blk_mq_dispatch_wake(wait_queue_entry_t *wait, unsigned mode, 1111 int flags, void *key) 1112 { 1113 struct blk_mq_hw_ctx *hctx; 1114 1115 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); 1116 1117 spin_lock(&hctx->dispatch_wait_lock); 1118 if (!list_empty(&wait->entry)) { 1119 struct sbitmap_queue *sbq; 1120 1121 list_del_init(&wait->entry); 1122 sbq = hctx->tags->bitmap_tags; 1123 atomic_dec(&sbq->ws_active); 1124 } 1125 spin_unlock(&hctx->dispatch_wait_lock); 1126 1127 blk_mq_run_hw_queue(hctx, true); 1128 return 1; 1129 } 1130 1131 /* 1132 * Mark us waiting for a tag. For shared tags, this involves hooking us into 1133 * the tag wakeups. For non-shared tags, we can simply mark us needing a 1134 * restart. For both cases, take care to check the condition again after 1135 * marking us as waiting. 1136 */ 1137 static bool blk_mq_mark_tag_wait(struct blk_mq_hw_ctx *hctx, 1138 struct request *rq) 1139 { 1140 struct sbitmap_queue *sbq = hctx->tags->bitmap_tags; 1141 struct wait_queue_head *wq; 1142 wait_queue_entry_t *wait; 1143 bool ret; 1144 1145 if (!(hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { 1146 blk_mq_sched_mark_restart_hctx(hctx); 1147 1148 /* 1149 * It's possible that a tag was freed in the window between the 1150 * allocation failure and adding the hardware queue to the wait 1151 * queue. 1152 * 1153 * Don't clear RESTART here, someone else could have set it. 1154 * At most this will cost an extra queue run. 1155 */ 1156 return blk_mq_get_driver_tag(rq); 1157 } 1158 1159 wait = &hctx->dispatch_wait; 1160 if (!list_empty_careful(&wait->entry)) 1161 return false; 1162 1163 wq = &bt_wait_ptr(sbq, hctx)->wait; 1164 1165 spin_lock_irq(&wq->lock); 1166 spin_lock(&hctx->dispatch_wait_lock); 1167 if (!list_empty(&wait->entry)) { 1168 spin_unlock(&hctx->dispatch_wait_lock); 1169 spin_unlock_irq(&wq->lock); 1170 return false; 1171 } 1172 1173 atomic_inc(&sbq->ws_active); 1174 wait->flags &= ~WQ_FLAG_EXCLUSIVE; 1175 __add_wait_queue(wq, wait); 1176 1177 /* 1178 * It's possible that a tag was freed in the window between the 1179 * allocation failure and adding the hardware queue to the wait 1180 * queue. 1181 */ 1182 ret = blk_mq_get_driver_tag(rq); 1183 if (!ret) { 1184 spin_unlock(&hctx->dispatch_wait_lock); 1185 spin_unlock_irq(&wq->lock); 1186 return false; 1187 } 1188 1189 /* 1190 * We got a tag, remove ourselves from the wait queue to ensure 1191 * someone else gets the wakeup. 1192 */ 1193 list_del_init(&wait->entry); 1194 atomic_dec(&sbq->ws_active); 1195 spin_unlock(&hctx->dispatch_wait_lock); 1196 spin_unlock_irq(&wq->lock); 1197 1198 return true; 1199 } 1200 1201 #define BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT 8 1202 #define BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR 4 1203 /* 1204 * Update dispatch busy with the Exponential Weighted Moving Average(EWMA): 1205 * - EWMA is one simple way to compute running average value 1206 * - weight(7/8 and 1/8) is applied so that it can decrease exponentially 1207 * - take 4 as factor for avoiding to get too small(0) result, and this 1208 * factor doesn't matter because EWMA decreases exponentially 1209 */ 1210 static void blk_mq_update_dispatch_busy(struct blk_mq_hw_ctx *hctx, bool busy) 1211 { 1212 unsigned int ewma; 1213 1214 ewma = hctx->dispatch_busy; 1215 1216 if (!ewma && !busy) 1217 return; 1218 1219 ewma *= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT - 1; 1220 if (busy) 1221 ewma += 1 << BLK_MQ_DISPATCH_BUSY_EWMA_FACTOR; 1222 ewma /= BLK_MQ_DISPATCH_BUSY_EWMA_WEIGHT; 1223 1224 hctx->dispatch_busy = ewma; 1225 } 1226 1227 #define BLK_MQ_RESOURCE_DELAY 3 /* ms units */ 1228 1229 static void blk_mq_handle_dev_resource(struct request *rq, 1230 struct list_head *list) 1231 { 1232 struct request *next = 1233 list_first_entry_or_null(list, struct request, queuelist); 1234 1235 /* 1236 * If an I/O scheduler has been configured and we got a driver tag for 1237 * the next request already, free it. 1238 */ 1239 if (next) 1240 blk_mq_put_driver_tag(next); 1241 1242 list_add(&rq->queuelist, list); 1243 __blk_mq_requeue_request(rq); 1244 } 1245 1246 static void blk_mq_handle_zone_resource(struct request *rq, 1247 struct list_head *zone_list) 1248 { 1249 /* 1250 * If we end up here it is because we cannot dispatch a request to a 1251 * specific zone due to LLD level zone-write locking or other zone 1252 * related resource not being available. In this case, set the request 1253 * aside in zone_list for retrying it later. 1254 */ 1255 list_add(&rq->queuelist, zone_list); 1256 __blk_mq_requeue_request(rq); 1257 } 1258 1259 enum prep_dispatch { 1260 PREP_DISPATCH_OK, 1261 PREP_DISPATCH_NO_TAG, 1262 PREP_DISPATCH_NO_BUDGET, 1263 }; 1264 1265 static enum prep_dispatch blk_mq_prep_dispatch_rq(struct request *rq, 1266 bool need_budget) 1267 { 1268 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 1269 int budget_token = -1; 1270 1271 if (need_budget) { 1272 budget_token = blk_mq_get_dispatch_budget(rq->q); 1273 if (budget_token < 0) { 1274 blk_mq_put_driver_tag(rq); 1275 return PREP_DISPATCH_NO_BUDGET; 1276 } 1277 blk_mq_set_rq_budget_token(rq, budget_token); 1278 } 1279 1280 if (!blk_mq_get_driver_tag(rq)) { 1281 /* 1282 * The initial allocation attempt failed, so we need to 1283 * rerun the hardware queue when a tag is freed. The 1284 * waitqueue takes care of that. If the queue is run 1285 * before we add this entry back on the dispatch list, 1286 * we'll re-run it below. 1287 */ 1288 if (!blk_mq_mark_tag_wait(hctx, rq)) { 1289 /* 1290 * All budgets not got from this function will be put 1291 * together during handling partial dispatch 1292 */ 1293 if (need_budget) 1294 blk_mq_put_dispatch_budget(rq->q, budget_token); 1295 return PREP_DISPATCH_NO_TAG; 1296 } 1297 } 1298 1299 return PREP_DISPATCH_OK; 1300 } 1301 1302 /* release all allocated budgets before calling to blk_mq_dispatch_rq_list */ 1303 static void blk_mq_release_budgets(struct request_queue *q, 1304 struct list_head *list) 1305 { 1306 struct request *rq; 1307 1308 list_for_each_entry(rq, list, queuelist) { 1309 int budget_token = blk_mq_get_rq_budget_token(rq); 1310 1311 if (budget_token >= 0) 1312 blk_mq_put_dispatch_budget(q, budget_token); 1313 } 1314 } 1315 1316 /* 1317 * Returns true if we did some work AND can potentially do more. 1318 */ 1319 bool blk_mq_dispatch_rq_list(struct blk_mq_hw_ctx *hctx, struct list_head *list, 1320 unsigned int nr_budgets) 1321 { 1322 enum prep_dispatch prep; 1323 struct request_queue *q = hctx->queue; 1324 struct request *rq, *nxt; 1325 int errors, queued; 1326 blk_status_t ret = BLK_STS_OK; 1327 LIST_HEAD(zone_list); 1328 1329 if (list_empty(list)) 1330 return false; 1331 1332 /* 1333 * Now process all the entries, sending them to the driver. 1334 */ 1335 errors = queued = 0; 1336 do { 1337 struct blk_mq_queue_data bd; 1338 1339 rq = list_first_entry(list, struct request, queuelist); 1340 1341 WARN_ON_ONCE(hctx != rq->mq_hctx); 1342 prep = blk_mq_prep_dispatch_rq(rq, !nr_budgets); 1343 if (prep != PREP_DISPATCH_OK) 1344 break; 1345 1346 list_del_init(&rq->queuelist); 1347 1348 bd.rq = rq; 1349 1350 /* 1351 * Flag last if we have no more requests, or if we have more 1352 * but can't assign a driver tag to it. 1353 */ 1354 if (list_empty(list)) 1355 bd.last = true; 1356 else { 1357 nxt = list_first_entry(list, struct request, queuelist); 1358 bd.last = !blk_mq_get_driver_tag(nxt); 1359 } 1360 1361 /* 1362 * once the request is queued to lld, no need to cover the 1363 * budget any more 1364 */ 1365 if (nr_budgets) 1366 nr_budgets--; 1367 ret = q->mq_ops->queue_rq(hctx, &bd); 1368 switch (ret) { 1369 case BLK_STS_OK: 1370 queued++; 1371 break; 1372 case BLK_STS_RESOURCE: 1373 case BLK_STS_DEV_RESOURCE: 1374 blk_mq_handle_dev_resource(rq, list); 1375 goto out; 1376 case BLK_STS_ZONE_RESOURCE: 1377 /* 1378 * Move the request to zone_list and keep going through 1379 * the dispatch list to find more requests the drive can 1380 * accept. 1381 */ 1382 blk_mq_handle_zone_resource(rq, &zone_list); 1383 break; 1384 default: 1385 errors++; 1386 blk_mq_end_request(rq, ret); 1387 } 1388 } while (!list_empty(list)); 1389 out: 1390 if (!list_empty(&zone_list)) 1391 list_splice_tail_init(&zone_list, list); 1392 1393 hctx->dispatched[queued_to_index(queued)]++; 1394 1395 /* If we didn't flush the entire list, we could have told the driver 1396 * there was more coming, but that turned out to be a lie. 1397 */ 1398 if ((!list_empty(list) || errors) && q->mq_ops->commit_rqs && queued) 1399 q->mq_ops->commit_rqs(hctx); 1400 /* 1401 * Any items that need requeuing? Stuff them into hctx->dispatch, 1402 * that is where we will continue on next queue run. 1403 */ 1404 if (!list_empty(list)) { 1405 bool needs_restart; 1406 /* For non-shared tags, the RESTART check will suffice */ 1407 bool no_tag = prep == PREP_DISPATCH_NO_TAG && 1408 (hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED); 1409 bool no_budget_avail = prep == PREP_DISPATCH_NO_BUDGET; 1410 1411 if (nr_budgets) 1412 blk_mq_release_budgets(q, list); 1413 1414 spin_lock(&hctx->lock); 1415 list_splice_tail_init(list, &hctx->dispatch); 1416 spin_unlock(&hctx->lock); 1417 1418 /* 1419 * Order adding requests to hctx->dispatch and checking 1420 * SCHED_RESTART flag. The pair of this smp_mb() is the one 1421 * in blk_mq_sched_restart(). Avoid restart code path to 1422 * miss the new added requests to hctx->dispatch, meantime 1423 * SCHED_RESTART is observed here. 1424 */ 1425 smp_mb(); 1426 1427 /* 1428 * If SCHED_RESTART was set by the caller of this function and 1429 * it is no longer set that means that it was cleared by another 1430 * thread and hence that a queue rerun is needed. 1431 * 1432 * If 'no_tag' is set, that means that we failed getting 1433 * a driver tag with an I/O scheduler attached. If our dispatch 1434 * waitqueue is no longer active, ensure that we run the queue 1435 * AFTER adding our entries back to the list. 1436 * 1437 * If no I/O scheduler has been configured it is possible that 1438 * the hardware queue got stopped and restarted before requests 1439 * were pushed back onto the dispatch list. Rerun the queue to 1440 * avoid starvation. Notes: 1441 * - blk_mq_run_hw_queue() checks whether or not a queue has 1442 * been stopped before rerunning a queue. 1443 * - Some but not all block drivers stop a queue before 1444 * returning BLK_STS_RESOURCE. Two exceptions are scsi-mq 1445 * and dm-rq. 1446 * 1447 * If driver returns BLK_STS_RESOURCE and SCHED_RESTART 1448 * bit is set, run queue after a delay to avoid IO stalls 1449 * that could otherwise occur if the queue is idle. We'll do 1450 * similar if we couldn't get budget and SCHED_RESTART is set. 1451 */ 1452 needs_restart = blk_mq_sched_needs_restart(hctx); 1453 if (!needs_restart || 1454 (no_tag && list_empty_careful(&hctx->dispatch_wait.entry))) 1455 blk_mq_run_hw_queue(hctx, true); 1456 else if (needs_restart && (ret == BLK_STS_RESOURCE || 1457 no_budget_avail)) 1458 blk_mq_delay_run_hw_queue(hctx, BLK_MQ_RESOURCE_DELAY); 1459 1460 blk_mq_update_dispatch_busy(hctx, true); 1461 return false; 1462 } else 1463 blk_mq_update_dispatch_busy(hctx, false); 1464 1465 return (queued + errors) != 0; 1466 } 1467 1468 /** 1469 * __blk_mq_run_hw_queue - Run a hardware queue. 1470 * @hctx: Pointer to the hardware queue to run. 1471 * 1472 * Send pending requests to the hardware. 1473 */ 1474 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 1475 { 1476 int srcu_idx; 1477 1478 /* 1479 * We can't run the queue inline with ints disabled. Ensure that 1480 * we catch bad users of this early. 1481 */ 1482 WARN_ON_ONCE(in_interrupt()); 1483 1484 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); 1485 1486 hctx_lock(hctx, &srcu_idx); 1487 blk_mq_sched_dispatch_requests(hctx); 1488 hctx_unlock(hctx, srcu_idx); 1489 } 1490 1491 static inline int blk_mq_first_mapped_cpu(struct blk_mq_hw_ctx *hctx) 1492 { 1493 int cpu = cpumask_first_and(hctx->cpumask, cpu_online_mask); 1494 1495 if (cpu >= nr_cpu_ids) 1496 cpu = cpumask_first(hctx->cpumask); 1497 return cpu; 1498 } 1499 1500 /* 1501 * It'd be great if the workqueue API had a way to pass 1502 * in a mask and had some smarts for more clever placement. 1503 * For now we just round-robin here, switching for every 1504 * BLK_MQ_CPU_WORK_BATCH queued items. 1505 */ 1506 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 1507 { 1508 bool tried = false; 1509 int next_cpu = hctx->next_cpu; 1510 1511 if (hctx->queue->nr_hw_queues == 1) 1512 return WORK_CPU_UNBOUND; 1513 1514 if (--hctx->next_cpu_batch <= 0) { 1515 select_cpu: 1516 next_cpu = cpumask_next_and(next_cpu, hctx->cpumask, 1517 cpu_online_mask); 1518 if (next_cpu >= nr_cpu_ids) 1519 next_cpu = blk_mq_first_mapped_cpu(hctx); 1520 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1521 } 1522 1523 /* 1524 * Do unbound schedule if we can't find a online CPU for this hctx, 1525 * and it should only happen in the path of handling CPU DEAD. 1526 */ 1527 if (!cpu_online(next_cpu)) { 1528 if (!tried) { 1529 tried = true; 1530 goto select_cpu; 1531 } 1532 1533 /* 1534 * Make sure to re-select CPU next time once after CPUs 1535 * in hctx->cpumask become online again. 1536 */ 1537 hctx->next_cpu = next_cpu; 1538 hctx->next_cpu_batch = 1; 1539 return WORK_CPU_UNBOUND; 1540 } 1541 1542 hctx->next_cpu = next_cpu; 1543 return next_cpu; 1544 } 1545 1546 /** 1547 * __blk_mq_delay_run_hw_queue - Run (or schedule to run) a hardware queue. 1548 * @hctx: Pointer to the hardware queue to run. 1549 * @async: If we want to run the queue asynchronously. 1550 * @msecs: Milliseconds of delay to wait before running the queue. 1551 * 1552 * If !@async, try to run the queue now. Else, run the queue asynchronously and 1553 * with a delay of @msecs. 1554 */ 1555 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, 1556 unsigned long msecs) 1557 { 1558 if (unlikely(blk_mq_hctx_stopped(hctx))) 1559 return; 1560 1561 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { 1562 int cpu = get_cpu(); 1563 if (cpumask_test_cpu(cpu, hctx->cpumask)) { 1564 __blk_mq_run_hw_queue(hctx); 1565 put_cpu(); 1566 return; 1567 } 1568 1569 put_cpu(); 1570 } 1571 1572 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), &hctx->run_work, 1573 msecs_to_jiffies(msecs)); 1574 } 1575 1576 /** 1577 * blk_mq_delay_run_hw_queue - Run a hardware queue asynchronously. 1578 * @hctx: Pointer to the hardware queue to run. 1579 * @msecs: Milliseconds of delay to wait before running the queue. 1580 * 1581 * Run a hardware queue asynchronously with a delay of @msecs. 1582 */ 1583 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1584 { 1585 __blk_mq_delay_run_hw_queue(hctx, true, msecs); 1586 } 1587 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); 1588 1589 /** 1590 * blk_mq_run_hw_queue - Start to run a hardware queue. 1591 * @hctx: Pointer to the hardware queue to run. 1592 * @async: If we want to run the queue asynchronously. 1593 * 1594 * Check if the request queue is not in a quiesced state and if there are 1595 * pending requests to be sent. If this is true, run the queue to send requests 1596 * to hardware. 1597 */ 1598 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1599 { 1600 int srcu_idx; 1601 bool need_run; 1602 1603 /* 1604 * When queue is quiesced, we may be switching io scheduler, or 1605 * updating nr_hw_queues, or other things, and we can't run queue 1606 * any more, even __blk_mq_hctx_has_pending() can't be called safely. 1607 * 1608 * And queue will be rerun in blk_mq_unquiesce_queue() if it is 1609 * quiesced. 1610 */ 1611 hctx_lock(hctx, &srcu_idx); 1612 need_run = !blk_queue_quiesced(hctx->queue) && 1613 blk_mq_hctx_has_pending(hctx); 1614 hctx_unlock(hctx, srcu_idx); 1615 1616 if (need_run) 1617 __blk_mq_delay_run_hw_queue(hctx, async, 0); 1618 } 1619 EXPORT_SYMBOL(blk_mq_run_hw_queue); 1620 1621 /* 1622 * Is the request queue handled by an IO scheduler that does not respect 1623 * hardware queues when dispatching? 1624 */ 1625 static bool blk_mq_has_sqsched(struct request_queue *q) 1626 { 1627 struct elevator_queue *e = q->elevator; 1628 1629 if (e && e->type->ops.dispatch_request && 1630 !(e->type->elevator_features & ELEVATOR_F_MQ_AWARE)) 1631 return true; 1632 return false; 1633 } 1634 1635 /* 1636 * Return prefered queue to dispatch from (if any) for non-mq aware IO 1637 * scheduler. 1638 */ 1639 static struct blk_mq_hw_ctx *blk_mq_get_sq_hctx(struct request_queue *q) 1640 { 1641 struct blk_mq_hw_ctx *hctx; 1642 1643 /* 1644 * If the IO scheduler does not respect hardware queues when 1645 * dispatching, we just don't bother with multiple HW queues and 1646 * dispatch from hctx for the current CPU since running multiple queues 1647 * just causes lock contention inside the scheduler and pointless cache 1648 * bouncing. 1649 */ 1650 hctx = blk_mq_map_queue_type(q, HCTX_TYPE_DEFAULT, 1651 raw_smp_processor_id()); 1652 if (!blk_mq_hctx_stopped(hctx)) 1653 return hctx; 1654 return NULL; 1655 } 1656 1657 /** 1658 * blk_mq_run_hw_queues - Run all hardware queues in a request queue. 1659 * @q: Pointer to the request queue to run. 1660 * @async: If we want to run the queue asynchronously. 1661 */ 1662 void blk_mq_run_hw_queues(struct request_queue *q, bool async) 1663 { 1664 struct blk_mq_hw_ctx *hctx, *sq_hctx; 1665 int i; 1666 1667 sq_hctx = NULL; 1668 if (blk_mq_has_sqsched(q)) 1669 sq_hctx = blk_mq_get_sq_hctx(q); 1670 queue_for_each_hw_ctx(q, hctx, i) { 1671 if (blk_mq_hctx_stopped(hctx)) 1672 continue; 1673 /* 1674 * Dispatch from this hctx either if there's no hctx preferred 1675 * by IO scheduler or if it has requests that bypass the 1676 * scheduler. 1677 */ 1678 if (!sq_hctx || sq_hctx == hctx || 1679 !list_empty_careful(&hctx->dispatch)) 1680 blk_mq_run_hw_queue(hctx, async); 1681 } 1682 } 1683 EXPORT_SYMBOL(blk_mq_run_hw_queues); 1684 1685 /** 1686 * blk_mq_delay_run_hw_queues - Run all hardware queues asynchronously. 1687 * @q: Pointer to the request queue to run. 1688 * @msecs: Milliseconds of delay to wait before running the queues. 1689 */ 1690 void blk_mq_delay_run_hw_queues(struct request_queue *q, unsigned long msecs) 1691 { 1692 struct blk_mq_hw_ctx *hctx, *sq_hctx; 1693 int i; 1694 1695 sq_hctx = NULL; 1696 if (blk_mq_has_sqsched(q)) 1697 sq_hctx = blk_mq_get_sq_hctx(q); 1698 queue_for_each_hw_ctx(q, hctx, i) { 1699 if (blk_mq_hctx_stopped(hctx)) 1700 continue; 1701 /* 1702 * Dispatch from this hctx either if there's no hctx preferred 1703 * by IO scheduler or if it has requests that bypass the 1704 * scheduler. 1705 */ 1706 if (!sq_hctx || sq_hctx == hctx || 1707 !list_empty_careful(&hctx->dispatch)) 1708 blk_mq_delay_run_hw_queue(hctx, msecs); 1709 } 1710 } 1711 EXPORT_SYMBOL(blk_mq_delay_run_hw_queues); 1712 1713 /** 1714 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped 1715 * @q: request queue. 1716 * 1717 * The caller is responsible for serializing this function against 1718 * blk_mq_{start,stop}_hw_queue(). 1719 */ 1720 bool blk_mq_queue_stopped(struct request_queue *q) 1721 { 1722 struct blk_mq_hw_ctx *hctx; 1723 int i; 1724 1725 queue_for_each_hw_ctx(q, hctx, i) 1726 if (blk_mq_hctx_stopped(hctx)) 1727 return true; 1728 1729 return false; 1730 } 1731 EXPORT_SYMBOL(blk_mq_queue_stopped); 1732 1733 /* 1734 * This function is often used for pausing .queue_rq() by driver when 1735 * there isn't enough resource or some conditions aren't satisfied, and 1736 * BLK_STS_RESOURCE is usually returned. 1737 * 1738 * We do not guarantee that dispatch can be drained or blocked 1739 * after blk_mq_stop_hw_queue() returns. Please use 1740 * blk_mq_quiesce_queue() for that requirement. 1741 */ 1742 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 1743 { 1744 cancel_delayed_work(&hctx->run_work); 1745 1746 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 1747 } 1748 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 1749 1750 /* 1751 * This function is often used for pausing .queue_rq() by driver when 1752 * there isn't enough resource or some conditions aren't satisfied, and 1753 * BLK_STS_RESOURCE is usually returned. 1754 * 1755 * We do not guarantee that dispatch can be drained or blocked 1756 * after blk_mq_stop_hw_queues() returns. Please use 1757 * blk_mq_quiesce_queue() for that requirement. 1758 */ 1759 void blk_mq_stop_hw_queues(struct request_queue *q) 1760 { 1761 struct blk_mq_hw_ctx *hctx; 1762 int i; 1763 1764 queue_for_each_hw_ctx(q, hctx, i) 1765 blk_mq_stop_hw_queue(hctx); 1766 } 1767 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 1768 1769 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 1770 { 1771 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1772 1773 blk_mq_run_hw_queue(hctx, false); 1774 } 1775 EXPORT_SYMBOL(blk_mq_start_hw_queue); 1776 1777 void blk_mq_start_hw_queues(struct request_queue *q) 1778 { 1779 struct blk_mq_hw_ctx *hctx; 1780 int i; 1781 1782 queue_for_each_hw_ctx(q, hctx, i) 1783 blk_mq_start_hw_queue(hctx); 1784 } 1785 EXPORT_SYMBOL(blk_mq_start_hw_queues); 1786 1787 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1788 { 1789 if (!blk_mq_hctx_stopped(hctx)) 1790 return; 1791 1792 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1793 blk_mq_run_hw_queue(hctx, async); 1794 } 1795 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); 1796 1797 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 1798 { 1799 struct blk_mq_hw_ctx *hctx; 1800 int i; 1801 1802 queue_for_each_hw_ctx(q, hctx, i) 1803 blk_mq_start_stopped_hw_queue(hctx, async); 1804 } 1805 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 1806 1807 static void blk_mq_run_work_fn(struct work_struct *work) 1808 { 1809 struct blk_mq_hw_ctx *hctx; 1810 1811 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); 1812 1813 /* 1814 * If we are stopped, don't run the queue. 1815 */ 1816 if (blk_mq_hctx_stopped(hctx)) 1817 return; 1818 1819 __blk_mq_run_hw_queue(hctx); 1820 } 1821 1822 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, 1823 struct request *rq, 1824 bool at_head) 1825 { 1826 struct blk_mq_ctx *ctx = rq->mq_ctx; 1827 enum hctx_type type = hctx->type; 1828 1829 lockdep_assert_held(&ctx->lock); 1830 1831 trace_block_rq_insert(rq); 1832 1833 if (at_head) 1834 list_add(&rq->queuelist, &ctx->rq_lists[type]); 1835 else 1836 list_add_tail(&rq->queuelist, &ctx->rq_lists[type]); 1837 } 1838 1839 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 1840 bool at_head) 1841 { 1842 struct blk_mq_ctx *ctx = rq->mq_ctx; 1843 1844 lockdep_assert_held(&ctx->lock); 1845 1846 __blk_mq_insert_req_list(hctx, rq, at_head); 1847 blk_mq_hctx_mark_pending(hctx, ctx); 1848 } 1849 1850 /** 1851 * blk_mq_request_bypass_insert - Insert a request at dispatch list. 1852 * @rq: Pointer to request to be inserted. 1853 * @at_head: true if the request should be inserted at the head of the list. 1854 * @run_queue: If we should run the hardware queue after inserting the request. 1855 * 1856 * Should only be used carefully, when the caller knows we want to 1857 * bypass a potential IO scheduler on the target device. 1858 */ 1859 void blk_mq_request_bypass_insert(struct request *rq, bool at_head, 1860 bool run_queue) 1861 { 1862 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 1863 1864 spin_lock(&hctx->lock); 1865 if (at_head) 1866 list_add(&rq->queuelist, &hctx->dispatch); 1867 else 1868 list_add_tail(&rq->queuelist, &hctx->dispatch); 1869 spin_unlock(&hctx->lock); 1870 1871 if (run_queue) 1872 blk_mq_run_hw_queue(hctx, false); 1873 } 1874 1875 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 1876 struct list_head *list) 1877 1878 { 1879 struct request *rq; 1880 enum hctx_type type = hctx->type; 1881 1882 /* 1883 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1884 * offline now 1885 */ 1886 list_for_each_entry(rq, list, queuelist) { 1887 BUG_ON(rq->mq_ctx != ctx); 1888 trace_block_rq_insert(rq); 1889 } 1890 1891 spin_lock(&ctx->lock); 1892 list_splice_tail_init(list, &ctx->rq_lists[type]); 1893 blk_mq_hctx_mark_pending(hctx, ctx); 1894 spin_unlock(&ctx->lock); 1895 } 1896 1897 static int plug_rq_cmp(void *priv, const struct list_head *a, 1898 const struct list_head *b) 1899 { 1900 struct request *rqa = container_of(a, struct request, queuelist); 1901 struct request *rqb = container_of(b, struct request, queuelist); 1902 1903 if (rqa->mq_ctx != rqb->mq_ctx) 1904 return rqa->mq_ctx > rqb->mq_ctx; 1905 if (rqa->mq_hctx != rqb->mq_hctx) 1906 return rqa->mq_hctx > rqb->mq_hctx; 1907 1908 return blk_rq_pos(rqa) > blk_rq_pos(rqb); 1909 } 1910 1911 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1912 { 1913 LIST_HEAD(list); 1914 1915 if (list_empty(&plug->mq_list)) 1916 return; 1917 list_splice_init(&plug->mq_list, &list); 1918 1919 if (plug->rq_count > 2 && plug->multiple_queues) 1920 list_sort(NULL, &list, plug_rq_cmp); 1921 1922 plug->rq_count = 0; 1923 1924 do { 1925 struct list_head rq_list; 1926 struct request *rq, *head_rq = list_entry_rq(list.next); 1927 struct list_head *pos = &head_rq->queuelist; /* skip first */ 1928 struct blk_mq_hw_ctx *this_hctx = head_rq->mq_hctx; 1929 struct blk_mq_ctx *this_ctx = head_rq->mq_ctx; 1930 unsigned int depth = 1; 1931 1932 list_for_each_continue(pos, &list) { 1933 rq = list_entry_rq(pos); 1934 BUG_ON(!rq->q); 1935 if (rq->mq_hctx != this_hctx || rq->mq_ctx != this_ctx) 1936 break; 1937 depth++; 1938 } 1939 1940 list_cut_before(&rq_list, &list, pos); 1941 trace_block_unplug(head_rq->q, depth, !from_schedule); 1942 blk_mq_sched_insert_requests(this_hctx, this_ctx, &rq_list, 1943 from_schedule); 1944 } while(!list_empty(&list)); 1945 } 1946 1947 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio, 1948 unsigned int nr_segs) 1949 { 1950 int err; 1951 1952 if (bio->bi_opf & REQ_RAHEAD) 1953 rq->cmd_flags |= REQ_FAILFAST_MASK; 1954 1955 rq->__sector = bio->bi_iter.bi_sector; 1956 rq->write_hint = bio->bi_write_hint; 1957 blk_rq_bio_prep(rq, bio, nr_segs); 1958 1959 /* This can't fail, since GFP_NOIO includes __GFP_DIRECT_RECLAIM. */ 1960 err = blk_crypto_rq_bio_prep(rq, bio, GFP_NOIO); 1961 WARN_ON_ONCE(err); 1962 1963 blk_account_io_start(rq); 1964 } 1965 1966 static blk_status_t __blk_mq_issue_directly(struct blk_mq_hw_ctx *hctx, 1967 struct request *rq, 1968 blk_qc_t *cookie, bool last) 1969 { 1970 struct request_queue *q = rq->q; 1971 struct blk_mq_queue_data bd = { 1972 .rq = rq, 1973 .last = last, 1974 }; 1975 blk_qc_t new_cookie; 1976 blk_status_t ret; 1977 1978 new_cookie = request_to_qc_t(hctx, rq); 1979 1980 /* 1981 * For OK queue, we are done. For error, caller may kill it. 1982 * Any other error (busy), just add it to our list as we 1983 * previously would have done. 1984 */ 1985 ret = q->mq_ops->queue_rq(hctx, &bd); 1986 switch (ret) { 1987 case BLK_STS_OK: 1988 blk_mq_update_dispatch_busy(hctx, false); 1989 *cookie = new_cookie; 1990 break; 1991 case BLK_STS_RESOURCE: 1992 case BLK_STS_DEV_RESOURCE: 1993 blk_mq_update_dispatch_busy(hctx, true); 1994 __blk_mq_requeue_request(rq); 1995 break; 1996 default: 1997 blk_mq_update_dispatch_busy(hctx, false); 1998 *cookie = BLK_QC_T_NONE; 1999 break; 2000 } 2001 2002 return ret; 2003 } 2004 2005 static blk_status_t __blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 2006 struct request *rq, 2007 blk_qc_t *cookie, 2008 bool bypass_insert, bool last) 2009 { 2010 struct request_queue *q = rq->q; 2011 bool run_queue = true; 2012 int budget_token; 2013 2014 /* 2015 * RCU or SRCU read lock is needed before checking quiesced flag. 2016 * 2017 * When queue is stopped or quiesced, ignore 'bypass_insert' from 2018 * blk_mq_request_issue_directly(), and return BLK_STS_OK to caller, 2019 * and avoid driver to try to dispatch again. 2020 */ 2021 if (blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)) { 2022 run_queue = false; 2023 bypass_insert = false; 2024 goto insert; 2025 } 2026 2027 if (q->elevator && !bypass_insert) 2028 goto insert; 2029 2030 budget_token = blk_mq_get_dispatch_budget(q); 2031 if (budget_token < 0) 2032 goto insert; 2033 2034 blk_mq_set_rq_budget_token(rq, budget_token); 2035 2036 if (!blk_mq_get_driver_tag(rq)) { 2037 blk_mq_put_dispatch_budget(q, budget_token); 2038 goto insert; 2039 } 2040 2041 return __blk_mq_issue_directly(hctx, rq, cookie, last); 2042 insert: 2043 if (bypass_insert) 2044 return BLK_STS_RESOURCE; 2045 2046 blk_mq_sched_insert_request(rq, false, run_queue, false); 2047 2048 return BLK_STS_OK; 2049 } 2050 2051 /** 2052 * blk_mq_try_issue_directly - Try to send a request directly to device driver. 2053 * @hctx: Pointer of the associated hardware queue. 2054 * @rq: Pointer to request to be sent. 2055 * @cookie: Request queue cookie. 2056 * 2057 * If the device has enough resources to accept a new request now, send the 2058 * request directly to device driver. Else, insert at hctx->dispatch queue, so 2059 * we can try send it another time in the future. Requests inserted at this 2060 * queue have higher priority. 2061 */ 2062 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 2063 struct request *rq, blk_qc_t *cookie) 2064 { 2065 blk_status_t ret; 2066 int srcu_idx; 2067 2068 might_sleep_if(hctx->flags & BLK_MQ_F_BLOCKING); 2069 2070 hctx_lock(hctx, &srcu_idx); 2071 2072 ret = __blk_mq_try_issue_directly(hctx, rq, cookie, false, true); 2073 if (ret == BLK_STS_RESOURCE || ret == BLK_STS_DEV_RESOURCE) 2074 blk_mq_request_bypass_insert(rq, false, true); 2075 else if (ret != BLK_STS_OK) 2076 blk_mq_end_request(rq, ret); 2077 2078 hctx_unlock(hctx, srcu_idx); 2079 } 2080 2081 blk_status_t blk_mq_request_issue_directly(struct request *rq, bool last) 2082 { 2083 blk_status_t ret; 2084 int srcu_idx; 2085 blk_qc_t unused_cookie; 2086 struct blk_mq_hw_ctx *hctx = rq->mq_hctx; 2087 2088 hctx_lock(hctx, &srcu_idx); 2089 ret = __blk_mq_try_issue_directly(hctx, rq, &unused_cookie, true, last); 2090 hctx_unlock(hctx, srcu_idx); 2091 2092 return ret; 2093 } 2094 2095 void blk_mq_try_issue_list_directly(struct blk_mq_hw_ctx *hctx, 2096 struct list_head *list) 2097 { 2098 int queued = 0; 2099 int errors = 0; 2100 2101 while (!list_empty(list)) { 2102 blk_status_t ret; 2103 struct request *rq = list_first_entry(list, struct request, 2104 queuelist); 2105 2106 list_del_init(&rq->queuelist); 2107 ret = blk_mq_request_issue_directly(rq, list_empty(list)); 2108 if (ret != BLK_STS_OK) { 2109 if (ret == BLK_STS_RESOURCE || 2110 ret == BLK_STS_DEV_RESOURCE) { 2111 blk_mq_request_bypass_insert(rq, false, 2112 list_empty(list)); 2113 break; 2114 } 2115 blk_mq_end_request(rq, ret); 2116 errors++; 2117 } else 2118 queued++; 2119 } 2120 2121 /* 2122 * If we didn't flush the entire list, we could have told 2123 * the driver there was more coming, but that turned out to 2124 * be a lie. 2125 */ 2126 if ((!list_empty(list) || errors) && 2127 hctx->queue->mq_ops->commit_rqs && queued) 2128 hctx->queue->mq_ops->commit_rqs(hctx); 2129 } 2130 2131 static void blk_add_rq_to_plug(struct blk_plug *plug, struct request *rq) 2132 { 2133 list_add_tail(&rq->queuelist, &plug->mq_list); 2134 plug->rq_count++; 2135 if (!plug->multiple_queues && !list_is_singular(&plug->mq_list)) { 2136 struct request *tmp; 2137 2138 tmp = list_first_entry(&plug->mq_list, struct request, 2139 queuelist); 2140 if (tmp->q != rq->q) 2141 plug->multiple_queues = true; 2142 } 2143 } 2144 2145 /* 2146 * Allow 4x BLK_MAX_REQUEST_COUNT requests on plug queue for multiple 2147 * queues. This is important for md arrays to benefit from merging 2148 * requests. 2149 */ 2150 static inline unsigned short blk_plug_max_rq_count(struct blk_plug *plug) 2151 { 2152 if (plug->multiple_queues) 2153 return BLK_MAX_REQUEST_COUNT * 4; 2154 return BLK_MAX_REQUEST_COUNT; 2155 } 2156 2157 /** 2158 * blk_mq_submit_bio - Create and send a request to block device. 2159 * @bio: Bio pointer. 2160 * 2161 * Builds up a request structure from @q and @bio and send to the device. The 2162 * request may not be queued directly to hardware if: 2163 * * This request can be merged with another one 2164 * * We want to place request at plug queue for possible future merging 2165 * * There is an IO scheduler active at this queue 2166 * 2167 * It will not queue the request if there is an error with the bio, or at the 2168 * request creation. 2169 * 2170 * Returns: Request queue cookie. 2171 */ 2172 blk_qc_t blk_mq_submit_bio(struct bio *bio) 2173 { 2174 struct request_queue *q = bio->bi_bdev->bd_disk->queue; 2175 const int is_sync = op_is_sync(bio->bi_opf); 2176 const int is_flush_fua = op_is_flush(bio->bi_opf); 2177 struct blk_mq_alloc_data data = { 2178 .q = q, 2179 }; 2180 struct request *rq; 2181 struct blk_plug *plug; 2182 struct request *same_queue_rq = NULL; 2183 unsigned int nr_segs; 2184 blk_qc_t cookie; 2185 blk_status_t ret; 2186 bool hipri; 2187 2188 blk_queue_bounce(q, &bio); 2189 __blk_queue_split(&bio, &nr_segs); 2190 2191 if (!bio_integrity_prep(bio)) 2192 goto queue_exit; 2193 2194 if (!is_flush_fua && !blk_queue_nomerges(q) && 2195 blk_attempt_plug_merge(q, bio, nr_segs, &same_queue_rq)) 2196 goto queue_exit; 2197 2198 if (blk_mq_sched_bio_merge(q, bio, nr_segs)) 2199 goto queue_exit; 2200 2201 rq_qos_throttle(q, bio); 2202 2203 hipri = bio->bi_opf & REQ_HIPRI; 2204 2205 data.cmd_flags = bio->bi_opf; 2206 rq = __blk_mq_alloc_request(&data); 2207 if (unlikely(!rq)) { 2208 rq_qos_cleanup(q, bio); 2209 if (bio->bi_opf & REQ_NOWAIT) 2210 bio_wouldblock_error(bio); 2211 goto queue_exit; 2212 } 2213 2214 trace_block_getrq(bio); 2215 2216 rq_qos_track(q, rq, bio); 2217 2218 cookie = request_to_qc_t(data.hctx, rq); 2219 2220 blk_mq_bio_to_request(rq, bio, nr_segs); 2221 2222 ret = blk_crypto_init_request(rq); 2223 if (ret != BLK_STS_OK) { 2224 bio->bi_status = ret; 2225 bio_endio(bio); 2226 blk_mq_free_request(rq); 2227 return BLK_QC_T_NONE; 2228 } 2229 2230 plug = blk_mq_plug(q, bio); 2231 if (unlikely(is_flush_fua)) { 2232 /* Bypass scheduler for flush requests */ 2233 blk_insert_flush(rq); 2234 blk_mq_run_hw_queue(data.hctx, true); 2235 } else if (plug && (q->nr_hw_queues == 1 || 2236 blk_mq_is_sbitmap_shared(rq->mq_hctx->flags) || 2237 q->mq_ops->commit_rqs || !blk_queue_nonrot(q))) { 2238 /* 2239 * Use plugging if we have a ->commit_rqs() hook as well, as 2240 * we know the driver uses bd->last in a smart fashion. 2241 * 2242 * Use normal plugging if this disk is slow HDD, as sequential 2243 * IO may benefit a lot from plug merging. 2244 */ 2245 unsigned int request_count = plug->rq_count; 2246 struct request *last = NULL; 2247 2248 if (!request_count) 2249 trace_block_plug(q); 2250 else 2251 last = list_entry_rq(plug->mq_list.prev); 2252 2253 if (request_count >= blk_plug_max_rq_count(plug) || (last && 2254 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { 2255 blk_flush_plug_list(plug, false); 2256 trace_block_plug(q); 2257 } 2258 2259 blk_add_rq_to_plug(plug, rq); 2260 } else if (q->elevator) { 2261 /* Insert the request at the IO scheduler queue */ 2262 blk_mq_sched_insert_request(rq, false, true, true); 2263 } else if (plug && !blk_queue_nomerges(q)) { 2264 /* 2265 * We do limited plugging. If the bio can be merged, do that. 2266 * Otherwise the existing request in the plug list will be 2267 * issued. So the plug list will have one request at most 2268 * The plug list might get flushed before this. If that happens, 2269 * the plug list is empty, and same_queue_rq is invalid. 2270 */ 2271 if (list_empty(&plug->mq_list)) 2272 same_queue_rq = NULL; 2273 if (same_queue_rq) { 2274 list_del_init(&same_queue_rq->queuelist); 2275 plug->rq_count--; 2276 } 2277 blk_add_rq_to_plug(plug, rq); 2278 trace_block_plug(q); 2279 2280 if (same_queue_rq) { 2281 data.hctx = same_queue_rq->mq_hctx; 2282 trace_block_unplug(q, 1, true); 2283 blk_mq_try_issue_directly(data.hctx, same_queue_rq, 2284 &cookie); 2285 } 2286 } else if ((q->nr_hw_queues > 1 && is_sync) || 2287 !data.hctx->dispatch_busy) { 2288 /* 2289 * There is no scheduler and we can try to send directly 2290 * to the hardware. 2291 */ 2292 blk_mq_try_issue_directly(data.hctx, rq, &cookie); 2293 } else { 2294 /* Default case. */ 2295 blk_mq_sched_insert_request(rq, false, true, true); 2296 } 2297 2298 if (!hipri) 2299 return BLK_QC_T_NONE; 2300 return cookie; 2301 queue_exit: 2302 blk_queue_exit(q); 2303 return BLK_QC_T_NONE; 2304 } 2305 2306 static size_t order_to_size(unsigned int order) 2307 { 2308 return (size_t)PAGE_SIZE << order; 2309 } 2310 2311 /* called before freeing request pool in @tags */ 2312 static void blk_mq_clear_rq_mapping(struct blk_mq_tag_set *set, 2313 struct blk_mq_tags *tags, unsigned int hctx_idx) 2314 { 2315 struct blk_mq_tags *drv_tags = set->tags[hctx_idx]; 2316 struct page *page; 2317 unsigned long flags; 2318 2319 list_for_each_entry(page, &tags->page_list, lru) { 2320 unsigned long start = (unsigned long)page_address(page); 2321 unsigned long end = start + order_to_size(page->private); 2322 int i; 2323 2324 for (i = 0; i < set->queue_depth; i++) { 2325 struct request *rq = drv_tags->rqs[i]; 2326 unsigned long rq_addr = (unsigned long)rq; 2327 2328 if (rq_addr >= start && rq_addr < end) { 2329 WARN_ON_ONCE(refcount_read(&rq->ref) != 0); 2330 cmpxchg(&drv_tags->rqs[i], rq, NULL); 2331 } 2332 } 2333 } 2334 2335 /* 2336 * Wait until all pending iteration is done. 2337 * 2338 * Request reference is cleared and it is guaranteed to be observed 2339 * after the ->lock is released. 2340 */ 2341 spin_lock_irqsave(&drv_tags->lock, flags); 2342 spin_unlock_irqrestore(&drv_tags->lock, flags); 2343 } 2344 2345 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 2346 unsigned int hctx_idx) 2347 { 2348 struct page *page; 2349 2350 if (tags->rqs && set->ops->exit_request) { 2351 int i; 2352 2353 for (i = 0; i < tags->nr_tags; i++) { 2354 struct request *rq = tags->static_rqs[i]; 2355 2356 if (!rq) 2357 continue; 2358 set->ops->exit_request(set, rq, hctx_idx); 2359 tags->static_rqs[i] = NULL; 2360 } 2361 } 2362 2363 blk_mq_clear_rq_mapping(set, tags, hctx_idx); 2364 2365 while (!list_empty(&tags->page_list)) { 2366 page = list_first_entry(&tags->page_list, struct page, lru); 2367 list_del_init(&page->lru); 2368 /* 2369 * Remove kmemleak object previously allocated in 2370 * blk_mq_alloc_rqs(). 2371 */ 2372 kmemleak_free(page_address(page)); 2373 __free_pages(page, page->private); 2374 } 2375 } 2376 2377 void blk_mq_free_rq_map(struct blk_mq_tags *tags, unsigned int flags) 2378 { 2379 kfree(tags->rqs); 2380 tags->rqs = NULL; 2381 kfree(tags->static_rqs); 2382 tags->static_rqs = NULL; 2383 2384 blk_mq_free_tags(tags, flags); 2385 } 2386 2387 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, 2388 unsigned int hctx_idx, 2389 unsigned int nr_tags, 2390 unsigned int reserved_tags, 2391 unsigned int flags) 2392 { 2393 struct blk_mq_tags *tags; 2394 int node; 2395 2396 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); 2397 if (node == NUMA_NO_NODE) 2398 node = set->numa_node; 2399 2400 tags = blk_mq_init_tags(nr_tags, reserved_tags, node, flags); 2401 if (!tags) 2402 return NULL; 2403 2404 tags->rqs = kcalloc_node(nr_tags, sizeof(struct request *), 2405 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2406 node); 2407 if (!tags->rqs) { 2408 blk_mq_free_tags(tags, flags); 2409 return NULL; 2410 } 2411 2412 tags->static_rqs = kcalloc_node(nr_tags, sizeof(struct request *), 2413 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 2414 node); 2415 if (!tags->static_rqs) { 2416 kfree(tags->rqs); 2417 blk_mq_free_tags(tags, flags); 2418 return NULL; 2419 } 2420 2421 return tags; 2422 } 2423 2424 static int blk_mq_init_request(struct blk_mq_tag_set *set, struct request *rq, 2425 unsigned int hctx_idx, int node) 2426 { 2427 int ret; 2428 2429 if (set->ops->init_request) { 2430 ret = set->ops->init_request(set, rq, hctx_idx, node); 2431 if (ret) 2432 return ret; 2433 } 2434 2435 WRITE_ONCE(rq->state, MQ_RQ_IDLE); 2436 return 0; 2437 } 2438 2439 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 2440 unsigned int hctx_idx, unsigned int depth) 2441 { 2442 unsigned int i, j, entries_per_page, max_order = 4; 2443 size_t rq_size, left; 2444 int node; 2445 2446 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], hctx_idx); 2447 if (node == NUMA_NO_NODE) 2448 node = set->numa_node; 2449 2450 INIT_LIST_HEAD(&tags->page_list); 2451 2452 /* 2453 * rq_size is the size of the request plus driver payload, rounded 2454 * to the cacheline size 2455 */ 2456 rq_size = round_up(sizeof(struct request) + set->cmd_size, 2457 cache_line_size()); 2458 left = rq_size * depth; 2459 2460 for (i = 0; i < depth; ) { 2461 int this_order = max_order; 2462 struct page *page; 2463 int to_do; 2464 void *p; 2465 2466 while (this_order && left < order_to_size(this_order - 1)) 2467 this_order--; 2468 2469 do { 2470 page = alloc_pages_node(node, 2471 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 2472 this_order); 2473 if (page) 2474 break; 2475 if (!this_order--) 2476 break; 2477 if (order_to_size(this_order) < rq_size) 2478 break; 2479 } while (1); 2480 2481 if (!page) 2482 goto fail; 2483 2484 page->private = this_order; 2485 list_add_tail(&page->lru, &tags->page_list); 2486 2487 p = page_address(page); 2488 /* 2489 * Allow kmemleak to scan these pages as they contain pointers 2490 * to additional allocations like via ops->init_request(). 2491 */ 2492 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); 2493 entries_per_page = order_to_size(this_order) / rq_size; 2494 to_do = min(entries_per_page, depth - i); 2495 left -= to_do * rq_size; 2496 for (j = 0; j < to_do; j++) { 2497 struct request *rq = p; 2498 2499 tags->static_rqs[i] = rq; 2500 if (blk_mq_init_request(set, rq, hctx_idx, node)) { 2501 tags->static_rqs[i] = NULL; 2502 goto fail; 2503 } 2504 2505 p += rq_size; 2506 i++; 2507 } 2508 } 2509 return 0; 2510 2511 fail: 2512 blk_mq_free_rqs(set, tags, hctx_idx); 2513 return -ENOMEM; 2514 } 2515 2516 struct rq_iter_data { 2517 struct blk_mq_hw_ctx *hctx; 2518 bool has_rq; 2519 }; 2520 2521 static bool blk_mq_has_request(struct request *rq, void *data, bool reserved) 2522 { 2523 struct rq_iter_data *iter_data = data; 2524 2525 if (rq->mq_hctx != iter_data->hctx) 2526 return true; 2527 iter_data->has_rq = true; 2528 return false; 2529 } 2530 2531 static bool blk_mq_hctx_has_requests(struct blk_mq_hw_ctx *hctx) 2532 { 2533 struct blk_mq_tags *tags = hctx->sched_tags ? 2534 hctx->sched_tags : hctx->tags; 2535 struct rq_iter_data data = { 2536 .hctx = hctx, 2537 }; 2538 2539 blk_mq_all_tag_iter(tags, blk_mq_has_request, &data); 2540 return data.has_rq; 2541 } 2542 2543 static inline bool blk_mq_last_cpu_in_hctx(unsigned int cpu, 2544 struct blk_mq_hw_ctx *hctx) 2545 { 2546 if (cpumask_next_and(-1, hctx->cpumask, cpu_online_mask) != cpu) 2547 return false; 2548 if (cpumask_next_and(cpu, hctx->cpumask, cpu_online_mask) < nr_cpu_ids) 2549 return false; 2550 return true; 2551 } 2552 2553 static int blk_mq_hctx_notify_offline(unsigned int cpu, struct hlist_node *node) 2554 { 2555 struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, 2556 struct blk_mq_hw_ctx, cpuhp_online); 2557 2558 if (!cpumask_test_cpu(cpu, hctx->cpumask) || 2559 !blk_mq_last_cpu_in_hctx(cpu, hctx)) 2560 return 0; 2561 2562 /* 2563 * Prevent new request from being allocated on the current hctx. 2564 * 2565 * The smp_mb__after_atomic() Pairs with the implied barrier in 2566 * test_and_set_bit_lock in sbitmap_get(). Ensures the inactive flag is 2567 * seen once we return from the tag allocator. 2568 */ 2569 set_bit(BLK_MQ_S_INACTIVE, &hctx->state); 2570 smp_mb__after_atomic(); 2571 2572 /* 2573 * Try to grab a reference to the queue and wait for any outstanding 2574 * requests. If we could not grab a reference the queue has been 2575 * frozen and there are no requests. 2576 */ 2577 if (percpu_ref_tryget(&hctx->queue->q_usage_counter)) { 2578 while (blk_mq_hctx_has_requests(hctx)) 2579 msleep(5); 2580 percpu_ref_put(&hctx->queue->q_usage_counter); 2581 } 2582 2583 return 0; 2584 } 2585 2586 static int blk_mq_hctx_notify_online(unsigned int cpu, struct hlist_node *node) 2587 { 2588 struct blk_mq_hw_ctx *hctx = hlist_entry_safe(node, 2589 struct blk_mq_hw_ctx, cpuhp_online); 2590 2591 if (cpumask_test_cpu(cpu, hctx->cpumask)) 2592 clear_bit(BLK_MQ_S_INACTIVE, &hctx->state); 2593 return 0; 2594 } 2595 2596 /* 2597 * 'cpu' is going away. splice any existing rq_list entries from this 2598 * software queue to the hw queue dispatch list, and ensure that it 2599 * gets run. 2600 */ 2601 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) 2602 { 2603 struct blk_mq_hw_ctx *hctx; 2604 struct blk_mq_ctx *ctx; 2605 LIST_HEAD(tmp); 2606 enum hctx_type type; 2607 2608 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 2609 if (!cpumask_test_cpu(cpu, hctx->cpumask)) 2610 return 0; 2611 2612 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 2613 type = hctx->type; 2614 2615 spin_lock(&ctx->lock); 2616 if (!list_empty(&ctx->rq_lists[type])) { 2617 list_splice_init(&ctx->rq_lists[type], &tmp); 2618 blk_mq_hctx_clear_pending(hctx, ctx); 2619 } 2620 spin_unlock(&ctx->lock); 2621 2622 if (list_empty(&tmp)) 2623 return 0; 2624 2625 spin_lock(&hctx->lock); 2626 list_splice_tail_init(&tmp, &hctx->dispatch); 2627 spin_unlock(&hctx->lock); 2628 2629 blk_mq_run_hw_queue(hctx, true); 2630 return 0; 2631 } 2632 2633 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 2634 { 2635 if (!(hctx->flags & BLK_MQ_F_STACKING)) 2636 cpuhp_state_remove_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, 2637 &hctx->cpuhp_online); 2638 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, 2639 &hctx->cpuhp_dead); 2640 } 2641 2642 /* 2643 * Before freeing hw queue, clearing the flush request reference in 2644 * tags->rqs[] for avoiding potential UAF. 2645 */ 2646 static void blk_mq_clear_flush_rq_mapping(struct blk_mq_tags *tags, 2647 unsigned int queue_depth, struct request *flush_rq) 2648 { 2649 int i; 2650 unsigned long flags; 2651 2652 /* The hw queue may not be mapped yet */ 2653 if (!tags) 2654 return; 2655 2656 WARN_ON_ONCE(refcount_read(&flush_rq->ref) != 0); 2657 2658 for (i = 0; i < queue_depth; i++) 2659 cmpxchg(&tags->rqs[i], flush_rq, NULL); 2660 2661 /* 2662 * Wait until all pending iteration is done. 2663 * 2664 * Request reference is cleared and it is guaranteed to be observed 2665 * after the ->lock is released. 2666 */ 2667 spin_lock_irqsave(&tags->lock, flags); 2668 spin_unlock_irqrestore(&tags->lock, flags); 2669 } 2670 2671 /* hctx->ctxs will be freed in queue's release handler */ 2672 static void blk_mq_exit_hctx(struct request_queue *q, 2673 struct blk_mq_tag_set *set, 2674 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 2675 { 2676 struct request *flush_rq = hctx->fq->flush_rq; 2677 2678 if (blk_mq_hw_queue_mapped(hctx)) 2679 blk_mq_tag_idle(hctx); 2680 2681 blk_mq_clear_flush_rq_mapping(set->tags[hctx_idx], 2682 set->queue_depth, flush_rq); 2683 if (set->ops->exit_request) 2684 set->ops->exit_request(set, flush_rq, hctx_idx); 2685 2686 if (set->ops->exit_hctx) 2687 set->ops->exit_hctx(hctx, hctx_idx); 2688 2689 blk_mq_remove_cpuhp(hctx); 2690 2691 spin_lock(&q->unused_hctx_lock); 2692 list_add(&hctx->hctx_list, &q->unused_hctx_list); 2693 spin_unlock(&q->unused_hctx_lock); 2694 } 2695 2696 static void blk_mq_exit_hw_queues(struct request_queue *q, 2697 struct blk_mq_tag_set *set, int nr_queue) 2698 { 2699 struct blk_mq_hw_ctx *hctx; 2700 unsigned int i; 2701 2702 queue_for_each_hw_ctx(q, hctx, i) { 2703 if (i == nr_queue) 2704 break; 2705 blk_mq_debugfs_unregister_hctx(hctx); 2706 blk_mq_exit_hctx(q, set, hctx, i); 2707 } 2708 } 2709 2710 static int blk_mq_hw_ctx_size(struct blk_mq_tag_set *tag_set) 2711 { 2712 int hw_ctx_size = sizeof(struct blk_mq_hw_ctx); 2713 2714 BUILD_BUG_ON(ALIGN(offsetof(struct blk_mq_hw_ctx, srcu), 2715 __alignof__(struct blk_mq_hw_ctx)) != 2716 sizeof(struct blk_mq_hw_ctx)); 2717 2718 if (tag_set->flags & BLK_MQ_F_BLOCKING) 2719 hw_ctx_size += sizeof(struct srcu_struct); 2720 2721 return hw_ctx_size; 2722 } 2723 2724 static int blk_mq_init_hctx(struct request_queue *q, 2725 struct blk_mq_tag_set *set, 2726 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 2727 { 2728 hctx->queue_num = hctx_idx; 2729 2730 if (!(hctx->flags & BLK_MQ_F_STACKING)) 2731 cpuhp_state_add_instance_nocalls(CPUHP_AP_BLK_MQ_ONLINE, 2732 &hctx->cpuhp_online); 2733 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); 2734 2735 hctx->tags = set->tags[hctx_idx]; 2736 2737 if (set->ops->init_hctx && 2738 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 2739 goto unregister_cpu_notifier; 2740 2741 if (blk_mq_init_request(set, hctx->fq->flush_rq, hctx_idx, 2742 hctx->numa_node)) 2743 goto exit_hctx; 2744 return 0; 2745 2746 exit_hctx: 2747 if (set->ops->exit_hctx) 2748 set->ops->exit_hctx(hctx, hctx_idx); 2749 unregister_cpu_notifier: 2750 blk_mq_remove_cpuhp(hctx); 2751 return -1; 2752 } 2753 2754 static struct blk_mq_hw_ctx * 2755 blk_mq_alloc_hctx(struct request_queue *q, struct blk_mq_tag_set *set, 2756 int node) 2757 { 2758 struct blk_mq_hw_ctx *hctx; 2759 gfp_t gfp = GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY; 2760 2761 hctx = kzalloc_node(blk_mq_hw_ctx_size(set), gfp, node); 2762 if (!hctx) 2763 goto fail_alloc_hctx; 2764 2765 if (!zalloc_cpumask_var_node(&hctx->cpumask, gfp, node)) 2766 goto free_hctx; 2767 2768 atomic_set(&hctx->nr_active, 0); 2769 if (node == NUMA_NO_NODE) 2770 node = set->numa_node; 2771 hctx->numa_node = node; 2772 2773 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 2774 spin_lock_init(&hctx->lock); 2775 INIT_LIST_HEAD(&hctx->dispatch); 2776 hctx->queue = q; 2777 hctx->flags = set->flags & ~BLK_MQ_F_TAG_QUEUE_SHARED; 2778 2779 INIT_LIST_HEAD(&hctx->hctx_list); 2780 2781 /* 2782 * Allocate space for all possible cpus to avoid allocation at 2783 * runtime 2784 */ 2785 hctx->ctxs = kmalloc_array_node(nr_cpu_ids, sizeof(void *), 2786 gfp, node); 2787 if (!hctx->ctxs) 2788 goto free_cpumask; 2789 2790 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), 2791 gfp, node, false, false)) 2792 goto free_ctxs; 2793 hctx->nr_ctx = 0; 2794 2795 spin_lock_init(&hctx->dispatch_wait_lock); 2796 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); 2797 INIT_LIST_HEAD(&hctx->dispatch_wait.entry); 2798 2799 hctx->fq = blk_alloc_flush_queue(hctx->numa_node, set->cmd_size, gfp); 2800 if (!hctx->fq) 2801 goto free_bitmap; 2802 2803 if (hctx->flags & BLK_MQ_F_BLOCKING) 2804 init_srcu_struct(hctx->srcu); 2805 blk_mq_hctx_kobj_init(hctx); 2806 2807 return hctx; 2808 2809 free_bitmap: 2810 sbitmap_free(&hctx->ctx_map); 2811 free_ctxs: 2812 kfree(hctx->ctxs); 2813 free_cpumask: 2814 free_cpumask_var(hctx->cpumask); 2815 free_hctx: 2816 kfree(hctx); 2817 fail_alloc_hctx: 2818 return NULL; 2819 } 2820 2821 static void blk_mq_init_cpu_queues(struct request_queue *q, 2822 unsigned int nr_hw_queues) 2823 { 2824 struct blk_mq_tag_set *set = q->tag_set; 2825 unsigned int i, j; 2826 2827 for_each_possible_cpu(i) { 2828 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 2829 struct blk_mq_hw_ctx *hctx; 2830 int k; 2831 2832 __ctx->cpu = i; 2833 spin_lock_init(&__ctx->lock); 2834 for (k = HCTX_TYPE_DEFAULT; k < HCTX_MAX_TYPES; k++) 2835 INIT_LIST_HEAD(&__ctx->rq_lists[k]); 2836 2837 __ctx->queue = q; 2838 2839 /* 2840 * Set local node, IFF we have more than one hw queue. If 2841 * not, we remain on the home node of the device 2842 */ 2843 for (j = 0; j < set->nr_maps; j++) { 2844 hctx = blk_mq_map_queue_type(q, j, i); 2845 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 2846 hctx->numa_node = cpu_to_node(i); 2847 } 2848 } 2849 } 2850 2851 static bool __blk_mq_alloc_map_and_request(struct blk_mq_tag_set *set, 2852 int hctx_idx) 2853 { 2854 unsigned int flags = set->flags; 2855 int ret = 0; 2856 2857 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, 2858 set->queue_depth, set->reserved_tags, flags); 2859 if (!set->tags[hctx_idx]) 2860 return false; 2861 2862 ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, 2863 set->queue_depth); 2864 if (!ret) 2865 return true; 2866 2867 blk_mq_free_rq_map(set->tags[hctx_idx], flags); 2868 set->tags[hctx_idx] = NULL; 2869 return false; 2870 } 2871 2872 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, 2873 unsigned int hctx_idx) 2874 { 2875 unsigned int flags = set->flags; 2876 2877 if (set->tags && set->tags[hctx_idx]) { 2878 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); 2879 blk_mq_free_rq_map(set->tags[hctx_idx], flags); 2880 set->tags[hctx_idx] = NULL; 2881 } 2882 } 2883 2884 static void blk_mq_map_swqueue(struct request_queue *q) 2885 { 2886 unsigned int i, j, hctx_idx; 2887 struct blk_mq_hw_ctx *hctx; 2888 struct blk_mq_ctx *ctx; 2889 struct blk_mq_tag_set *set = q->tag_set; 2890 2891 queue_for_each_hw_ctx(q, hctx, i) { 2892 cpumask_clear(hctx->cpumask); 2893 hctx->nr_ctx = 0; 2894 hctx->dispatch_from = NULL; 2895 } 2896 2897 /* 2898 * Map software to hardware queues. 2899 * 2900 * If the cpu isn't present, the cpu is mapped to first hctx. 2901 */ 2902 for_each_possible_cpu(i) { 2903 2904 ctx = per_cpu_ptr(q->queue_ctx, i); 2905 for (j = 0; j < set->nr_maps; j++) { 2906 if (!set->map[j].nr_queues) { 2907 ctx->hctxs[j] = blk_mq_map_queue_type(q, 2908 HCTX_TYPE_DEFAULT, i); 2909 continue; 2910 } 2911 hctx_idx = set->map[j].mq_map[i]; 2912 /* unmapped hw queue can be remapped after CPU topo changed */ 2913 if (!set->tags[hctx_idx] && 2914 !__blk_mq_alloc_map_and_request(set, hctx_idx)) { 2915 /* 2916 * If tags initialization fail for some hctx, 2917 * that hctx won't be brought online. In this 2918 * case, remap the current ctx to hctx[0] which 2919 * is guaranteed to always have tags allocated 2920 */ 2921 set->map[j].mq_map[i] = 0; 2922 } 2923 2924 hctx = blk_mq_map_queue_type(q, j, i); 2925 ctx->hctxs[j] = hctx; 2926 /* 2927 * If the CPU is already set in the mask, then we've 2928 * mapped this one already. This can happen if 2929 * devices share queues across queue maps. 2930 */ 2931 if (cpumask_test_cpu(i, hctx->cpumask)) 2932 continue; 2933 2934 cpumask_set_cpu(i, hctx->cpumask); 2935 hctx->type = j; 2936 ctx->index_hw[hctx->type] = hctx->nr_ctx; 2937 hctx->ctxs[hctx->nr_ctx++] = ctx; 2938 2939 /* 2940 * If the nr_ctx type overflows, we have exceeded the 2941 * amount of sw queues we can support. 2942 */ 2943 BUG_ON(!hctx->nr_ctx); 2944 } 2945 2946 for (; j < HCTX_MAX_TYPES; j++) 2947 ctx->hctxs[j] = blk_mq_map_queue_type(q, 2948 HCTX_TYPE_DEFAULT, i); 2949 } 2950 2951 queue_for_each_hw_ctx(q, hctx, i) { 2952 /* 2953 * If no software queues are mapped to this hardware queue, 2954 * disable it and free the request entries. 2955 */ 2956 if (!hctx->nr_ctx) { 2957 /* Never unmap queue 0. We need it as a 2958 * fallback in case of a new remap fails 2959 * allocation 2960 */ 2961 if (i && set->tags[i]) 2962 blk_mq_free_map_and_requests(set, i); 2963 2964 hctx->tags = NULL; 2965 continue; 2966 } 2967 2968 hctx->tags = set->tags[i]; 2969 WARN_ON(!hctx->tags); 2970 2971 /* 2972 * Set the map size to the number of mapped software queues. 2973 * This is more accurate and more efficient than looping 2974 * over all possibly mapped software queues. 2975 */ 2976 sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx); 2977 2978 /* 2979 * Initialize batch roundrobin counts 2980 */ 2981 hctx->next_cpu = blk_mq_first_mapped_cpu(hctx); 2982 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 2983 } 2984 } 2985 2986 /* 2987 * Caller needs to ensure that we're either frozen/quiesced, or that 2988 * the queue isn't live yet. 2989 */ 2990 static void queue_set_hctx_shared(struct request_queue *q, bool shared) 2991 { 2992 struct blk_mq_hw_ctx *hctx; 2993 int i; 2994 2995 queue_for_each_hw_ctx(q, hctx, i) { 2996 if (shared) { 2997 hctx->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; 2998 } else { 2999 blk_mq_tag_idle(hctx); 3000 hctx->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; 3001 } 3002 } 3003 } 3004 3005 static void blk_mq_update_tag_set_shared(struct blk_mq_tag_set *set, 3006 bool shared) 3007 { 3008 struct request_queue *q; 3009 3010 lockdep_assert_held(&set->tag_list_lock); 3011 3012 list_for_each_entry(q, &set->tag_list, tag_set_list) { 3013 blk_mq_freeze_queue(q); 3014 queue_set_hctx_shared(q, shared); 3015 blk_mq_unfreeze_queue(q); 3016 } 3017 } 3018 3019 static void blk_mq_del_queue_tag_set(struct request_queue *q) 3020 { 3021 struct blk_mq_tag_set *set = q->tag_set; 3022 3023 mutex_lock(&set->tag_list_lock); 3024 list_del(&q->tag_set_list); 3025 if (list_is_singular(&set->tag_list)) { 3026 /* just transitioned to unshared */ 3027 set->flags &= ~BLK_MQ_F_TAG_QUEUE_SHARED; 3028 /* update existing queue */ 3029 blk_mq_update_tag_set_shared(set, false); 3030 } 3031 mutex_unlock(&set->tag_list_lock); 3032 INIT_LIST_HEAD(&q->tag_set_list); 3033 } 3034 3035 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 3036 struct request_queue *q) 3037 { 3038 mutex_lock(&set->tag_list_lock); 3039 3040 /* 3041 * Check to see if we're transitioning to shared (from 1 to 2 queues). 3042 */ 3043 if (!list_empty(&set->tag_list) && 3044 !(set->flags & BLK_MQ_F_TAG_QUEUE_SHARED)) { 3045 set->flags |= BLK_MQ_F_TAG_QUEUE_SHARED; 3046 /* update existing queue */ 3047 blk_mq_update_tag_set_shared(set, true); 3048 } 3049 if (set->flags & BLK_MQ_F_TAG_QUEUE_SHARED) 3050 queue_set_hctx_shared(q, true); 3051 list_add_tail(&q->tag_set_list, &set->tag_list); 3052 3053 mutex_unlock(&set->tag_list_lock); 3054 } 3055 3056 /* All allocations will be freed in release handler of q->mq_kobj */ 3057 static int blk_mq_alloc_ctxs(struct request_queue *q) 3058 { 3059 struct blk_mq_ctxs *ctxs; 3060 int cpu; 3061 3062 ctxs = kzalloc(sizeof(*ctxs), GFP_KERNEL); 3063 if (!ctxs) 3064 return -ENOMEM; 3065 3066 ctxs->queue_ctx = alloc_percpu(struct blk_mq_ctx); 3067 if (!ctxs->queue_ctx) 3068 goto fail; 3069 3070 for_each_possible_cpu(cpu) { 3071 struct blk_mq_ctx *ctx = per_cpu_ptr(ctxs->queue_ctx, cpu); 3072 ctx->ctxs = ctxs; 3073 } 3074 3075 q->mq_kobj = &ctxs->kobj; 3076 q->queue_ctx = ctxs->queue_ctx; 3077 3078 return 0; 3079 fail: 3080 kfree(ctxs); 3081 return -ENOMEM; 3082 } 3083 3084 /* 3085 * It is the actual release handler for mq, but we do it from 3086 * request queue's release handler for avoiding use-after-free 3087 * and headache because q->mq_kobj shouldn't have been introduced, 3088 * but we can't group ctx/kctx kobj without it. 3089 */ 3090 void blk_mq_release(struct request_queue *q) 3091 { 3092 struct blk_mq_hw_ctx *hctx, *next; 3093 int i; 3094 3095 queue_for_each_hw_ctx(q, hctx, i) 3096 WARN_ON_ONCE(hctx && list_empty(&hctx->hctx_list)); 3097 3098 /* all hctx are in .unused_hctx_list now */ 3099 list_for_each_entry_safe(hctx, next, &q->unused_hctx_list, hctx_list) { 3100 list_del_init(&hctx->hctx_list); 3101 kobject_put(&hctx->kobj); 3102 } 3103 3104 kfree(q->queue_hw_ctx); 3105 3106 /* 3107 * release .mq_kobj and sw queue's kobject now because 3108 * both share lifetime with request queue. 3109 */ 3110 blk_mq_sysfs_deinit(q); 3111 } 3112 3113 static struct request_queue *blk_mq_init_queue_data(struct blk_mq_tag_set *set, 3114 void *queuedata) 3115 { 3116 struct request_queue *q; 3117 int ret; 3118 3119 q = blk_alloc_queue(set->numa_node); 3120 if (!q) 3121 return ERR_PTR(-ENOMEM); 3122 q->queuedata = queuedata; 3123 ret = blk_mq_init_allocated_queue(set, q); 3124 if (ret) { 3125 blk_cleanup_queue(q); 3126 return ERR_PTR(ret); 3127 } 3128 return q; 3129 } 3130 3131 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 3132 { 3133 return blk_mq_init_queue_data(set, NULL); 3134 } 3135 EXPORT_SYMBOL(blk_mq_init_queue); 3136 3137 struct gendisk *__blk_mq_alloc_disk(struct blk_mq_tag_set *set, void *queuedata, 3138 struct lock_class_key *lkclass) 3139 { 3140 struct request_queue *q; 3141 struct gendisk *disk; 3142 3143 q = blk_mq_init_queue_data(set, queuedata); 3144 if (IS_ERR(q)) 3145 return ERR_CAST(q); 3146 3147 disk = __alloc_disk_node(q, set->numa_node, lkclass); 3148 if (!disk) { 3149 blk_cleanup_queue(q); 3150 return ERR_PTR(-ENOMEM); 3151 } 3152 return disk; 3153 } 3154 EXPORT_SYMBOL(__blk_mq_alloc_disk); 3155 3156 static struct blk_mq_hw_ctx *blk_mq_alloc_and_init_hctx( 3157 struct blk_mq_tag_set *set, struct request_queue *q, 3158 int hctx_idx, int node) 3159 { 3160 struct blk_mq_hw_ctx *hctx = NULL, *tmp; 3161 3162 /* reuse dead hctx first */ 3163 spin_lock(&q->unused_hctx_lock); 3164 list_for_each_entry(tmp, &q->unused_hctx_list, hctx_list) { 3165 if (tmp->numa_node == node) { 3166 hctx = tmp; 3167 break; 3168 } 3169 } 3170 if (hctx) 3171 list_del_init(&hctx->hctx_list); 3172 spin_unlock(&q->unused_hctx_lock); 3173 3174 if (!hctx) 3175 hctx = blk_mq_alloc_hctx(q, set, node); 3176 if (!hctx) 3177 goto fail; 3178 3179 if (blk_mq_init_hctx(q, set, hctx, hctx_idx)) 3180 goto free_hctx; 3181 3182 return hctx; 3183 3184 free_hctx: 3185 kobject_put(&hctx->kobj); 3186 fail: 3187 return NULL; 3188 } 3189 3190 static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set, 3191 struct request_queue *q) 3192 { 3193 int i, j, end; 3194 struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx; 3195 3196 if (q->nr_hw_queues < set->nr_hw_queues) { 3197 struct blk_mq_hw_ctx **new_hctxs; 3198 3199 new_hctxs = kcalloc_node(set->nr_hw_queues, 3200 sizeof(*new_hctxs), GFP_KERNEL, 3201 set->numa_node); 3202 if (!new_hctxs) 3203 return; 3204 if (hctxs) 3205 memcpy(new_hctxs, hctxs, q->nr_hw_queues * 3206 sizeof(*hctxs)); 3207 q->queue_hw_ctx = new_hctxs; 3208 kfree(hctxs); 3209 hctxs = new_hctxs; 3210 } 3211 3212 /* protect against switching io scheduler */ 3213 mutex_lock(&q->sysfs_lock); 3214 for (i = 0; i < set->nr_hw_queues; i++) { 3215 int node; 3216 struct blk_mq_hw_ctx *hctx; 3217 3218 node = blk_mq_hw_queue_to_node(&set->map[HCTX_TYPE_DEFAULT], i); 3219 /* 3220 * If the hw queue has been mapped to another numa node, 3221 * we need to realloc the hctx. If allocation fails, fallback 3222 * to use the previous one. 3223 */ 3224 if (hctxs[i] && (hctxs[i]->numa_node == node)) 3225 continue; 3226 3227 hctx = blk_mq_alloc_and_init_hctx(set, q, i, node); 3228 if (hctx) { 3229 if (hctxs[i]) 3230 blk_mq_exit_hctx(q, set, hctxs[i], i); 3231 hctxs[i] = hctx; 3232 } else { 3233 if (hctxs[i]) 3234 pr_warn("Allocate new hctx on node %d fails,\ 3235 fallback to previous one on node %d\n", 3236 node, hctxs[i]->numa_node); 3237 else 3238 break; 3239 } 3240 } 3241 /* 3242 * Increasing nr_hw_queues fails. Free the newly allocated 3243 * hctxs and keep the previous q->nr_hw_queues. 3244 */ 3245 if (i != set->nr_hw_queues) { 3246 j = q->nr_hw_queues; 3247 end = i; 3248 } else { 3249 j = i; 3250 end = q->nr_hw_queues; 3251 q->nr_hw_queues = set->nr_hw_queues; 3252 } 3253 3254 for (; j < end; j++) { 3255 struct blk_mq_hw_ctx *hctx = hctxs[j]; 3256 3257 if (hctx) { 3258 if (hctx->tags) 3259 blk_mq_free_map_and_requests(set, j); 3260 blk_mq_exit_hctx(q, set, hctx, j); 3261 hctxs[j] = NULL; 3262 } 3263 } 3264 mutex_unlock(&q->sysfs_lock); 3265 } 3266 3267 int blk_mq_init_allocated_queue(struct blk_mq_tag_set *set, 3268 struct request_queue *q) 3269 { 3270 /* mark the queue as mq asap */ 3271 q->mq_ops = set->ops; 3272 3273 q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn, 3274 blk_mq_poll_stats_bkt, 3275 BLK_MQ_POLL_STATS_BKTS, q); 3276 if (!q->poll_cb) 3277 goto err_exit; 3278 3279 if (blk_mq_alloc_ctxs(q)) 3280 goto err_poll; 3281 3282 /* init q->mq_kobj and sw queues' kobjects */ 3283 blk_mq_sysfs_init(q); 3284 3285 INIT_LIST_HEAD(&q->unused_hctx_list); 3286 spin_lock_init(&q->unused_hctx_lock); 3287 3288 blk_mq_realloc_hw_ctxs(set, q); 3289 if (!q->nr_hw_queues) 3290 goto err_hctxs; 3291 3292 INIT_WORK(&q->timeout_work, blk_mq_timeout_work); 3293 blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ); 3294 3295 q->tag_set = set; 3296 3297 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 3298 if (set->nr_maps > HCTX_TYPE_POLL && 3299 set->map[HCTX_TYPE_POLL].nr_queues) 3300 blk_queue_flag_set(QUEUE_FLAG_POLL, q); 3301 3302 INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work); 3303 INIT_LIST_HEAD(&q->requeue_list); 3304 spin_lock_init(&q->requeue_lock); 3305 3306 q->nr_requests = set->queue_depth; 3307 3308 /* 3309 * Default to classic polling 3310 */ 3311 q->poll_nsec = BLK_MQ_POLL_CLASSIC; 3312 3313 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 3314 blk_mq_add_queue_tag_set(set, q); 3315 blk_mq_map_swqueue(q); 3316 return 0; 3317 3318 err_hctxs: 3319 kfree(q->queue_hw_ctx); 3320 q->nr_hw_queues = 0; 3321 blk_mq_sysfs_deinit(q); 3322 err_poll: 3323 blk_stat_free_callback(q->poll_cb); 3324 q->poll_cb = NULL; 3325 err_exit: 3326 q->mq_ops = NULL; 3327 return -ENOMEM; 3328 } 3329 EXPORT_SYMBOL(blk_mq_init_allocated_queue); 3330 3331 /* tags can _not_ be used after returning from blk_mq_exit_queue */ 3332 void blk_mq_exit_queue(struct request_queue *q) 3333 { 3334 struct blk_mq_tag_set *set = q->tag_set; 3335 3336 /* Checks hctx->flags & BLK_MQ_F_TAG_QUEUE_SHARED. */ 3337 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 3338 /* May clear BLK_MQ_F_TAG_QUEUE_SHARED in hctx->flags. */ 3339 blk_mq_del_queue_tag_set(q); 3340 } 3341 3342 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 3343 { 3344 int i; 3345 3346 for (i = 0; i < set->nr_hw_queues; i++) { 3347 if (!__blk_mq_alloc_map_and_request(set, i)) 3348 goto out_unwind; 3349 cond_resched(); 3350 } 3351 3352 return 0; 3353 3354 out_unwind: 3355 while (--i >= 0) 3356 blk_mq_free_map_and_requests(set, i); 3357 3358 return -ENOMEM; 3359 } 3360 3361 /* 3362 * Allocate the request maps associated with this tag_set. Note that this 3363 * may reduce the depth asked for, if memory is tight. set->queue_depth 3364 * will be updated to reflect the allocated depth. 3365 */ 3366 static int blk_mq_alloc_map_and_requests(struct blk_mq_tag_set *set) 3367 { 3368 unsigned int depth; 3369 int err; 3370 3371 depth = set->queue_depth; 3372 do { 3373 err = __blk_mq_alloc_rq_maps(set); 3374 if (!err) 3375 break; 3376 3377 set->queue_depth >>= 1; 3378 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { 3379 err = -ENOMEM; 3380 break; 3381 } 3382 } while (set->queue_depth); 3383 3384 if (!set->queue_depth || err) { 3385 pr_err("blk-mq: failed to allocate request map\n"); 3386 return -ENOMEM; 3387 } 3388 3389 if (depth != set->queue_depth) 3390 pr_info("blk-mq: reduced tag depth (%u -> %u)\n", 3391 depth, set->queue_depth); 3392 3393 return 0; 3394 } 3395 3396 static int blk_mq_update_queue_map(struct blk_mq_tag_set *set) 3397 { 3398 /* 3399 * blk_mq_map_queues() and multiple .map_queues() implementations 3400 * expect that set->map[HCTX_TYPE_DEFAULT].nr_queues is set to the 3401 * number of hardware queues. 3402 */ 3403 if (set->nr_maps == 1) 3404 set->map[HCTX_TYPE_DEFAULT].nr_queues = set->nr_hw_queues; 3405 3406 if (set->ops->map_queues && !is_kdump_kernel()) { 3407 int i; 3408 3409 /* 3410 * transport .map_queues is usually done in the following 3411 * way: 3412 * 3413 * for (queue = 0; queue < set->nr_hw_queues; queue++) { 3414 * mask = get_cpu_mask(queue) 3415 * for_each_cpu(cpu, mask) 3416 * set->map[x].mq_map[cpu] = queue; 3417 * } 3418 * 3419 * When we need to remap, the table has to be cleared for 3420 * killing stale mapping since one CPU may not be mapped 3421 * to any hw queue. 3422 */ 3423 for (i = 0; i < set->nr_maps; i++) 3424 blk_mq_clear_mq_map(&set->map[i]); 3425 3426 return set->ops->map_queues(set); 3427 } else { 3428 BUG_ON(set->nr_maps > 1); 3429 return blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 3430 } 3431 } 3432 3433 static int blk_mq_realloc_tag_set_tags(struct blk_mq_tag_set *set, 3434 int cur_nr_hw_queues, int new_nr_hw_queues) 3435 { 3436 struct blk_mq_tags **new_tags; 3437 3438 if (cur_nr_hw_queues >= new_nr_hw_queues) 3439 return 0; 3440 3441 new_tags = kcalloc_node(new_nr_hw_queues, sizeof(struct blk_mq_tags *), 3442 GFP_KERNEL, set->numa_node); 3443 if (!new_tags) 3444 return -ENOMEM; 3445 3446 if (set->tags) 3447 memcpy(new_tags, set->tags, cur_nr_hw_queues * 3448 sizeof(*set->tags)); 3449 kfree(set->tags); 3450 set->tags = new_tags; 3451 set->nr_hw_queues = new_nr_hw_queues; 3452 3453 return 0; 3454 } 3455 3456 static int blk_mq_alloc_tag_set_tags(struct blk_mq_tag_set *set, 3457 int new_nr_hw_queues) 3458 { 3459 return blk_mq_realloc_tag_set_tags(set, 0, new_nr_hw_queues); 3460 } 3461 3462 /* 3463 * Alloc a tag set to be associated with one or more request queues. 3464 * May fail with EINVAL for various error conditions. May adjust the 3465 * requested depth down, if it's too large. In that case, the set 3466 * value will be stored in set->queue_depth. 3467 */ 3468 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 3469 { 3470 int i, ret; 3471 3472 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); 3473 3474 if (!set->nr_hw_queues) 3475 return -EINVAL; 3476 if (!set->queue_depth) 3477 return -EINVAL; 3478 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) 3479 return -EINVAL; 3480 3481 if (!set->ops->queue_rq) 3482 return -EINVAL; 3483 3484 if (!set->ops->get_budget ^ !set->ops->put_budget) 3485 return -EINVAL; 3486 3487 if (set->queue_depth > BLK_MQ_MAX_DEPTH) { 3488 pr_info("blk-mq: reduced tag depth to %u\n", 3489 BLK_MQ_MAX_DEPTH); 3490 set->queue_depth = BLK_MQ_MAX_DEPTH; 3491 } 3492 3493 if (!set->nr_maps) 3494 set->nr_maps = 1; 3495 else if (set->nr_maps > HCTX_MAX_TYPES) 3496 return -EINVAL; 3497 3498 /* 3499 * If a crashdump is active, then we are potentially in a very 3500 * memory constrained environment. Limit us to 1 queue and 3501 * 64 tags to prevent using too much memory. 3502 */ 3503 if (is_kdump_kernel()) { 3504 set->nr_hw_queues = 1; 3505 set->nr_maps = 1; 3506 set->queue_depth = min(64U, set->queue_depth); 3507 } 3508 /* 3509 * There is no use for more h/w queues than cpus if we just have 3510 * a single map 3511 */ 3512 if (set->nr_maps == 1 && set->nr_hw_queues > nr_cpu_ids) 3513 set->nr_hw_queues = nr_cpu_ids; 3514 3515 if (blk_mq_alloc_tag_set_tags(set, set->nr_hw_queues) < 0) 3516 return -ENOMEM; 3517 3518 ret = -ENOMEM; 3519 for (i = 0; i < set->nr_maps; i++) { 3520 set->map[i].mq_map = kcalloc_node(nr_cpu_ids, 3521 sizeof(set->map[i].mq_map[0]), 3522 GFP_KERNEL, set->numa_node); 3523 if (!set->map[i].mq_map) 3524 goto out_free_mq_map; 3525 set->map[i].nr_queues = is_kdump_kernel() ? 1 : set->nr_hw_queues; 3526 } 3527 3528 ret = blk_mq_update_queue_map(set); 3529 if (ret) 3530 goto out_free_mq_map; 3531 3532 ret = blk_mq_alloc_map_and_requests(set); 3533 if (ret) 3534 goto out_free_mq_map; 3535 3536 if (blk_mq_is_sbitmap_shared(set->flags)) { 3537 atomic_set(&set->active_queues_shared_sbitmap, 0); 3538 3539 if (blk_mq_init_shared_sbitmap(set)) { 3540 ret = -ENOMEM; 3541 goto out_free_mq_rq_maps; 3542 } 3543 } 3544 3545 mutex_init(&set->tag_list_lock); 3546 INIT_LIST_HEAD(&set->tag_list); 3547 3548 return 0; 3549 3550 out_free_mq_rq_maps: 3551 for (i = 0; i < set->nr_hw_queues; i++) 3552 blk_mq_free_map_and_requests(set, i); 3553 out_free_mq_map: 3554 for (i = 0; i < set->nr_maps; i++) { 3555 kfree(set->map[i].mq_map); 3556 set->map[i].mq_map = NULL; 3557 } 3558 kfree(set->tags); 3559 set->tags = NULL; 3560 return ret; 3561 } 3562 EXPORT_SYMBOL(blk_mq_alloc_tag_set); 3563 3564 /* allocate and initialize a tagset for a simple single-queue device */ 3565 int blk_mq_alloc_sq_tag_set(struct blk_mq_tag_set *set, 3566 const struct blk_mq_ops *ops, unsigned int queue_depth, 3567 unsigned int set_flags) 3568 { 3569 memset(set, 0, sizeof(*set)); 3570 set->ops = ops; 3571 set->nr_hw_queues = 1; 3572 set->nr_maps = 1; 3573 set->queue_depth = queue_depth; 3574 set->numa_node = NUMA_NO_NODE; 3575 set->flags = set_flags; 3576 return blk_mq_alloc_tag_set(set); 3577 } 3578 EXPORT_SYMBOL_GPL(blk_mq_alloc_sq_tag_set); 3579 3580 void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 3581 { 3582 int i, j; 3583 3584 for (i = 0; i < set->nr_hw_queues; i++) 3585 blk_mq_free_map_and_requests(set, i); 3586 3587 if (blk_mq_is_sbitmap_shared(set->flags)) 3588 blk_mq_exit_shared_sbitmap(set); 3589 3590 for (j = 0; j < set->nr_maps; j++) { 3591 kfree(set->map[j].mq_map); 3592 set->map[j].mq_map = NULL; 3593 } 3594 3595 kfree(set->tags); 3596 set->tags = NULL; 3597 } 3598 EXPORT_SYMBOL(blk_mq_free_tag_set); 3599 3600 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) 3601 { 3602 struct blk_mq_tag_set *set = q->tag_set; 3603 struct blk_mq_hw_ctx *hctx; 3604 int i, ret; 3605 3606 if (!set) 3607 return -EINVAL; 3608 3609 if (q->nr_requests == nr) 3610 return 0; 3611 3612 blk_mq_freeze_queue(q); 3613 blk_mq_quiesce_queue(q); 3614 3615 ret = 0; 3616 queue_for_each_hw_ctx(q, hctx, i) { 3617 if (!hctx->tags) 3618 continue; 3619 /* 3620 * If we're using an MQ scheduler, just update the scheduler 3621 * queue depth. This is similar to what the old code would do. 3622 */ 3623 if (!hctx->sched_tags) { 3624 ret = blk_mq_tag_update_depth(hctx, &hctx->tags, nr, 3625 false); 3626 if (!ret && blk_mq_is_sbitmap_shared(set->flags)) 3627 blk_mq_tag_resize_shared_sbitmap(set, nr); 3628 } else { 3629 ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags, 3630 nr, true); 3631 if (blk_mq_is_sbitmap_shared(set->flags)) { 3632 hctx->sched_tags->bitmap_tags = 3633 &q->sched_bitmap_tags; 3634 hctx->sched_tags->breserved_tags = 3635 &q->sched_breserved_tags; 3636 } 3637 } 3638 if (ret) 3639 break; 3640 if (q->elevator && q->elevator->type->ops.depth_updated) 3641 q->elevator->type->ops.depth_updated(hctx); 3642 } 3643 if (!ret) { 3644 q->nr_requests = nr; 3645 if (q->elevator && blk_mq_is_sbitmap_shared(set->flags)) 3646 sbitmap_queue_resize(&q->sched_bitmap_tags, 3647 nr - set->reserved_tags); 3648 } 3649 3650 blk_mq_unquiesce_queue(q); 3651 blk_mq_unfreeze_queue(q); 3652 3653 return ret; 3654 } 3655 3656 /* 3657 * request_queue and elevator_type pair. 3658 * It is just used by __blk_mq_update_nr_hw_queues to cache 3659 * the elevator_type associated with a request_queue. 3660 */ 3661 struct blk_mq_qe_pair { 3662 struct list_head node; 3663 struct request_queue *q; 3664 struct elevator_type *type; 3665 }; 3666 3667 /* 3668 * Cache the elevator_type in qe pair list and switch the 3669 * io scheduler to 'none' 3670 */ 3671 static bool blk_mq_elv_switch_none(struct list_head *head, 3672 struct request_queue *q) 3673 { 3674 struct blk_mq_qe_pair *qe; 3675 3676 if (!q->elevator) 3677 return true; 3678 3679 qe = kmalloc(sizeof(*qe), GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY); 3680 if (!qe) 3681 return false; 3682 3683 INIT_LIST_HEAD(&qe->node); 3684 qe->q = q; 3685 qe->type = q->elevator->type; 3686 list_add(&qe->node, head); 3687 3688 mutex_lock(&q->sysfs_lock); 3689 /* 3690 * After elevator_switch_mq, the previous elevator_queue will be 3691 * released by elevator_release. The reference of the io scheduler 3692 * module get by elevator_get will also be put. So we need to get 3693 * a reference of the io scheduler module here to prevent it to be 3694 * removed. 3695 */ 3696 __module_get(qe->type->elevator_owner); 3697 elevator_switch_mq(q, NULL); 3698 mutex_unlock(&q->sysfs_lock); 3699 3700 return true; 3701 } 3702 3703 static void blk_mq_elv_switch_back(struct list_head *head, 3704 struct request_queue *q) 3705 { 3706 struct blk_mq_qe_pair *qe; 3707 struct elevator_type *t = NULL; 3708 3709 list_for_each_entry(qe, head, node) 3710 if (qe->q == q) { 3711 t = qe->type; 3712 break; 3713 } 3714 3715 if (!t) 3716 return; 3717 3718 list_del(&qe->node); 3719 kfree(qe); 3720 3721 mutex_lock(&q->sysfs_lock); 3722 elevator_switch_mq(q, t); 3723 mutex_unlock(&q->sysfs_lock); 3724 } 3725 3726 static void __blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, 3727 int nr_hw_queues) 3728 { 3729 struct request_queue *q; 3730 LIST_HEAD(head); 3731 int prev_nr_hw_queues; 3732 3733 lockdep_assert_held(&set->tag_list_lock); 3734 3735 if (set->nr_maps == 1 && nr_hw_queues > nr_cpu_ids) 3736 nr_hw_queues = nr_cpu_ids; 3737 if (nr_hw_queues < 1) 3738 return; 3739 if (set->nr_maps == 1 && nr_hw_queues == set->nr_hw_queues) 3740 return; 3741 3742 list_for_each_entry(q, &set->tag_list, tag_set_list) 3743 blk_mq_freeze_queue(q); 3744 /* 3745 * Switch IO scheduler to 'none', cleaning up the data associated 3746 * with the previous scheduler. We will switch back once we are done 3747 * updating the new sw to hw queue mappings. 3748 */ 3749 list_for_each_entry(q, &set->tag_list, tag_set_list) 3750 if (!blk_mq_elv_switch_none(&head, q)) 3751 goto switch_back; 3752 3753 list_for_each_entry(q, &set->tag_list, tag_set_list) { 3754 blk_mq_debugfs_unregister_hctxs(q); 3755 blk_mq_sysfs_unregister(q); 3756 } 3757 3758 prev_nr_hw_queues = set->nr_hw_queues; 3759 if (blk_mq_realloc_tag_set_tags(set, set->nr_hw_queues, nr_hw_queues) < 3760 0) 3761 goto reregister; 3762 3763 set->nr_hw_queues = nr_hw_queues; 3764 fallback: 3765 blk_mq_update_queue_map(set); 3766 list_for_each_entry(q, &set->tag_list, tag_set_list) { 3767 blk_mq_realloc_hw_ctxs(set, q); 3768 if (q->nr_hw_queues != set->nr_hw_queues) { 3769 pr_warn("Increasing nr_hw_queues to %d fails, fallback to %d\n", 3770 nr_hw_queues, prev_nr_hw_queues); 3771 set->nr_hw_queues = prev_nr_hw_queues; 3772 blk_mq_map_queues(&set->map[HCTX_TYPE_DEFAULT]); 3773 goto fallback; 3774 } 3775 blk_mq_map_swqueue(q); 3776 } 3777 3778 reregister: 3779 list_for_each_entry(q, &set->tag_list, tag_set_list) { 3780 blk_mq_sysfs_register(q); 3781 blk_mq_debugfs_register_hctxs(q); 3782 } 3783 3784 switch_back: 3785 list_for_each_entry(q, &set->tag_list, tag_set_list) 3786 blk_mq_elv_switch_back(&head, q); 3787 3788 list_for_each_entry(q, &set->tag_list, tag_set_list) 3789 blk_mq_unfreeze_queue(q); 3790 } 3791 3792 void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues) 3793 { 3794 mutex_lock(&set->tag_list_lock); 3795 __blk_mq_update_nr_hw_queues(set, nr_hw_queues); 3796 mutex_unlock(&set->tag_list_lock); 3797 } 3798 EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues); 3799 3800 /* Enable polling stats and return whether they were already enabled. */ 3801 static bool blk_poll_stats_enable(struct request_queue *q) 3802 { 3803 if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || 3804 blk_queue_flag_test_and_set(QUEUE_FLAG_POLL_STATS, q)) 3805 return true; 3806 blk_stat_add_callback(q, q->poll_cb); 3807 return false; 3808 } 3809 3810 static void blk_mq_poll_stats_start(struct request_queue *q) 3811 { 3812 /* 3813 * We don't arm the callback if polling stats are not enabled or the 3814 * callback is already active. 3815 */ 3816 if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) || 3817 blk_stat_is_active(q->poll_cb)) 3818 return; 3819 3820 blk_stat_activate_msecs(q->poll_cb, 100); 3821 } 3822 3823 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb) 3824 { 3825 struct request_queue *q = cb->data; 3826 int bucket; 3827 3828 for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) { 3829 if (cb->stat[bucket].nr_samples) 3830 q->poll_stat[bucket] = cb->stat[bucket]; 3831 } 3832 } 3833 3834 static unsigned long blk_mq_poll_nsecs(struct request_queue *q, 3835 struct request *rq) 3836 { 3837 unsigned long ret = 0; 3838 int bucket; 3839 3840 /* 3841 * If stats collection isn't on, don't sleep but turn it on for 3842 * future users 3843 */ 3844 if (!blk_poll_stats_enable(q)) 3845 return 0; 3846 3847 /* 3848 * As an optimistic guess, use half of the mean service time 3849 * for this type of request. We can (and should) make this smarter. 3850 * For instance, if the completion latencies are tight, we can 3851 * get closer than just half the mean. This is especially 3852 * important on devices where the completion latencies are longer 3853 * than ~10 usec. We do use the stats for the relevant IO size 3854 * if available which does lead to better estimates. 3855 */ 3856 bucket = blk_mq_poll_stats_bkt(rq); 3857 if (bucket < 0) 3858 return ret; 3859 3860 if (q->poll_stat[bucket].nr_samples) 3861 ret = (q->poll_stat[bucket].mean + 1) / 2; 3862 3863 return ret; 3864 } 3865 3866 static bool blk_mq_poll_hybrid_sleep(struct request_queue *q, 3867 struct request *rq) 3868 { 3869 struct hrtimer_sleeper hs; 3870 enum hrtimer_mode mode; 3871 unsigned int nsecs; 3872 ktime_t kt; 3873 3874 if (rq->rq_flags & RQF_MQ_POLL_SLEPT) 3875 return false; 3876 3877 /* 3878 * If we get here, hybrid polling is enabled. Hence poll_nsec can be: 3879 * 3880 * 0: use half of prev avg 3881 * >0: use this specific value 3882 */ 3883 if (q->poll_nsec > 0) 3884 nsecs = q->poll_nsec; 3885 else 3886 nsecs = blk_mq_poll_nsecs(q, rq); 3887 3888 if (!nsecs) 3889 return false; 3890 3891 rq->rq_flags |= RQF_MQ_POLL_SLEPT; 3892 3893 /* 3894 * This will be replaced with the stats tracking code, using 3895 * 'avg_completion_time / 2' as the pre-sleep target. 3896 */ 3897 kt = nsecs; 3898 3899 mode = HRTIMER_MODE_REL; 3900 hrtimer_init_sleeper_on_stack(&hs, CLOCK_MONOTONIC, mode); 3901 hrtimer_set_expires(&hs.timer, kt); 3902 3903 do { 3904 if (blk_mq_rq_state(rq) == MQ_RQ_COMPLETE) 3905 break; 3906 set_current_state(TASK_UNINTERRUPTIBLE); 3907 hrtimer_sleeper_start_expires(&hs, mode); 3908 if (hs.task) 3909 io_schedule(); 3910 hrtimer_cancel(&hs.timer); 3911 mode = HRTIMER_MODE_ABS; 3912 } while (hs.task && !signal_pending(current)); 3913 3914 __set_current_state(TASK_RUNNING); 3915 destroy_hrtimer_on_stack(&hs.timer); 3916 return true; 3917 } 3918 3919 static bool blk_mq_poll_hybrid(struct request_queue *q, 3920 struct blk_mq_hw_ctx *hctx, blk_qc_t cookie) 3921 { 3922 struct request *rq; 3923 3924 if (q->poll_nsec == BLK_MQ_POLL_CLASSIC) 3925 return false; 3926 3927 if (!blk_qc_t_is_internal(cookie)) 3928 rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie)); 3929 else { 3930 rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie)); 3931 /* 3932 * With scheduling, if the request has completed, we'll 3933 * get a NULL return here, as we clear the sched tag when 3934 * that happens. The request still remains valid, like always, 3935 * so we should be safe with just the NULL check. 3936 */ 3937 if (!rq) 3938 return false; 3939 } 3940 3941 return blk_mq_poll_hybrid_sleep(q, rq); 3942 } 3943 3944 /** 3945 * blk_poll - poll for IO completions 3946 * @q: the queue 3947 * @cookie: cookie passed back at IO submission time 3948 * @spin: whether to spin for completions 3949 * 3950 * Description: 3951 * Poll for completions on the passed in queue. Returns number of 3952 * completed entries found. If @spin is true, then blk_poll will continue 3953 * looping until at least one completion is found, unless the task is 3954 * otherwise marked running (or we need to reschedule). 3955 */ 3956 int blk_poll(struct request_queue *q, blk_qc_t cookie, bool spin) 3957 { 3958 struct blk_mq_hw_ctx *hctx; 3959 unsigned int state; 3960 3961 if (!blk_qc_t_valid(cookie) || 3962 !test_bit(QUEUE_FLAG_POLL, &q->queue_flags)) 3963 return 0; 3964 3965 if (current->plug) 3966 blk_flush_plug_list(current->plug, false); 3967 3968 hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)]; 3969 3970 /* 3971 * If we sleep, have the caller restart the poll loop to reset 3972 * the state. Like for the other success return cases, the 3973 * caller is responsible for checking if the IO completed. If 3974 * the IO isn't complete, we'll get called again and will go 3975 * straight to the busy poll loop. If specified not to spin, 3976 * we also should not sleep. 3977 */ 3978 if (spin && blk_mq_poll_hybrid(q, hctx, cookie)) 3979 return 1; 3980 3981 hctx->poll_considered++; 3982 3983 state = get_current_state(); 3984 do { 3985 int ret; 3986 3987 hctx->poll_invoked++; 3988 3989 ret = q->mq_ops->poll(hctx); 3990 if (ret > 0) { 3991 hctx->poll_success++; 3992 __set_current_state(TASK_RUNNING); 3993 return ret; 3994 } 3995 3996 if (signal_pending_state(state, current)) 3997 __set_current_state(TASK_RUNNING); 3998 3999 if (task_is_running(current)) 4000 return 1; 4001 if (ret < 0 || !spin) 4002 break; 4003 cpu_relax(); 4004 } while (!need_resched()); 4005 4006 __set_current_state(TASK_RUNNING); 4007 return 0; 4008 } 4009 EXPORT_SYMBOL_GPL(blk_poll); 4010 4011 unsigned int blk_mq_rq_cpu(struct request *rq) 4012 { 4013 return rq->mq_ctx->cpu; 4014 } 4015 EXPORT_SYMBOL(blk_mq_rq_cpu); 4016 4017 static int __init blk_mq_init(void) 4018 { 4019 int i; 4020 4021 for_each_possible_cpu(i) 4022 init_llist_head(&per_cpu(blk_cpu_done, i)); 4023 open_softirq(BLOCK_SOFTIRQ, blk_done_softirq); 4024 4025 cpuhp_setup_state_nocalls(CPUHP_BLOCK_SOFTIRQ_DEAD, 4026 "block/softirq:dead", NULL, 4027 blk_softirq_cpu_dead); 4028 cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL, 4029 blk_mq_hctx_notify_dead); 4030 cpuhp_setup_state_multi(CPUHP_AP_BLK_MQ_ONLINE, "block/mq:online", 4031 blk_mq_hctx_notify_online, 4032 blk_mq_hctx_notify_offline); 4033 return 0; 4034 } 4035 subsys_initcall(blk_mq_init); 4036