1 /* 2 * Block multiqueue core code 3 * 4 * Copyright (C) 2013-2014 Jens Axboe 5 * Copyright (C) 2013-2014 Christoph Hellwig 6 */ 7 #include <linux/kernel.h> 8 #include <linux/module.h> 9 #include <linux/backing-dev.h> 10 #include <linux/bio.h> 11 #include <linux/blkdev.h> 12 #include <linux/kmemleak.h> 13 #include <linux/mm.h> 14 #include <linux/init.h> 15 #include <linux/slab.h> 16 #include <linux/workqueue.h> 17 #include <linux/smp.h> 18 #include <linux/llist.h> 19 #include <linux/list_sort.h> 20 #include <linux/cpu.h> 21 #include <linux/cache.h> 22 #include <linux/sched/sysctl.h> 23 #include <linux/sched/topology.h> 24 #include <linux/sched/signal.h> 25 #include <linux/delay.h> 26 #include <linux/crash_dump.h> 27 #include <linux/prefetch.h> 28 29 #include <trace/events/block.h> 30 31 #include <linux/blk-mq.h> 32 #include "blk.h" 33 #include "blk-mq.h" 34 #include "blk-mq-tag.h" 35 #include "blk-stat.h" 36 #include "blk-wbt.h" 37 #include "blk-mq-sched.h" 38 39 static DEFINE_MUTEX(all_q_mutex); 40 static LIST_HEAD(all_q_list); 41 42 static void blk_mq_poll_stats_start(struct request_queue *q); 43 static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb); 44 45 static int blk_mq_poll_stats_bkt(const struct request *rq) 46 { 47 int ddir, bytes, bucket; 48 49 ddir = rq_data_dir(rq); 50 bytes = blk_rq_bytes(rq); 51 52 bucket = ddir + 2*(ilog2(bytes) - 9); 53 54 if (bucket < 0) 55 return -1; 56 else if (bucket >= BLK_MQ_POLL_STATS_BKTS) 57 return ddir + BLK_MQ_POLL_STATS_BKTS - 2; 58 59 return bucket; 60 } 61 62 /* 63 * Check if any of the ctx's have pending work in this hardware queue 64 */ 65 bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 66 { 67 return sbitmap_any_bit_set(&hctx->ctx_map) || 68 !list_empty_careful(&hctx->dispatch) || 69 blk_mq_sched_has_work(hctx); 70 } 71 72 /* 73 * Mark this ctx as having pending work in this hardware queue 74 */ 75 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 76 struct blk_mq_ctx *ctx) 77 { 78 if (!sbitmap_test_bit(&hctx->ctx_map, ctx->index_hw)) 79 sbitmap_set_bit(&hctx->ctx_map, ctx->index_hw); 80 } 81 82 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 83 struct blk_mq_ctx *ctx) 84 { 85 sbitmap_clear_bit(&hctx->ctx_map, ctx->index_hw); 86 } 87 88 void blk_freeze_queue_start(struct request_queue *q) 89 { 90 int freeze_depth; 91 92 freeze_depth = atomic_inc_return(&q->mq_freeze_depth); 93 if (freeze_depth == 1) { 94 percpu_ref_kill(&q->q_usage_counter); 95 blk_mq_run_hw_queues(q, false); 96 } 97 } 98 EXPORT_SYMBOL_GPL(blk_freeze_queue_start); 99 100 void blk_mq_freeze_queue_wait(struct request_queue *q) 101 { 102 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->q_usage_counter)); 103 } 104 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait); 105 106 int blk_mq_freeze_queue_wait_timeout(struct request_queue *q, 107 unsigned long timeout) 108 { 109 return wait_event_timeout(q->mq_freeze_wq, 110 percpu_ref_is_zero(&q->q_usage_counter), 111 timeout); 112 } 113 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_wait_timeout); 114 115 /* 116 * Guarantee no request is in use, so we can change any data structure of 117 * the queue afterward. 118 */ 119 void blk_freeze_queue(struct request_queue *q) 120 { 121 /* 122 * In the !blk_mq case we are only calling this to kill the 123 * q_usage_counter, otherwise this increases the freeze depth 124 * and waits for it to return to zero. For this reason there is 125 * no blk_unfreeze_queue(), and blk_freeze_queue() is not 126 * exported to drivers as the only user for unfreeze is blk_mq. 
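	 *
	 * A minimal usage sketch (illustrative, not taken from a specific
	 * caller): code that must not race with in-flight requests brackets
	 * its update with the blk_mq_* wrappers below, e.g.
	 *
	 *	blk_mq_freeze_queue(q);
	 *	... update per-queue data structures, nothing is in flight ...
	 *	blk_mq_unfreeze_queue(q);
	 *
	 * blk_mq_update_tag_set_depth() later in this file follows exactly
	 * this pattern around queue_set_hctx_shared().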
127 */ 128 blk_freeze_queue_start(q); 129 blk_mq_freeze_queue_wait(q); 130 } 131 132 void blk_mq_freeze_queue(struct request_queue *q) 133 { 134 /* 135 * ...just an alias to keep freeze and unfreeze actions balanced 136 * in the blk_mq_* namespace 137 */ 138 blk_freeze_queue(q); 139 } 140 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); 141 142 void blk_mq_unfreeze_queue(struct request_queue *q) 143 { 144 int freeze_depth; 145 146 freeze_depth = atomic_dec_return(&q->mq_freeze_depth); 147 WARN_ON_ONCE(freeze_depth < 0); 148 if (!freeze_depth) { 149 percpu_ref_reinit(&q->q_usage_counter); 150 wake_up_all(&q->mq_freeze_wq); 151 } 152 } 153 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); 154 155 /** 156 * blk_mq_quiesce_queue() - wait until all ongoing queue_rq calls have finished 157 * @q: request queue. 158 * 159 * Note: this function does not prevent that the struct request end_io() 160 * callback function is invoked. Additionally, it is not prevented that 161 * new queue_rq() calls occur unless the queue has been stopped first. 162 */ 163 void blk_mq_quiesce_queue(struct request_queue *q) 164 { 165 struct blk_mq_hw_ctx *hctx; 166 unsigned int i; 167 bool rcu = false; 168 169 blk_mq_stop_hw_queues(q); 170 171 queue_for_each_hw_ctx(q, hctx, i) { 172 if (hctx->flags & BLK_MQ_F_BLOCKING) 173 synchronize_srcu(&hctx->queue_rq_srcu); 174 else 175 rcu = true; 176 } 177 if (rcu) 178 synchronize_rcu(); 179 } 180 EXPORT_SYMBOL_GPL(blk_mq_quiesce_queue); 181 182 void blk_mq_wake_waiters(struct request_queue *q) 183 { 184 struct blk_mq_hw_ctx *hctx; 185 unsigned int i; 186 187 queue_for_each_hw_ctx(q, hctx, i) 188 if (blk_mq_hw_queue_mapped(hctx)) 189 blk_mq_tag_wakeup_all(hctx->tags, true); 190 191 /* 192 * If we are called because the queue has now been marked as 193 * dying, we need to ensure that processes currently waiting on 194 * the queue are notified as well. 
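	 *
	 * (Such waiters are typically blocked in blk_queue_enter() or in
	 *  blk_mq_freeze_queue_wait() above, both of which sleep on
	 *  q->mq_freeze_wq, so the wake_up_all() below is what lets them
	 *  re-evaluate their wait condition once the queue is dying.)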
195 */ 196 wake_up_all(&q->mq_freeze_wq); 197 } 198 199 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 200 { 201 return blk_mq_has_free_tags(hctx->tags); 202 } 203 EXPORT_SYMBOL(blk_mq_can_queue); 204 205 void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, 206 struct request *rq, unsigned int op) 207 { 208 INIT_LIST_HEAD(&rq->queuelist); 209 /* csd/requeue_work/fifo_time is initialized before use */ 210 rq->q = q; 211 rq->mq_ctx = ctx; 212 rq->cmd_flags = op; 213 if (blk_queue_io_stat(q)) 214 rq->rq_flags |= RQF_IO_STAT; 215 /* do not touch atomic flags, it needs atomic ops against the timer */ 216 rq->cpu = -1; 217 INIT_HLIST_NODE(&rq->hash); 218 RB_CLEAR_NODE(&rq->rb_node); 219 rq->rq_disk = NULL; 220 rq->part = NULL; 221 rq->start_time = jiffies; 222 #ifdef CONFIG_BLK_CGROUP 223 rq->rl = NULL; 224 set_start_time_ns(rq); 225 rq->io_start_time_ns = 0; 226 #endif 227 rq->nr_phys_segments = 0; 228 #if defined(CONFIG_BLK_DEV_INTEGRITY) 229 rq->nr_integrity_segments = 0; 230 #endif 231 rq->special = NULL; 232 /* tag was already set */ 233 rq->extra_len = 0; 234 235 INIT_LIST_HEAD(&rq->timeout_list); 236 rq->timeout = 0; 237 238 rq->end_io = NULL; 239 rq->end_io_data = NULL; 240 rq->next_rq = NULL; 241 242 ctx->rq_dispatched[op_is_sync(op)]++; 243 } 244 EXPORT_SYMBOL_GPL(blk_mq_rq_ctx_init); 245 246 struct request *__blk_mq_alloc_request(struct blk_mq_alloc_data *data, 247 unsigned int op) 248 { 249 struct request *rq; 250 unsigned int tag; 251 252 tag = blk_mq_get_tag(data); 253 if (tag != BLK_MQ_TAG_FAIL) { 254 struct blk_mq_tags *tags = blk_mq_tags_from_data(data); 255 256 rq = tags->static_rqs[tag]; 257 258 if (data->flags & BLK_MQ_REQ_INTERNAL) { 259 rq->tag = -1; 260 rq->internal_tag = tag; 261 } else { 262 if (blk_mq_tag_busy(data->hctx)) { 263 rq->rq_flags = RQF_MQ_INFLIGHT; 264 atomic_inc(&data->hctx->nr_active); 265 } 266 rq->tag = tag; 267 rq->internal_tag = -1; 268 data->hctx->tags->rqs[rq->tag] = rq; 269 } 270 271 blk_mq_rq_ctx_init(data->q, data->ctx, rq, op); 272 return rq; 273 } 274 275 return NULL; 276 } 277 EXPORT_SYMBOL_GPL(__blk_mq_alloc_request); 278 279 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, 280 unsigned int flags) 281 { 282 struct blk_mq_alloc_data alloc_data = { .flags = flags }; 283 struct request *rq; 284 int ret; 285 286 ret = blk_queue_enter(q, flags & BLK_MQ_REQ_NOWAIT); 287 if (ret) 288 return ERR_PTR(ret); 289 290 rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); 291 292 blk_mq_put_ctx(alloc_data.ctx); 293 blk_queue_exit(q); 294 295 if (!rq) 296 return ERR_PTR(-EWOULDBLOCK); 297 298 rq->__data_len = 0; 299 rq->__sector = (sector_t) -1; 300 rq->bio = rq->biotail = NULL; 301 return rq; 302 } 303 EXPORT_SYMBOL(blk_mq_alloc_request); 304 305 struct request *blk_mq_alloc_request_hctx(struct request_queue *q, int rw, 306 unsigned int flags, unsigned int hctx_idx) 307 { 308 struct blk_mq_alloc_data alloc_data = { .flags = flags }; 309 struct request *rq; 310 unsigned int cpu; 311 int ret; 312 313 /* 314 * If the tag allocator sleeps we could get an allocation for a 315 * different hardware context. No need to complicate the low level 316 * allocator for this for the rare use case of a command tied to 317 * a specific queue. 
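	 *
	 * Hypothetical caller sketch (the op and flags are illustrative;
	 * REQ_OP_DRV_OUT is assumed to exist in this tree):
	 *
	 *	rq = blk_mq_alloc_request_hctx(q, REQ_OP_DRV_OUT,
	 *				       BLK_MQ_REQ_NOWAIT |
	 *				       BLK_MQ_REQ_RESERVED, hctx_idx);
	 *	if (IS_ERR(rq))
	 *		return PTR_ERR(rq);
	 *
	 * Leaving out BLK_MQ_REQ_NOWAIT trips the WARN_ON_ONCE() below and
	 * the call fails with -EINVAL.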
318 */ 319 if (WARN_ON_ONCE(!(flags & BLK_MQ_REQ_NOWAIT))) 320 return ERR_PTR(-EINVAL); 321 322 if (hctx_idx >= q->nr_hw_queues) 323 return ERR_PTR(-EIO); 324 325 ret = blk_queue_enter(q, true); 326 if (ret) 327 return ERR_PTR(ret); 328 329 /* 330 * Check if the hardware context is actually mapped to anything. 331 * If not tell the caller that it should skip this queue. 332 */ 333 alloc_data.hctx = q->queue_hw_ctx[hctx_idx]; 334 if (!blk_mq_hw_queue_mapped(alloc_data.hctx)) { 335 blk_queue_exit(q); 336 return ERR_PTR(-EXDEV); 337 } 338 cpu = cpumask_first(alloc_data.hctx->cpumask); 339 alloc_data.ctx = __blk_mq_get_ctx(q, cpu); 340 341 rq = blk_mq_sched_get_request(q, NULL, rw, &alloc_data); 342 343 blk_queue_exit(q); 344 345 if (!rq) 346 return ERR_PTR(-EWOULDBLOCK); 347 348 return rq; 349 } 350 EXPORT_SYMBOL_GPL(blk_mq_alloc_request_hctx); 351 352 void __blk_mq_finish_request(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 353 struct request *rq) 354 { 355 const int sched_tag = rq->internal_tag; 356 struct request_queue *q = rq->q; 357 358 if (rq->rq_flags & RQF_MQ_INFLIGHT) 359 atomic_dec(&hctx->nr_active); 360 361 wbt_done(q->rq_wb, &rq->issue_stat); 362 rq->rq_flags = 0; 363 364 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 365 clear_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags); 366 if (rq->tag != -1) 367 blk_mq_put_tag(hctx, hctx->tags, ctx, rq->tag); 368 if (sched_tag != -1) 369 blk_mq_put_tag(hctx, hctx->sched_tags, ctx, sched_tag); 370 blk_mq_sched_restart(hctx); 371 blk_queue_exit(q); 372 } 373 374 static void blk_mq_finish_hctx_request(struct blk_mq_hw_ctx *hctx, 375 struct request *rq) 376 { 377 struct blk_mq_ctx *ctx = rq->mq_ctx; 378 379 ctx->rq_completed[rq_is_sync(rq)]++; 380 __blk_mq_finish_request(hctx, ctx, rq); 381 } 382 383 void blk_mq_finish_request(struct request *rq) 384 { 385 blk_mq_finish_hctx_request(blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), rq); 386 } 387 EXPORT_SYMBOL_GPL(blk_mq_finish_request); 388 389 void blk_mq_free_request(struct request *rq) 390 { 391 blk_mq_sched_put_request(rq); 392 } 393 EXPORT_SYMBOL_GPL(blk_mq_free_request); 394 395 inline void __blk_mq_end_request(struct request *rq, int error) 396 { 397 blk_account_io_done(rq); 398 399 if (rq->end_io) { 400 wbt_done(rq->q->rq_wb, &rq->issue_stat); 401 rq->end_io(rq, error); 402 } else { 403 if (unlikely(blk_bidi_rq(rq))) 404 blk_mq_free_request(rq->next_rq); 405 blk_mq_free_request(rq); 406 } 407 } 408 EXPORT_SYMBOL(__blk_mq_end_request); 409 410 void blk_mq_end_request(struct request *rq, int error) 411 { 412 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 413 BUG(); 414 __blk_mq_end_request(rq, error); 415 } 416 EXPORT_SYMBOL(blk_mq_end_request); 417 418 static void __blk_mq_complete_request_remote(void *data) 419 { 420 struct request *rq = data; 421 422 rq->q->softirq_done_fn(rq); 423 } 424 425 static void __blk_mq_complete_request(struct request *rq) 426 { 427 struct blk_mq_ctx *ctx = rq->mq_ctx; 428 bool shared = false; 429 int cpu; 430 431 if (rq->internal_tag != -1) 432 blk_mq_sched_completed_request(rq); 433 if (rq->rq_flags & RQF_STATS) { 434 blk_mq_poll_stats_start(rq->q); 435 blk_stat_add(rq); 436 } 437 438 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { 439 rq->q->softirq_done_fn(rq); 440 return; 441 } 442 443 cpu = get_cpu(); 444 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) 445 shared = cpus_share_cache(cpu, ctx->cpu); 446 447 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { 448 rq->csd.func = __blk_mq_complete_request_remote; 449 
rq->csd.info = rq; 450 rq->csd.flags = 0; 451 smp_call_function_single_async(ctx->cpu, &rq->csd); 452 } else { 453 rq->q->softirq_done_fn(rq); 454 } 455 put_cpu(); 456 } 457 458 /** 459 * blk_mq_complete_request - end I/O on a request 460 * @rq: the request being processed 461 * 462 * Description: 463 * Ends all I/O on a request. It does not handle partial completions. 464 * The actual completion happens out-of-order, through a IPI handler. 465 **/ 466 void blk_mq_complete_request(struct request *rq) 467 { 468 struct request_queue *q = rq->q; 469 470 if (unlikely(blk_should_fake_timeout(q))) 471 return; 472 if (!blk_mark_rq_complete(rq)) 473 __blk_mq_complete_request(rq); 474 } 475 EXPORT_SYMBOL(blk_mq_complete_request); 476 477 int blk_mq_request_started(struct request *rq) 478 { 479 return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 480 } 481 EXPORT_SYMBOL_GPL(blk_mq_request_started); 482 483 void blk_mq_start_request(struct request *rq) 484 { 485 struct request_queue *q = rq->q; 486 487 blk_mq_sched_started_request(rq); 488 489 trace_block_rq_issue(q, rq); 490 491 if (test_bit(QUEUE_FLAG_STATS, &q->queue_flags)) { 492 blk_stat_set_issue(&rq->issue_stat, blk_rq_sectors(rq)); 493 rq->rq_flags |= RQF_STATS; 494 wbt_issue(q->rq_wb, &rq->issue_stat); 495 } 496 497 blk_add_timer(rq); 498 499 /* 500 * Ensure that ->deadline is visible before set the started 501 * flag and clear the completed flag. 502 */ 503 smp_mb__before_atomic(); 504 505 /* 506 * Mark us as started and clear complete. Complete might have been 507 * set if requeue raced with timeout, which then marked it as 508 * complete. So be sure to clear complete again when we start 509 * the request, otherwise we'll ignore the completion event. 510 */ 511 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 512 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 513 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) 514 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 515 516 if (q->dma_drain_size && blk_rq_bytes(rq)) { 517 /* 518 * Make sure space for the drain appears. We know we can do 519 * this because max_hw_segments has been adjusted to be one 520 * fewer than the device can handle. 521 */ 522 rq->nr_phys_segments++; 523 } 524 } 525 EXPORT_SYMBOL(blk_mq_start_request); 526 527 /* 528 * When we reach here because queue is busy, REQ_ATOM_COMPLETE 529 * flag isn't set yet, so there may be race with timeout handler, 530 * but given rq->deadline is just set in .queue_rq() under 531 * this situation, the race won't be possible in reality because 532 * rq->timeout should be set as big enough to cover the window 533 * between blk_mq_start_request() called from .queue_rq() and 534 * clearing REQ_ATOM_STARTED here. 
 */
static void __blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_requeue(q, rq);
	wbt_requeue(q->rq_wb, &rq->issue_stat);
	blk_mq_sched_requeue_request(rq);

	if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) {
		if (q->dma_drain_size && blk_rq_bytes(rq))
			rq->nr_phys_segments--;
	}
}

void blk_mq_requeue_request(struct request *rq, bool kick_requeue_list)
{
	__blk_mq_requeue_request(rq);

	BUG_ON(blk_queued_rq(rq));
	blk_mq_add_to_requeue_list(rq, true, kick_requeue_list);
}
EXPORT_SYMBOL(blk_mq_requeue_request);

static void blk_mq_requeue_work(struct work_struct *work)
{
	struct request_queue *q =
		container_of(work, struct request_queue, requeue_work.work);
	LIST_HEAD(rq_list);
	struct request *rq, *next;
	unsigned long flags;

	spin_lock_irqsave(&q->requeue_lock, flags);
	list_splice_init(&q->requeue_list, &rq_list);
	spin_unlock_irqrestore(&q->requeue_lock, flags);

	list_for_each_entry_safe(rq, next, &rq_list, queuelist) {
		if (!(rq->rq_flags & RQF_SOFTBARRIER))
			continue;

		rq->rq_flags &= ~RQF_SOFTBARRIER;
		list_del_init(&rq->queuelist);
		blk_mq_sched_insert_request(rq, true, false, false, true);
	}

	while (!list_empty(&rq_list)) {
		rq = list_entry(rq_list.next, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_sched_insert_request(rq, false, false, false, true);
	}

	blk_mq_run_hw_queues(q, false);
}

void blk_mq_add_to_requeue_list(struct request *rq, bool at_head,
				bool kick_requeue_list)
{
	struct request_queue *q = rq->q;
	unsigned long flags;

	/*
	 * We abuse this flag that is otherwise used by the I/O scheduler to
	 * request head insertion from the workqueue.
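	 *
	 * (Callers normally get here through blk_mq_requeue_request() above,
	 *  e.g. "blk_mq_requeue_request(rq, true);" when a driver decides to
	 *  retry a request; that path always passes at_head == true and
	 *  optionally kicks the requeue work.)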
598 */ 599 BUG_ON(rq->rq_flags & RQF_SOFTBARRIER); 600 601 spin_lock_irqsave(&q->requeue_lock, flags); 602 if (at_head) { 603 rq->rq_flags |= RQF_SOFTBARRIER; 604 list_add(&rq->queuelist, &q->requeue_list); 605 } else { 606 list_add_tail(&rq->queuelist, &q->requeue_list); 607 } 608 spin_unlock_irqrestore(&q->requeue_lock, flags); 609 610 if (kick_requeue_list) 611 blk_mq_kick_requeue_list(q); 612 } 613 EXPORT_SYMBOL(blk_mq_add_to_requeue_list); 614 615 void blk_mq_kick_requeue_list(struct request_queue *q) 616 { 617 kblockd_schedule_delayed_work(&q->requeue_work, 0); 618 } 619 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 620 621 void blk_mq_delay_kick_requeue_list(struct request_queue *q, 622 unsigned long msecs) 623 { 624 kblockd_schedule_delayed_work(&q->requeue_work, 625 msecs_to_jiffies(msecs)); 626 } 627 EXPORT_SYMBOL(blk_mq_delay_kick_requeue_list); 628 629 void blk_mq_abort_requeue_list(struct request_queue *q) 630 { 631 unsigned long flags; 632 LIST_HEAD(rq_list); 633 634 spin_lock_irqsave(&q->requeue_lock, flags); 635 list_splice_init(&q->requeue_list, &rq_list); 636 spin_unlock_irqrestore(&q->requeue_lock, flags); 637 638 while (!list_empty(&rq_list)) { 639 struct request *rq; 640 641 rq = list_first_entry(&rq_list, struct request, queuelist); 642 list_del_init(&rq->queuelist); 643 blk_mq_end_request(rq, -EIO); 644 } 645 } 646 EXPORT_SYMBOL(blk_mq_abort_requeue_list); 647 648 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 649 { 650 if (tag < tags->nr_tags) { 651 prefetch(tags->rqs[tag]); 652 return tags->rqs[tag]; 653 } 654 655 return NULL; 656 } 657 EXPORT_SYMBOL(blk_mq_tag_to_rq); 658 659 struct blk_mq_timeout_data { 660 unsigned long next; 661 unsigned int next_set; 662 }; 663 664 void blk_mq_rq_timed_out(struct request *req, bool reserved) 665 { 666 const struct blk_mq_ops *ops = req->q->mq_ops; 667 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; 668 669 /* 670 * We know that complete is set at this point. If STARTED isn't set 671 * anymore, then the request isn't active and the "timeout" should 672 * just be ignored. This can happen due to the bitflag ordering. 673 * Timeout first checks if STARTED is set, and if it is, assumes 674 * the request is active. But if we race with completion, then 675 * both flags will get cleared. So check here again, and ignore 676 * a timeout event with a request that isn't active. 677 */ 678 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) 679 return; 680 681 if (ops->timeout) 682 ret = ops->timeout(req, reserved); 683 684 switch (ret) { 685 case BLK_EH_HANDLED: 686 __blk_mq_complete_request(req); 687 break; 688 case BLK_EH_RESET_TIMER: 689 blk_add_timer(req); 690 blk_clear_rq_complete(req); 691 break; 692 case BLK_EH_NOT_HANDLED: 693 break; 694 default: 695 printk(KERN_ERR "block: bad eh return: %d\n", ret); 696 break; 697 } 698 } 699 700 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 701 struct request *rq, void *priv, bool reserved) 702 { 703 struct blk_mq_timeout_data *data = priv; 704 705 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 706 return; 707 708 /* 709 * The rq being checked may have been freed and reallocated 710 * out already here, we avoid this race by checking rq->deadline 711 * and REQ_ATOM_COMPLETE flag together: 712 * 713 * - if rq->deadline is observed as new value because of 714 * reusing, the rq won't be timed out because of timing. 
715 * - if rq->deadline is observed as previous value, 716 * REQ_ATOM_COMPLETE flag won't be cleared in reuse path 717 * because we put a barrier between setting rq->deadline 718 * and clearing the flag in blk_mq_start_request(), so 719 * this rq won't be timed out too. 720 */ 721 if (time_after_eq(jiffies, rq->deadline)) { 722 if (!blk_mark_rq_complete(rq)) 723 blk_mq_rq_timed_out(rq, reserved); 724 } else if (!data->next_set || time_after(data->next, rq->deadline)) { 725 data->next = rq->deadline; 726 data->next_set = 1; 727 } 728 } 729 730 static void blk_mq_timeout_work(struct work_struct *work) 731 { 732 struct request_queue *q = 733 container_of(work, struct request_queue, timeout_work); 734 struct blk_mq_timeout_data data = { 735 .next = 0, 736 .next_set = 0, 737 }; 738 int i; 739 740 /* A deadlock might occur if a request is stuck requiring a 741 * timeout at the same time a queue freeze is waiting 742 * completion, since the timeout code would not be able to 743 * acquire the queue reference here. 744 * 745 * That's why we don't use blk_queue_enter here; instead, we use 746 * percpu_ref_tryget directly, because we need to be able to 747 * obtain a reference even in the short window between the queue 748 * starting to freeze, by dropping the first reference in 749 * blk_freeze_queue_start, and the moment the last request is 750 * consumed, marked by the instant q_usage_counter reaches 751 * zero. 752 */ 753 if (!percpu_ref_tryget(&q->q_usage_counter)) 754 return; 755 756 blk_mq_queue_tag_busy_iter(q, blk_mq_check_expired, &data); 757 758 if (data.next_set) { 759 data.next = blk_rq_timeout(round_jiffies_up(data.next)); 760 mod_timer(&q->timeout, data.next); 761 } else { 762 struct blk_mq_hw_ctx *hctx; 763 764 queue_for_each_hw_ctx(q, hctx, i) { 765 /* the hctx may be unmapped, so check it here */ 766 if (blk_mq_hw_queue_mapped(hctx)) 767 blk_mq_tag_idle(hctx); 768 } 769 } 770 blk_queue_exit(q); 771 } 772 773 /* 774 * Reverse check our software queue for entries that we could potentially 775 * merge with. Currently includes a hand-wavy stop count of 8, to not spend 776 * too much time checking for merges. 
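 *
 * (The walk below uses list_for_each_entry_reverse(), so the eight entries
 *  inspected are the most recently queued ones on this software queue.)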
777 */ 778 static bool blk_mq_attempt_merge(struct request_queue *q, 779 struct blk_mq_ctx *ctx, struct bio *bio) 780 { 781 struct request *rq; 782 int checked = 8; 783 784 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { 785 bool merged = false; 786 787 if (!checked--) 788 break; 789 790 if (!blk_rq_merge_ok(rq, bio)) 791 continue; 792 793 switch (blk_try_merge(rq, bio)) { 794 case ELEVATOR_BACK_MERGE: 795 if (blk_mq_sched_allow_merge(q, rq, bio)) 796 merged = bio_attempt_back_merge(q, rq, bio); 797 break; 798 case ELEVATOR_FRONT_MERGE: 799 if (blk_mq_sched_allow_merge(q, rq, bio)) 800 merged = bio_attempt_front_merge(q, rq, bio); 801 break; 802 case ELEVATOR_DISCARD_MERGE: 803 merged = bio_attempt_discard_merge(q, rq, bio); 804 break; 805 default: 806 continue; 807 } 808 809 if (merged) 810 ctx->rq_merged++; 811 return merged; 812 } 813 814 return false; 815 } 816 817 struct flush_busy_ctx_data { 818 struct blk_mq_hw_ctx *hctx; 819 struct list_head *list; 820 }; 821 822 static bool flush_busy_ctx(struct sbitmap *sb, unsigned int bitnr, void *data) 823 { 824 struct flush_busy_ctx_data *flush_data = data; 825 struct blk_mq_hw_ctx *hctx = flush_data->hctx; 826 struct blk_mq_ctx *ctx = hctx->ctxs[bitnr]; 827 828 sbitmap_clear_bit(sb, bitnr); 829 spin_lock(&ctx->lock); 830 list_splice_tail_init(&ctx->rq_list, flush_data->list); 831 spin_unlock(&ctx->lock); 832 return true; 833 } 834 835 /* 836 * Process software queues that have been marked busy, splicing them 837 * to the for-dispatch 838 */ 839 void blk_mq_flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 840 { 841 struct flush_busy_ctx_data data = { 842 .hctx = hctx, 843 .list = list, 844 }; 845 846 sbitmap_for_each_set(&hctx->ctx_map, flush_busy_ctx, &data); 847 } 848 EXPORT_SYMBOL_GPL(blk_mq_flush_busy_ctxs); 849 850 static inline unsigned int queued_to_index(unsigned int queued) 851 { 852 if (!queued) 853 return 0; 854 855 return min(BLK_MQ_MAX_DISPATCH_ORDER - 1, ilog2(queued) + 1); 856 } 857 858 bool blk_mq_get_driver_tag(struct request *rq, struct blk_mq_hw_ctx **hctx, 859 bool wait) 860 { 861 struct blk_mq_alloc_data data = { 862 .q = rq->q, 863 .hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu), 864 .flags = wait ? 
0 : BLK_MQ_REQ_NOWAIT, 865 }; 866 867 might_sleep_if(wait); 868 869 if (rq->tag != -1) 870 goto done; 871 872 if (blk_mq_tag_is_reserved(data.hctx->sched_tags, rq->internal_tag)) 873 data.flags |= BLK_MQ_REQ_RESERVED; 874 875 rq->tag = blk_mq_get_tag(&data); 876 if (rq->tag >= 0) { 877 if (blk_mq_tag_busy(data.hctx)) { 878 rq->rq_flags |= RQF_MQ_INFLIGHT; 879 atomic_inc(&data.hctx->nr_active); 880 } 881 data.hctx->tags->rqs[rq->tag] = rq; 882 } 883 884 done: 885 if (hctx) 886 *hctx = data.hctx; 887 return rq->tag != -1; 888 } 889 890 static void __blk_mq_put_driver_tag(struct blk_mq_hw_ctx *hctx, 891 struct request *rq) 892 { 893 blk_mq_put_tag(hctx, hctx->tags, rq->mq_ctx, rq->tag); 894 rq->tag = -1; 895 896 if (rq->rq_flags & RQF_MQ_INFLIGHT) { 897 rq->rq_flags &= ~RQF_MQ_INFLIGHT; 898 atomic_dec(&hctx->nr_active); 899 } 900 } 901 902 static void blk_mq_put_driver_tag_hctx(struct blk_mq_hw_ctx *hctx, 903 struct request *rq) 904 { 905 if (rq->tag == -1 || rq->internal_tag == -1) 906 return; 907 908 __blk_mq_put_driver_tag(hctx, rq); 909 } 910 911 static void blk_mq_put_driver_tag(struct request *rq) 912 { 913 struct blk_mq_hw_ctx *hctx; 914 915 if (rq->tag == -1 || rq->internal_tag == -1) 916 return; 917 918 hctx = blk_mq_map_queue(rq->q, rq->mq_ctx->cpu); 919 __blk_mq_put_driver_tag(hctx, rq); 920 } 921 922 /* 923 * If we fail getting a driver tag because all the driver tags are already 924 * assigned and on the dispatch list, BUT the first entry does not have a 925 * tag, then we could deadlock. For that case, move entries with assigned 926 * driver tags to the front, leaving the set of tagged requests in the 927 * same order, and the untagged set in the same order. 928 */ 929 static bool reorder_tags_to_front(struct list_head *list) 930 { 931 struct request *rq, *tmp, *first = NULL; 932 933 list_for_each_entry_safe_reverse(rq, tmp, list, queuelist) { 934 if (rq == first) 935 break; 936 if (rq->tag != -1) { 937 list_move(&rq->queuelist, list); 938 if (!first) 939 first = rq; 940 } 941 } 942 943 return first != NULL; 944 } 945 946 static int blk_mq_dispatch_wake(wait_queue_t *wait, unsigned mode, int flags, 947 void *key) 948 { 949 struct blk_mq_hw_ctx *hctx; 950 951 hctx = container_of(wait, struct blk_mq_hw_ctx, dispatch_wait); 952 953 list_del(&wait->task_list); 954 clear_bit_unlock(BLK_MQ_S_TAG_WAITING, &hctx->state); 955 blk_mq_run_hw_queue(hctx, true); 956 return 1; 957 } 958 959 static bool blk_mq_dispatch_wait_add(struct blk_mq_hw_ctx *hctx) 960 { 961 struct sbq_wait_state *ws; 962 963 /* 964 * The TAG_WAITING bit serves as a lock protecting hctx->dispatch_wait. 965 * The thread which wins the race to grab this bit adds the hardware 966 * queue to the wait queue. 967 */ 968 if (test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state) || 969 test_and_set_bit_lock(BLK_MQ_S_TAG_WAITING, &hctx->state)) 970 return false; 971 972 init_waitqueue_func_entry(&hctx->dispatch_wait, blk_mq_dispatch_wake); 973 ws = bt_wait_ptr(&hctx->tags->bitmap_tags, hctx); 974 975 /* 976 * As soon as this returns, it's no longer safe to fiddle with 977 * hctx->dispatch_wait, since a completion can wake up the wait queue 978 * and unlock the bit. 
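	 *
	 * (The matching wakeup side is blk_mq_dispatch_wake() above: when a
	 *  tag is freed it removes this entry from the wait queue, clears
	 *  TAG_WAITING and reruns the hardware queue.)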
979 */ 980 add_wait_queue(&ws->wait, &hctx->dispatch_wait); 981 return true; 982 } 983 984 bool blk_mq_dispatch_rq_list(struct request_queue *q, struct list_head *list) 985 { 986 struct blk_mq_hw_ctx *hctx; 987 struct request *rq; 988 int errors, queued, ret = BLK_MQ_RQ_QUEUE_OK; 989 990 if (list_empty(list)) 991 return false; 992 993 /* 994 * Now process all the entries, sending them to the driver. 995 */ 996 errors = queued = 0; 997 do { 998 struct blk_mq_queue_data bd; 999 1000 rq = list_first_entry(list, struct request, queuelist); 1001 if (!blk_mq_get_driver_tag(rq, &hctx, false)) { 1002 if (!queued && reorder_tags_to_front(list)) 1003 continue; 1004 1005 /* 1006 * The initial allocation attempt failed, so we need to 1007 * rerun the hardware queue when a tag is freed. 1008 */ 1009 if (!blk_mq_dispatch_wait_add(hctx)) 1010 break; 1011 1012 /* 1013 * It's possible that a tag was freed in the window 1014 * between the allocation failure and adding the 1015 * hardware queue to the wait queue. 1016 */ 1017 if (!blk_mq_get_driver_tag(rq, &hctx, false)) 1018 break; 1019 } 1020 1021 list_del_init(&rq->queuelist); 1022 1023 bd.rq = rq; 1024 1025 /* 1026 * Flag last if we have no more requests, or if we have more 1027 * but can't assign a driver tag to it. 1028 */ 1029 if (list_empty(list)) 1030 bd.last = true; 1031 else { 1032 struct request *nxt; 1033 1034 nxt = list_first_entry(list, struct request, queuelist); 1035 bd.last = !blk_mq_get_driver_tag(nxt, NULL, false); 1036 } 1037 1038 ret = q->mq_ops->queue_rq(hctx, &bd); 1039 switch (ret) { 1040 case BLK_MQ_RQ_QUEUE_OK: 1041 queued++; 1042 break; 1043 case BLK_MQ_RQ_QUEUE_BUSY: 1044 blk_mq_put_driver_tag_hctx(hctx, rq); 1045 list_add(&rq->queuelist, list); 1046 __blk_mq_requeue_request(rq); 1047 break; 1048 default: 1049 pr_err("blk-mq: bad return on queue: %d\n", ret); 1050 case BLK_MQ_RQ_QUEUE_ERROR: 1051 errors++; 1052 blk_mq_end_request(rq, -EIO); 1053 break; 1054 } 1055 1056 if (ret == BLK_MQ_RQ_QUEUE_BUSY) 1057 break; 1058 } while (!list_empty(list)); 1059 1060 hctx->dispatched[queued_to_index(queued)]++; 1061 1062 /* 1063 * Any items that need requeuing? Stuff them into hctx->dispatch, 1064 * that is where we will continue on next queue run. 1065 */ 1066 if (!list_empty(list)) { 1067 /* 1068 * If an I/O scheduler has been configured and we got a driver 1069 * tag for the next request already, free it again. 1070 */ 1071 rq = list_first_entry(list, struct request, queuelist); 1072 blk_mq_put_driver_tag(rq); 1073 1074 spin_lock(&hctx->lock); 1075 list_splice_init(list, &hctx->dispatch); 1076 spin_unlock(&hctx->lock); 1077 1078 /* 1079 * If SCHED_RESTART was set by the caller of this function and 1080 * it is no longer set that means that it was cleared by another 1081 * thread and hence that a queue rerun is needed. 1082 * 1083 * If TAG_WAITING is set that means that an I/O scheduler has 1084 * been configured and another thread is waiting for a driver 1085 * tag. To guarantee fairness, do not rerun this hardware queue 1086 * but let the other thread grab the driver tag. 1087 * 1088 * If no I/O scheduler has been configured it is possible that 1089 * the hardware queue got stopped and restarted before requests 1090 * were pushed back onto the dispatch list. Rerun the queue to 1091 * avoid starvation. Notes: 1092 * - blk_mq_run_hw_queue() checks whether or not a queue has 1093 * been stopped before rerunning a queue. 1094 * - Some but not all block drivers stop a queue before 1095 * returning BLK_MQ_RQ_QUEUE_BUSY. 
Two exceptions are scsi-mq 1096 * and dm-rq. 1097 */ 1098 if (!blk_mq_sched_needs_restart(hctx) && 1099 !test_bit(BLK_MQ_S_TAG_WAITING, &hctx->state)) 1100 blk_mq_run_hw_queue(hctx, true); 1101 } 1102 1103 return (queued + errors) != 0; 1104 } 1105 1106 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 1107 { 1108 int srcu_idx; 1109 1110 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask) && 1111 cpu_online(hctx->next_cpu)); 1112 1113 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 1114 rcu_read_lock(); 1115 blk_mq_sched_dispatch_requests(hctx); 1116 rcu_read_unlock(); 1117 } else { 1118 might_sleep(); 1119 1120 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); 1121 blk_mq_sched_dispatch_requests(hctx); 1122 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); 1123 } 1124 } 1125 1126 /* 1127 * It'd be great if the workqueue API had a way to pass 1128 * in a mask and had some smarts for more clever placement. 1129 * For now we just round-robin here, switching for every 1130 * BLK_MQ_CPU_WORK_BATCH queued items. 1131 */ 1132 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 1133 { 1134 if (hctx->queue->nr_hw_queues == 1) 1135 return WORK_CPU_UNBOUND; 1136 1137 if (--hctx->next_cpu_batch <= 0) { 1138 int next_cpu; 1139 1140 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); 1141 if (next_cpu >= nr_cpu_ids) 1142 next_cpu = cpumask_first(hctx->cpumask); 1143 1144 hctx->next_cpu = next_cpu; 1145 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1146 } 1147 1148 return hctx->next_cpu; 1149 } 1150 1151 static void __blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async, 1152 unsigned long msecs) 1153 { 1154 if (unlikely(blk_mq_hctx_stopped(hctx) || 1155 !blk_mq_hw_queue_mapped(hctx))) 1156 return; 1157 1158 if (!async && !(hctx->flags & BLK_MQ_F_BLOCKING)) { 1159 int cpu = get_cpu(); 1160 if (cpumask_test_cpu(cpu, hctx->cpumask)) { 1161 __blk_mq_run_hw_queue(hctx); 1162 put_cpu(); 1163 return; 1164 } 1165 1166 put_cpu(); 1167 } 1168 1169 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 1170 &hctx->run_work, 1171 msecs_to_jiffies(msecs)); 1172 } 1173 1174 void blk_mq_delay_run_hw_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1175 { 1176 __blk_mq_delay_run_hw_queue(hctx, true, msecs); 1177 } 1178 EXPORT_SYMBOL(blk_mq_delay_run_hw_queue); 1179 1180 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1181 { 1182 __blk_mq_delay_run_hw_queue(hctx, async, 0); 1183 } 1184 EXPORT_SYMBOL(blk_mq_run_hw_queue); 1185 1186 void blk_mq_run_hw_queues(struct request_queue *q, bool async) 1187 { 1188 struct blk_mq_hw_ctx *hctx; 1189 int i; 1190 1191 queue_for_each_hw_ctx(q, hctx, i) { 1192 if (!blk_mq_hctx_has_pending(hctx) || 1193 blk_mq_hctx_stopped(hctx)) 1194 continue; 1195 1196 blk_mq_run_hw_queue(hctx, async); 1197 } 1198 } 1199 EXPORT_SYMBOL(blk_mq_run_hw_queues); 1200 1201 /** 1202 * blk_mq_queue_stopped() - check whether one or more hctxs have been stopped 1203 * @q: request queue. 1204 * 1205 * The caller is responsible for serializing this function against 1206 * blk_mq_{start,stop}_hw_queue(). 
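 *
 * Illustrative caller-side pattern (hypothetical, built only from helpers
 * in this file):
 *
 *	blk_mq_stop_hw_queues(q);
 *	WARN_ON_ONCE(!blk_mq_queue_stopped(q));
 *	... device is quiesced here ...
 *	blk_mq_start_stopped_hw_queues(q, true);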
1207 */ 1208 bool blk_mq_queue_stopped(struct request_queue *q) 1209 { 1210 struct blk_mq_hw_ctx *hctx; 1211 int i; 1212 1213 queue_for_each_hw_ctx(q, hctx, i) 1214 if (blk_mq_hctx_stopped(hctx)) 1215 return true; 1216 1217 return false; 1218 } 1219 EXPORT_SYMBOL(blk_mq_queue_stopped); 1220 1221 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 1222 { 1223 cancel_delayed_work_sync(&hctx->run_work); 1224 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 1225 } 1226 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 1227 1228 void blk_mq_stop_hw_queues(struct request_queue *q) 1229 { 1230 struct blk_mq_hw_ctx *hctx; 1231 int i; 1232 1233 queue_for_each_hw_ctx(q, hctx, i) 1234 blk_mq_stop_hw_queue(hctx); 1235 } 1236 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 1237 1238 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 1239 { 1240 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1241 1242 blk_mq_run_hw_queue(hctx, false); 1243 } 1244 EXPORT_SYMBOL(blk_mq_start_hw_queue); 1245 1246 void blk_mq_start_hw_queues(struct request_queue *q) 1247 { 1248 struct blk_mq_hw_ctx *hctx; 1249 int i; 1250 1251 queue_for_each_hw_ctx(q, hctx, i) 1252 blk_mq_start_hw_queue(hctx); 1253 } 1254 EXPORT_SYMBOL(blk_mq_start_hw_queues); 1255 1256 void blk_mq_start_stopped_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 1257 { 1258 if (!blk_mq_hctx_stopped(hctx)) 1259 return; 1260 1261 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1262 blk_mq_run_hw_queue(hctx, async); 1263 } 1264 EXPORT_SYMBOL_GPL(blk_mq_start_stopped_hw_queue); 1265 1266 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 1267 { 1268 struct blk_mq_hw_ctx *hctx; 1269 int i; 1270 1271 queue_for_each_hw_ctx(q, hctx, i) 1272 blk_mq_start_stopped_hw_queue(hctx, async); 1273 } 1274 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 1275 1276 static void blk_mq_run_work_fn(struct work_struct *work) 1277 { 1278 struct blk_mq_hw_ctx *hctx; 1279 1280 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); 1281 1282 /* 1283 * If we are stopped, don't run the queue. The exception is if 1284 * BLK_MQ_S_START_ON_RUN is set. For that case, we auto-clear 1285 * the STOPPED bit and run it. 1286 */ 1287 if (test_bit(BLK_MQ_S_STOPPED, &hctx->state)) { 1288 if (!test_bit(BLK_MQ_S_START_ON_RUN, &hctx->state)) 1289 return; 1290 1291 clear_bit(BLK_MQ_S_START_ON_RUN, &hctx->state); 1292 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 1293 } 1294 1295 __blk_mq_run_hw_queue(hctx); 1296 } 1297 1298 1299 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 1300 { 1301 if (unlikely(!blk_mq_hw_queue_mapped(hctx))) 1302 return; 1303 1304 /* 1305 * Stop the hw queue, then modify currently delayed work. 1306 * This should prevent us from running the queue prematurely. 1307 * Mark the queue as auto-clearing STOPPED when it runs. 
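	 *
	 * A sketch of how a driver's ->queue_rq() might use this when the
	 * device is temporarily busy (my_device_busy() and the 100ms delay
	 * are made up for illustration):
	 *
	 *	if (my_device_busy(dev)) {
	 *		blk_mq_delay_queue(hctx, 100);
	 *		return BLK_MQ_RQ_QUEUE_BUSY;
	 *	}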
1308 */ 1309 blk_mq_stop_hw_queue(hctx); 1310 set_bit(BLK_MQ_S_START_ON_RUN, &hctx->state); 1311 kblockd_mod_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 1312 &hctx->run_work, 1313 msecs_to_jiffies(msecs)); 1314 } 1315 EXPORT_SYMBOL(blk_mq_delay_queue); 1316 1317 static inline void __blk_mq_insert_req_list(struct blk_mq_hw_ctx *hctx, 1318 struct request *rq, 1319 bool at_head) 1320 { 1321 struct blk_mq_ctx *ctx = rq->mq_ctx; 1322 1323 trace_block_rq_insert(hctx->queue, rq); 1324 1325 if (at_head) 1326 list_add(&rq->queuelist, &ctx->rq_list); 1327 else 1328 list_add_tail(&rq->queuelist, &ctx->rq_list); 1329 } 1330 1331 void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, 1332 bool at_head) 1333 { 1334 struct blk_mq_ctx *ctx = rq->mq_ctx; 1335 1336 __blk_mq_insert_req_list(hctx, rq, at_head); 1337 blk_mq_hctx_mark_pending(hctx, ctx); 1338 } 1339 1340 void blk_mq_insert_requests(struct blk_mq_hw_ctx *hctx, struct blk_mq_ctx *ctx, 1341 struct list_head *list) 1342 1343 { 1344 /* 1345 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1346 * offline now 1347 */ 1348 spin_lock(&ctx->lock); 1349 while (!list_empty(list)) { 1350 struct request *rq; 1351 1352 rq = list_first_entry(list, struct request, queuelist); 1353 BUG_ON(rq->mq_ctx != ctx); 1354 list_del_init(&rq->queuelist); 1355 __blk_mq_insert_req_list(hctx, rq, false); 1356 } 1357 blk_mq_hctx_mark_pending(hctx, ctx); 1358 spin_unlock(&ctx->lock); 1359 } 1360 1361 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1362 { 1363 struct request *rqa = container_of(a, struct request, queuelist); 1364 struct request *rqb = container_of(b, struct request, queuelist); 1365 1366 return !(rqa->mq_ctx < rqb->mq_ctx || 1367 (rqa->mq_ctx == rqb->mq_ctx && 1368 blk_rq_pos(rqa) < blk_rq_pos(rqb))); 1369 } 1370 1371 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1372 { 1373 struct blk_mq_ctx *this_ctx; 1374 struct request_queue *this_q; 1375 struct request *rq; 1376 LIST_HEAD(list); 1377 LIST_HEAD(ctx_list); 1378 unsigned int depth; 1379 1380 list_splice_init(&plug->mq_list, &list); 1381 1382 list_sort(NULL, &list, plug_ctx_cmp); 1383 1384 this_q = NULL; 1385 this_ctx = NULL; 1386 depth = 0; 1387 1388 while (!list_empty(&list)) { 1389 rq = list_entry_rq(list.next); 1390 list_del_init(&rq->queuelist); 1391 BUG_ON(!rq->q); 1392 if (rq->mq_ctx != this_ctx) { 1393 if (this_ctx) { 1394 trace_block_unplug(this_q, depth, from_schedule); 1395 blk_mq_sched_insert_requests(this_q, this_ctx, 1396 &ctx_list, 1397 from_schedule); 1398 } 1399 1400 this_ctx = rq->mq_ctx; 1401 this_q = rq->q; 1402 depth = 0; 1403 } 1404 1405 depth++; 1406 list_add_tail(&rq->queuelist, &ctx_list); 1407 } 1408 1409 /* 1410 * If 'this_ctx' is set, we know we have entries to complete 1411 * on 'ctx_list'. Do those. 
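	 *
	 * For reference, submitters reach this path through the generic
	 * plugging API (sketch, not taken from this file):
	 *
	 *	struct blk_plug plug;
	 *
	 *	blk_start_plug(&plug);
	 *	... submit_bio() calls that land in blk_mq_make_request() ...
	 *	blk_finish_plug(&plug);
	 *
	 * blk_finish_plug() ends up here via blk_flush_plug_list().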
1412 */ 1413 if (this_ctx) { 1414 trace_block_unplug(this_q, depth, from_schedule); 1415 blk_mq_sched_insert_requests(this_q, this_ctx, &ctx_list, 1416 from_schedule); 1417 } 1418 } 1419 1420 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1421 { 1422 blk_init_request_from_bio(rq, bio); 1423 1424 blk_account_io_start(rq, true); 1425 } 1426 1427 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) 1428 { 1429 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 1430 !blk_queue_nomerges(hctx->queue); 1431 } 1432 1433 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, 1434 struct blk_mq_ctx *ctx, 1435 struct request *rq, struct bio *bio) 1436 { 1437 if (!hctx_allow_merges(hctx) || !bio_mergeable(bio)) { 1438 blk_mq_bio_to_request(rq, bio); 1439 spin_lock(&ctx->lock); 1440 insert_rq: 1441 __blk_mq_insert_request(hctx, rq, false); 1442 spin_unlock(&ctx->lock); 1443 return false; 1444 } else { 1445 struct request_queue *q = hctx->queue; 1446 1447 spin_lock(&ctx->lock); 1448 if (!blk_mq_attempt_merge(q, ctx, bio)) { 1449 blk_mq_bio_to_request(rq, bio); 1450 goto insert_rq; 1451 } 1452 1453 spin_unlock(&ctx->lock); 1454 __blk_mq_finish_request(hctx, ctx, rq); 1455 return true; 1456 } 1457 } 1458 1459 static blk_qc_t request_to_qc_t(struct blk_mq_hw_ctx *hctx, struct request *rq) 1460 { 1461 if (rq->tag != -1) 1462 return blk_tag_to_qc_t(rq->tag, hctx->queue_num, false); 1463 1464 return blk_tag_to_qc_t(rq->internal_tag, hctx->queue_num, true); 1465 } 1466 1467 static void __blk_mq_try_issue_directly(struct request *rq, blk_qc_t *cookie, 1468 bool may_sleep) 1469 { 1470 struct request_queue *q = rq->q; 1471 struct blk_mq_queue_data bd = { 1472 .rq = rq, 1473 .last = true, 1474 }; 1475 struct blk_mq_hw_ctx *hctx; 1476 blk_qc_t new_cookie; 1477 int ret; 1478 1479 if (q->elevator) 1480 goto insert; 1481 1482 if (!blk_mq_get_driver_tag(rq, &hctx, false)) 1483 goto insert; 1484 1485 new_cookie = request_to_qc_t(hctx, rq); 1486 1487 /* 1488 * For OK queue, we are done. For error, kill it. 
Any other 1489 * error (busy), just add it to our list as we previously 1490 * would have done 1491 */ 1492 ret = q->mq_ops->queue_rq(hctx, &bd); 1493 if (ret == BLK_MQ_RQ_QUEUE_OK) { 1494 *cookie = new_cookie; 1495 return; 1496 } 1497 1498 if (ret == BLK_MQ_RQ_QUEUE_ERROR) { 1499 *cookie = BLK_QC_T_NONE; 1500 blk_mq_end_request(rq, -EIO); 1501 return; 1502 } 1503 1504 __blk_mq_requeue_request(rq); 1505 insert: 1506 blk_mq_sched_insert_request(rq, false, true, false, may_sleep); 1507 } 1508 1509 static void blk_mq_try_issue_directly(struct blk_mq_hw_ctx *hctx, 1510 struct request *rq, blk_qc_t *cookie) 1511 { 1512 if (!(hctx->flags & BLK_MQ_F_BLOCKING)) { 1513 rcu_read_lock(); 1514 __blk_mq_try_issue_directly(rq, cookie, false); 1515 rcu_read_unlock(); 1516 } else { 1517 unsigned int srcu_idx; 1518 1519 might_sleep(); 1520 1521 srcu_idx = srcu_read_lock(&hctx->queue_rq_srcu); 1522 __blk_mq_try_issue_directly(rq, cookie, true); 1523 srcu_read_unlock(&hctx->queue_rq_srcu, srcu_idx); 1524 } 1525 } 1526 1527 static blk_qc_t blk_mq_make_request(struct request_queue *q, struct bio *bio) 1528 { 1529 const int is_sync = op_is_sync(bio->bi_opf); 1530 const int is_flush_fua = op_is_flush(bio->bi_opf); 1531 struct blk_mq_alloc_data data = { .flags = 0 }; 1532 struct request *rq; 1533 unsigned int request_count = 0; 1534 struct blk_plug *plug; 1535 struct request *same_queue_rq = NULL; 1536 blk_qc_t cookie; 1537 unsigned int wb_acct; 1538 1539 blk_queue_bounce(q, &bio); 1540 1541 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1542 bio_io_error(bio); 1543 return BLK_QC_T_NONE; 1544 } 1545 1546 blk_queue_split(q, &bio, q->bio_split); 1547 1548 if (!is_flush_fua && !blk_queue_nomerges(q) && 1549 blk_attempt_plug_merge(q, bio, &request_count, &same_queue_rq)) 1550 return BLK_QC_T_NONE; 1551 1552 if (blk_mq_sched_bio_merge(q, bio)) 1553 return BLK_QC_T_NONE; 1554 1555 wb_acct = wbt_wait(q->rq_wb, bio, NULL); 1556 1557 trace_block_getrq(q, bio, bio->bi_opf); 1558 1559 rq = blk_mq_sched_get_request(q, bio, bio->bi_opf, &data); 1560 if (unlikely(!rq)) { 1561 __wbt_done(q->rq_wb, wb_acct); 1562 return BLK_QC_T_NONE; 1563 } 1564 1565 wbt_track(&rq->issue_stat, wb_acct); 1566 1567 cookie = request_to_qc_t(data.hctx, rq); 1568 1569 plug = current->plug; 1570 if (unlikely(is_flush_fua)) { 1571 blk_mq_put_ctx(data.ctx); 1572 blk_mq_bio_to_request(rq, bio); 1573 if (q->elevator) { 1574 blk_mq_sched_insert_request(rq, false, true, true, 1575 true); 1576 } else { 1577 blk_insert_flush(rq); 1578 blk_mq_run_hw_queue(data.hctx, true); 1579 } 1580 } else if (plug && q->nr_hw_queues == 1) { 1581 struct request *last = NULL; 1582 1583 blk_mq_put_ctx(data.ctx); 1584 blk_mq_bio_to_request(rq, bio); 1585 1586 /* 1587 * @request_count may become stale because of schedule 1588 * out, so check the list again. 1589 */ 1590 if (list_empty(&plug->mq_list)) 1591 request_count = 0; 1592 else if (blk_queue_nomerges(q)) 1593 request_count = blk_plug_queued_count(q); 1594 1595 if (!request_count) 1596 trace_block_plug(q); 1597 else 1598 last = list_entry_rq(plug->mq_list.prev); 1599 1600 if (request_count >= BLK_MAX_REQUEST_COUNT || (last && 1601 blk_rq_bytes(last) >= BLK_PLUG_FLUSH_SIZE)) { 1602 blk_flush_plug_list(plug, false); 1603 trace_block_plug(q); 1604 } 1605 1606 list_add_tail(&rq->queuelist, &plug->mq_list); 1607 } else if (plug && !blk_queue_nomerges(q)) { 1608 blk_mq_bio_to_request(rq, bio); 1609 1610 /* 1611 * We do limited plugging. If the bio can be merged, do that. 
1612 * Otherwise the existing request in the plug list will be 1613 * issued. So the plug list will have one request at most 1614 * The plug list might get flushed before this. If that happens, 1615 * the plug list is empty, and same_queue_rq is invalid. 1616 */ 1617 if (list_empty(&plug->mq_list)) 1618 same_queue_rq = NULL; 1619 if (same_queue_rq) 1620 list_del_init(&same_queue_rq->queuelist); 1621 list_add_tail(&rq->queuelist, &plug->mq_list); 1622 1623 blk_mq_put_ctx(data.ctx); 1624 1625 if (same_queue_rq) 1626 blk_mq_try_issue_directly(data.hctx, same_queue_rq, 1627 &cookie); 1628 } else if (q->nr_hw_queues > 1 && is_sync) { 1629 blk_mq_put_ctx(data.ctx); 1630 blk_mq_bio_to_request(rq, bio); 1631 blk_mq_try_issue_directly(data.hctx, rq, &cookie); 1632 } else if (q->elevator) { 1633 blk_mq_put_ctx(data.ctx); 1634 blk_mq_bio_to_request(rq, bio); 1635 blk_mq_sched_insert_request(rq, false, true, true, true); 1636 } else if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1637 blk_mq_put_ctx(data.ctx); 1638 blk_mq_run_hw_queue(data.hctx, true); 1639 } else 1640 blk_mq_put_ctx(data.ctx); 1641 1642 return cookie; 1643 } 1644 1645 void blk_mq_free_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 1646 unsigned int hctx_idx) 1647 { 1648 struct page *page; 1649 1650 if (tags->rqs && set->ops->exit_request) { 1651 int i; 1652 1653 for (i = 0; i < tags->nr_tags; i++) { 1654 struct request *rq = tags->static_rqs[i]; 1655 1656 if (!rq) 1657 continue; 1658 set->ops->exit_request(set->driver_data, rq, 1659 hctx_idx, i); 1660 tags->static_rqs[i] = NULL; 1661 } 1662 } 1663 1664 while (!list_empty(&tags->page_list)) { 1665 page = list_first_entry(&tags->page_list, struct page, lru); 1666 list_del_init(&page->lru); 1667 /* 1668 * Remove kmemleak object previously allocated in 1669 * blk_mq_init_rq_map(). 
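	 *
	 * (In this file the matching kmemleak_alloc() is issued from
	 *  blk_mq_alloc_rqs() below, when the request pages are allocated.)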
1670 */ 1671 kmemleak_free(page_address(page)); 1672 __free_pages(page, page->private); 1673 } 1674 } 1675 1676 void blk_mq_free_rq_map(struct blk_mq_tags *tags) 1677 { 1678 kfree(tags->rqs); 1679 tags->rqs = NULL; 1680 kfree(tags->static_rqs); 1681 tags->static_rqs = NULL; 1682 1683 blk_mq_free_tags(tags); 1684 } 1685 1686 struct blk_mq_tags *blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, 1687 unsigned int hctx_idx, 1688 unsigned int nr_tags, 1689 unsigned int reserved_tags) 1690 { 1691 struct blk_mq_tags *tags; 1692 int node; 1693 1694 node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); 1695 if (node == NUMA_NO_NODE) 1696 node = set->numa_node; 1697 1698 tags = blk_mq_init_tags(nr_tags, reserved_tags, node, 1699 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); 1700 if (!tags) 1701 return NULL; 1702 1703 tags->rqs = kzalloc_node(nr_tags * sizeof(struct request *), 1704 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 1705 node); 1706 if (!tags->rqs) { 1707 blk_mq_free_tags(tags); 1708 return NULL; 1709 } 1710 1711 tags->static_rqs = kzalloc_node(nr_tags * sizeof(struct request *), 1712 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY, 1713 node); 1714 if (!tags->static_rqs) { 1715 kfree(tags->rqs); 1716 blk_mq_free_tags(tags); 1717 return NULL; 1718 } 1719 1720 return tags; 1721 } 1722 1723 static size_t order_to_size(unsigned int order) 1724 { 1725 return (size_t)PAGE_SIZE << order; 1726 } 1727 1728 int blk_mq_alloc_rqs(struct blk_mq_tag_set *set, struct blk_mq_tags *tags, 1729 unsigned int hctx_idx, unsigned int depth) 1730 { 1731 unsigned int i, j, entries_per_page, max_order = 4; 1732 size_t rq_size, left; 1733 int node; 1734 1735 node = blk_mq_hw_queue_to_node(set->mq_map, hctx_idx); 1736 if (node == NUMA_NO_NODE) 1737 node = set->numa_node; 1738 1739 INIT_LIST_HEAD(&tags->page_list); 1740 1741 /* 1742 * rq_size is the size of the request plus driver payload, rounded 1743 * to the cacheline size 1744 */ 1745 rq_size = round_up(sizeof(struct request) + set->cmd_size, 1746 cache_line_size()); 1747 left = rq_size * depth; 1748 1749 for (i = 0; i < depth; ) { 1750 int this_order = max_order; 1751 struct page *page; 1752 int to_do; 1753 void *p; 1754 1755 while (this_order && left < order_to_size(this_order - 1)) 1756 this_order--; 1757 1758 do { 1759 page = alloc_pages_node(node, 1760 GFP_NOIO | __GFP_NOWARN | __GFP_NORETRY | __GFP_ZERO, 1761 this_order); 1762 if (page) 1763 break; 1764 if (!this_order--) 1765 break; 1766 if (order_to_size(this_order) < rq_size) 1767 break; 1768 } while (1); 1769 1770 if (!page) 1771 goto fail; 1772 1773 page->private = this_order; 1774 list_add_tail(&page->lru, &tags->page_list); 1775 1776 p = page_address(page); 1777 /* 1778 * Allow kmemleak to scan these pages as they contain pointers 1779 * to additional allocations like via ops->init_request(). 1780 */ 1781 kmemleak_alloc(p, order_to_size(this_order), 1, GFP_NOIO); 1782 entries_per_page = order_to_size(this_order) / rq_size; 1783 to_do = min(entries_per_page, depth - i); 1784 left -= to_do * rq_size; 1785 for (j = 0; j < to_do; j++) { 1786 struct request *rq = p; 1787 1788 tags->static_rqs[i] = rq; 1789 if (set->ops->init_request) { 1790 if (set->ops->init_request(set->driver_data, 1791 rq, hctx_idx, i, 1792 node)) { 1793 tags->static_rqs[i] = NULL; 1794 goto fail; 1795 } 1796 } 1797 1798 p += rq_size; 1799 i++; 1800 } 1801 } 1802 return 0; 1803 1804 fail: 1805 blk_mq_free_rqs(set, tags, hctx_idx); 1806 return -ENOMEM; 1807 } 1808 1809 /* 1810 * 'cpu' is going away. 
splice any existing rq_list entries from this 1811 * software queue to the hw queue dispatch list, and ensure that it 1812 * gets run. 1813 */ 1814 static int blk_mq_hctx_notify_dead(unsigned int cpu, struct hlist_node *node) 1815 { 1816 struct blk_mq_hw_ctx *hctx; 1817 struct blk_mq_ctx *ctx; 1818 LIST_HEAD(tmp); 1819 1820 hctx = hlist_entry_safe(node, struct blk_mq_hw_ctx, cpuhp_dead); 1821 ctx = __blk_mq_get_ctx(hctx->queue, cpu); 1822 1823 spin_lock(&ctx->lock); 1824 if (!list_empty(&ctx->rq_list)) { 1825 list_splice_init(&ctx->rq_list, &tmp); 1826 blk_mq_hctx_clear_pending(hctx, ctx); 1827 } 1828 spin_unlock(&ctx->lock); 1829 1830 if (list_empty(&tmp)) 1831 return 0; 1832 1833 spin_lock(&hctx->lock); 1834 list_splice_tail_init(&tmp, &hctx->dispatch); 1835 spin_unlock(&hctx->lock); 1836 1837 blk_mq_run_hw_queue(hctx, true); 1838 return 0; 1839 } 1840 1841 static void blk_mq_remove_cpuhp(struct blk_mq_hw_ctx *hctx) 1842 { 1843 cpuhp_state_remove_instance_nocalls(CPUHP_BLK_MQ_DEAD, 1844 &hctx->cpuhp_dead); 1845 } 1846 1847 /* hctx->ctxs will be freed in queue's release handler */ 1848 static void blk_mq_exit_hctx(struct request_queue *q, 1849 struct blk_mq_tag_set *set, 1850 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 1851 { 1852 unsigned flush_start_tag = set->queue_depth; 1853 1854 blk_mq_tag_idle(hctx); 1855 1856 if (set->ops->exit_request) 1857 set->ops->exit_request(set->driver_data, 1858 hctx->fq->flush_rq, hctx_idx, 1859 flush_start_tag + hctx_idx); 1860 1861 blk_mq_sched_exit_hctx(q, hctx, hctx_idx); 1862 1863 if (set->ops->exit_hctx) 1864 set->ops->exit_hctx(hctx, hctx_idx); 1865 1866 if (hctx->flags & BLK_MQ_F_BLOCKING) 1867 cleanup_srcu_struct(&hctx->queue_rq_srcu); 1868 1869 blk_mq_remove_cpuhp(hctx); 1870 blk_free_flush_queue(hctx->fq); 1871 sbitmap_free(&hctx->ctx_map); 1872 } 1873 1874 static void blk_mq_exit_hw_queues(struct request_queue *q, 1875 struct blk_mq_tag_set *set, int nr_queue) 1876 { 1877 struct blk_mq_hw_ctx *hctx; 1878 unsigned int i; 1879 1880 queue_for_each_hw_ctx(q, hctx, i) { 1881 if (i == nr_queue) 1882 break; 1883 blk_mq_exit_hctx(q, set, hctx, i); 1884 } 1885 } 1886 1887 static int blk_mq_init_hctx(struct request_queue *q, 1888 struct blk_mq_tag_set *set, 1889 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 1890 { 1891 int node; 1892 unsigned flush_start_tag = set->queue_depth; 1893 1894 node = hctx->numa_node; 1895 if (node == NUMA_NO_NODE) 1896 node = hctx->numa_node = set->numa_node; 1897 1898 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 1899 spin_lock_init(&hctx->lock); 1900 INIT_LIST_HEAD(&hctx->dispatch); 1901 hctx->queue = q; 1902 hctx->queue_num = hctx_idx; 1903 hctx->flags = set->flags & ~BLK_MQ_F_TAG_SHARED; 1904 1905 cpuhp_state_add_instance_nocalls(CPUHP_BLK_MQ_DEAD, &hctx->cpuhp_dead); 1906 1907 hctx->tags = set->tags[hctx_idx]; 1908 1909 /* 1910 * Allocate space for all possible cpus to avoid allocation at 1911 * runtime 1912 */ 1913 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 1914 GFP_KERNEL, node); 1915 if (!hctx->ctxs) 1916 goto unregister_cpu_notifier; 1917 1918 if (sbitmap_init_node(&hctx->ctx_map, nr_cpu_ids, ilog2(8), GFP_KERNEL, 1919 node)) 1920 goto free_ctxs; 1921 1922 hctx->nr_ctx = 0; 1923 1924 if (set->ops->init_hctx && 1925 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 1926 goto free_bitmap; 1927 1928 if (blk_mq_sched_init_hctx(q, hctx, hctx_idx)) 1929 goto exit_hctx; 1930 1931 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); 1932 if (!hctx->fq) 1933 goto 
sched_exit_hctx; 1934 1935 if (set->ops->init_request && 1936 set->ops->init_request(set->driver_data, 1937 hctx->fq->flush_rq, hctx_idx, 1938 flush_start_tag + hctx_idx, node)) 1939 goto free_fq; 1940 1941 if (hctx->flags & BLK_MQ_F_BLOCKING) 1942 init_srcu_struct(&hctx->queue_rq_srcu); 1943 1944 return 0; 1945 1946 free_fq: 1947 kfree(hctx->fq); 1948 sched_exit_hctx: 1949 blk_mq_sched_exit_hctx(q, hctx, hctx_idx); 1950 exit_hctx: 1951 if (set->ops->exit_hctx) 1952 set->ops->exit_hctx(hctx, hctx_idx); 1953 free_bitmap: 1954 sbitmap_free(&hctx->ctx_map); 1955 free_ctxs: 1956 kfree(hctx->ctxs); 1957 unregister_cpu_notifier: 1958 blk_mq_remove_cpuhp(hctx); 1959 return -1; 1960 } 1961 1962 static void blk_mq_init_cpu_queues(struct request_queue *q, 1963 unsigned int nr_hw_queues) 1964 { 1965 unsigned int i; 1966 1967 for_each_possible_cpu(i) { 1968 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 1969 struct blk_mq_hw_ctx *hctx; 1970 1971 __ctx->cpu = i; 1972 spin_lock_init(&__ctx->lock); 1973 INIT_LIST_HEAD(&__ctx->rq_list); 1974 __ctx->queue = q; 1975 1976 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1977 if (!cpu_online(i)) 1978 continue; 1979 1980 hctx = blk_mq_map_queue(q, i); 1981 1982 /* 1983 * Set local node, IFF we have more than one hw queue. If 1984 * not, we remain on the home node of the device 1985 */ 1986 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 1987 hctx->numa_node = local_memory_node(cpu_to_node(i)); 1988 } 1989 } 1990 1991 static bool __blk_mq_alloc_rq_map(struct blk_mq_tag_set *set, int hctx_idx) 1992 { 1993 int ret = 0; 1994 1995 set->tags[hctx_idx] = blk_mq_alloc_rq_map(set, hctx_idx, 1996 set->queue_depth, set->reserved_tags); 1997 if (!set->tags[hctx_idx]) 1998 return false; 1999 2000 ret = blk_mq_alloc_rqs(set, set->tags[hctx_idx], hctx_idx, 2001 set->queue_depth); 2002 if (!ret) 2003 return true; 2004 2005 blk_mq_free_rq_map(set->tags[hctx_idx]); 2006 set->tags[hctx_idx] = NULL; 2007 return false; 2008 } 2009 2010 static void blk_mq_free_map_and_requests(struct blk_mq_tag_set *set, 2011 unsigned int hctx_idx) 2012 { 2013 if (set->tags[hctx_idx]) { 2014 blk_mq_free_rqs(set, set->tags[hctx_idx], hctx_idx); 2015 blk_mq_free_rq_map(set->tags[hctx_idx]); 2016 set->tags[hctx_idx] = NULL; 2017 } 2018 } 2019 2020 static void blk_mq_map_swqueue(struct request_queue *q, 2021 const struct cpumask *online_mask) 2022 { 2023 unsigned int i, hctx_idx; 2024 struct blk_mq_hw_ctx *hctx; 2025 struct blk_mq_ctx *ctx; 2026 struct blk_mq_tag_set *set = q->tag_set; 2027 2028 /* 2029 * Avoid others reading imcomplete hctx->cpumask through sysfs 2030 */ 2031 mutex_lock(&q->sysfs_lock); 2032 2033 queue_for_each_hw_ctx(q, hctx, i) { 2034 cpumask_clear(hctx->cpumask); 2035 hctx->nr_ctx = 0; 2036 } 2037 2038 /* 2039 * Map software to hardware queues 2040 */ 2041 for_each_possible_cpu(i) { 2042 /* If the cpu isn't online, the cpu is mapped to first hctx */ 2043 if (!cpumask_test_cpu(i, online_mask)) 2044 continue; 2045 2046 hctx_idx = q->mq_map[i]; 2047 /* unmapped hw queue can be remapped after CPU topo changed */ 2048 if (!set->tags[hctx_idx] && 2049 !__blk_mq_alloc_rq_map(set, hctx_idx)) { 2050 /* 2051 * If tags initialization fail for some hctx, 2052 * that hctx won't be brought online. 

static void blk_mq_map_swqueue(struct request_queue *q,
			       const struct cpumask *online_mask)
{
	unsigned int i, hctx_idx;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	struct blk_mq_tag_set *set = q->tag_set;

	/*
	 * Avoid others reading incomplete hctx->cpumask through sysfs
	 */
	mutex_lock(&q->sysfs_lock);

	queue_for_each_hw_ctx(q, hctx, i) {
		cpumask_clear(hctx->cpumask);
		hctx->nr_ctx = 0;
	}

	/*
	 * Map software to hardware queues
	 */
	for_each_possible_cpu(i) {
		/* If the cpu isn't online, the cpu is mapped to the first hctx */
		if (!cpumask_test_cpu(i, online_mask))
			continue;

		hctx_idx = q->mq_map[i];
		/* unmapped hw queue can be remapped after CPU topo changed */
		if (!set->tags[hctx_idx] &&
		    !__blk_mq_alloc_rq_map(set, hctx_idx)) {
			/*
			 * If tags initialization fails for some hctx,
			 * that hctx won't be brought online.  In this
			 * case, remap the current ctx to hctx[0], which
			 * is guaranteed to always have tags allocated.
			 */
			q->mq_map[i] = 0;
		}

		ctx = per_cpu_ptr(q->queue_ctx, i);
		hctx = blk_mq_map_queue(q, i);

		cpumask_set_cpu(i, hctx->cpumask);
		ctx->index_hw = hctx->nr_ctx;
		hctx->ctxs[hctx->nr_ctx++] = ctx;
	}

	mutex_unlock(&q->sysfs_lock);

	queue_for_each_hw_ctx(q, hctx, i) {
		/*
		 * If no software queues are mapped to this hardware queue,
		 * disable it and free the request entries.
		 */
		if (!hctx->nr_ctx) {
			/*
			 * Never unmap queue 0.  We need it as a
			 * fallback in case a new remap fails to
			 * allocate tags.
			 */
			if (i && set->tags[i])
				blk_mq_free_map_and_requests(set, i);

			hctx->tags = NULL;
			continue;
		}

		hctx->tags = set->tags[i];
		WARN_ON(!hctx->tags);

		/*
		 * Set the map size to the number of mapped software queues.
		 * This is more accurate and more efficient than looping
		 * over all possibly mapped software queues.
		 */
		sbitmap_resize(&hctx->ctx_map, hctx->nr_ctx);

		/*
		 * Initialize batch round-robin counts
		 */
		hctx->next_cpu = cpumask_first(hctx->cpumask);
		hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH;
	}
}

static void queue_set_hctx_shared(struct request_queue *q, bool shared)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (shared)
			hctx->flags |= BLK_MQ_F_TAG_SHARED;
		else
			hctx->flags &= ~BLK_MQ_F_TAG_SHARED;
	}
}

static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set, bool shared)
{
	struct request_queue *q;

	lockdep_assert_held(&set->tag_list_lock);

	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		blk_mq_freeze_queue(q);
		queue_set_hctx_shared(q, shared);
		blk_mq_unfreeze_queue(q);
	}
}

static void blk_mq_del_queue_tag_set(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;

	mutex_lock(&set->tag_list_lock);
	list_del_rcu(&q->tag_set_list);
	INIT_LIST_HEAD(&q->tag_set_list);
	if (list_is_singular(&set->tag_list)) {
		/* just transitioned to unshared */
		set->flags &= ~BLK_MQ_F_TAG_SHARED;
		/* update existing queue */
		blk_mq_update_tag_set_depth(set, false);
	}
	mutex_unlock(&set->tag_list_lock);

	synchronize_rcu();
}

static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set,
				     struct request_queue *q)
{
	q->tag_set = set;

	mutex_lock(&set->tag_list_lock);

	/* Check to see if we're transitioning to shared (from 1 to 2 queues). */
	if (!list_empty(&set->tag_list) && !(set->flags & BLK_MQ_F_TAG_SHARED)) {
		set->flags |= BLK_MQ_F_TAG_SHARED;
		/* update existing queue */
		blk_mq_update_tag_set_depth(set, true);
	}
	if (set->flags & BLK_MQ_F_TAG_SHARED)
		queue_set_hctx_shared(q, true);
	list_add_tail_rcu(&q->tag_set_list, &set->tag_list);

	mutex_unlock(&set->tag_list_lock);
}
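
/*
 * Example (illustrative, not part of the original file): a driver that
 * creates two request queues on top of a single tag_set ends up with shared
 * tags.  The variables are hypothetical; the calls are the real API.
 *
 *	q0 = blk_mq_init_queue(&set);	// one queue on the set, tags unshared
 *	q1 = blk_mq_init_queue(&set);	// second add reaches
 *					// blk_mq_add_queue_tag_set() above and
 *					// flips the set and every existing hctx
 *					// to BLK_MQ_F_TAG_SHARED
 */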

/*
 * This is the actual release handler for blk-mq, but we do it from the
 * request queue's release handler to avoid a use-after-free: q->mq_kobj
 * arguably shouldn't have been introduced, but we can't group the ctx/hctx
 * kobjects without it.
 */
void blk_mq_release(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	/* hctx kobj stays in hctx */
	queue_for_each_hw_ctx(q, hctx, i) {
		if (!hctx)
			continue;
		kobject_put(&hctx->kobj);
	}

	q->mq_map = NULL;

	kfree(q->queue_hw_ctx);

	/*
	 * release .mq_kobj and sw queue's kobject now because
	 * both share lifetime with request queue.
	 */
	blk_mq_sysfs_deinit(q);

	free_percpu(q->queue_ctx);
}

struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set)
{
	struct request_queue *uninit_q, *q;

	uninit_q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node);
	if (!uninit_q)
		return ERR_PTR(-ENOMEM);

	q = blk_mq_init_allocated_queue(set, uninit_q);
	if (IS_ERR(q))
		blk_cleanup_queue(uninit_q);

	return q;
}
EXPORT_SYMBOL(blk_mq_init_queue);
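
/*
 * Example (illustrative sketch, not part of the original file): typical
 * driver-side use of blk_mq_init_queue().  "my_dev" is hypothetical.
 *
 *	struct request_queue *q;
 *
 *	q = blk_mq_init_queue(&my_dev->tag_set);
 *	if (IS_ERR(q))
 *		return PTR_ERR(q);
 *	my_dev->queue = q;
 *
 * On failure the partially set up queue has already been torn down via
 * blk_cleanup_queue() above, so the caller only needs the IS_ERR() check.
 */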

static void blk_mq_realloc_hw_ctxs(struct blk_mq_tag_set *set,
				   struct request_queue *q)
{
	int i, j;
	struct blk_mq_hw_ctx **hctxs = q->queue_hw_ctx;

	blk_mq_sysfs_unregister(q);
	for (i = 0; i < set->nr_hw_queues; i++) {
		int node;

		if (hctxs[i])
			continue;

		node = blk_mq_hw_queue_to_node(q->mq_map, i);
		hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx),
					GFP_KERNEL, node);
		if (!hctxs[i])
			break;

		if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL,
					     node)) {
			kfree(hctxs[i]);
			hctxs[i] = NULL;
			break;
		}

		atomic_set(&hctxs[i]->nr_active, 0);
		hctxs[i]->numa_node = node;
		hctxs[i]->queue_num = i;

		if (blk_mq_init_hctx(q, set, hctxs[i], i)) {
			free_cpumask_var(hctxs[i]->cpumask);
			kfree(hctxs[i]);
			hctxs[i] = NULL;
			break;
		}
		blk_mq_hctx_kobj_init(hctxs[i]);
	}
	for (j = i; j < q->nr_hw_queues; j++) {
		struct blk_mq_hw_ctx *hctx = hctxs[j];

		if (hctx) {
			if (hctx->tags)
				blk_mq_free_map_and_requests(set, j);
			blk_mq_exit_hctx(q, set, hctx, j);
			kobject_put(&hctx->kobj);
			hctxs[j] = NULL;
		}
	}
	q->nr_hw_queues = i;
	blk_mq_sysfs_register(q);
}

struct request_queue *blk_mq_init_allocated_queue(struct blk_mq_tag_set *set,
						  struct request_queue *q)
{
	/* mark the queue as mq asap */
	q->mq_ops = set->ops;

	q->poll_cb = blk_stat_alloc_callback(blk_mq_poll_stats_fn,
					     blk_mq_poll_stats_bkt,
					     BLK_MQ_POLL_STATS_BKTS, q);
	if (!q->poll_cb)
		goto err_exit;

	q->queue_ctx = alloc_percpu(struct blk_mq_ctx);
	if (!q->queue_ctx)
		goto err_exit;

	/* init q->mq_kobj and sw queues' kobjects */
	blk_mq_sysfs_init(q);

	q->queue_hw_ctx = kzalloc_node(nr_cpu_ids * sizeof(*(q->queue_hw_ctx)),
				       GFP_KERNEL, set->numa_node);
	if (!q->queue_hw_ctx)
		goto err_percpu;

	q->mq_map = set->mq_map;

	blk_mq_realloc_hw_ctxs(set, q);
	if (!q->nr_hw_queues)
		goto err_hctxs;

	INIT_WORK(&q->timeout_work, blk_mq_timeout_work);
	blk_queue_rq_timeout(q, set->timeout ? set->timeout : 30 * HZ);

	q->nr_queues = nr_cpu_ids;

	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

	if (!(set->flags & BLK_MQ_F_SG_MERGE))
		q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE;

	q->sg_reserved_size = INT_MAX;

	INIT_DELAYED_WORK(&q->requeue_work, blk_mq_requeue_work);
	INIT_LIST_HEAD(&q->requeue_list);
	spin_lock_init(&q->requeue_lock);

	blk_queue_make_request(q, blk_mq_make_request);

	/*
	 * Do this after blk_queue_make_request() overrides it...
	 */
	q->nr_requests = set->queue_depth;

	/*
	 * Default to classic polling
	 */
	q->poll_nsec = -1;

	if (set->ops->complete)
		blk_queue_softirq_done(q, set->ops->complete);

	blk_mq_init_cpu_queues(q, set->nr_hw_queues);

	get_online_cpus();
	mutex_lock(&all_q_mutex);

	list_add_tail(&q->all_q_node, &all_q_list);
	blk_mq_add_queue_tag_set(set, q);
	blk_mq_map_swqueue(q, cpu_online_mask);

	mutex_unlock(&all_q_mutex);
	put_online_cpus();

	if (!(set->flags & BLK_MQ_F_NO_SCHED)) {
		int ret;

		ret = blk_mq_sched_init(q);
		if (ret)
			return ERR_PTR(ret);
	}

	return q;

err_hctxs:
	kfree(q->queue_hw_ctx);
err_percpu:
	free_percpu(q->queue_ctx);
err_exit:
	q->mq_ops = NULL;
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_allocated_queue);

void blk_mq_free_queue(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;

	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);

	blk_mq_del_queue_tag_set(q);

	blk_mq_exit_hw_queues(q, set, set->nr_hw_queues);
}

/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q,
				const struct cpumask *online_mask)
{
	WARN_ON_ONCE(!atomic_read(&q->mq_freeze_depth));

	blk_mq_sysfs_unregister(q);

	/*
	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
	 * we should change hctx numa_node according to the new topology (this
	 * involves freeing and re-allocating memory; is it worth doing?)
	 */

	blk_mq_map_swqueue(q, online_mask);

	blk_mq_sysfs_register(q);
}

/*
 * New online cpumask which is going to be set in this hotplug event.
 * Declare this cpumask as global because cpu-hotplug operations are invoked
 * one by one and dynamically allocating it could fail.
 */
static struct cpumask cpuhp_online_new;

static void blk_mq_queue_reinit_work(void)
{
	struct request_queue *q;

	mutex_lock(&all_q_mutex);
	/*
	 * We need to freeze and reinit all existing queues.  Freezing
	 * involves a synchronous wait for an RCU grace period and doing it
	 * one by one may take a long time.  Start freezing all queues in
	 * one swoop and then wait for the completions so that freezing can
	 * take place in parallel.
	 */
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_freeze_queue_start(q);
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_freeze_queue_wait(q);

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_queue_reinit(q, &cpuhp_online_new);

	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_unfreeze_queue(q);

	mutex_unlock(&all_q_mutex);
}

static int blk_mq_queue_reinit_dead(unsigned int cpu)
{
	cpumask_copy(&cpuhp_online_new, cpu_online_mask);
	blk_mq_queue_reinit_work();
	return 0;
}

/*
 * Before a hotadded cpu starts handling requests, new mappings must be
 * established.  Otherwise, these requests in the hw queue might never be
 * dispatched.
 *
 * For example, there is a single hw queue (hctx) and two CPU queues (ctx0
 * for CPU0, and ctx1 for CPU1).
 *
 * Now CPU1 is just onlined and a request is inserted into ctx1->rq_list,
 * which sets bit0 in the pending bitmap, as ctx1->index_hw is still zero.
 *
 * And then while running the hw queue, blk_mq_flush_busy_ctxs() finds bit0
 * set in the pending bitmap and tries to retrieve requests in
 * hctx->ctxs[0]->rq_list.  But hctx->ctxs[0] is a pointer to ctx0, so the
 * request in ctx1->rq_list is ignored.
 */
static int blk_mq_queue_reinit_prepare(unsigned int cpu)
{
	cpumask_copy(&cpuhp_online_new, cpu_online_mask);
	cpumask_set_cpu(cpu, &cpuhp_online_new);
	blk_mq_queue_reinit_work();
	return 0;
}

static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	int i;

	for (i = 0; i < set->nr_hw_queues; i++)
		if (!__blk_mq_alloc_rq_map(set, i))
			goto out_unwind;

	return 0;

out_unwind:
	while (--i >= 0)
		blk_mq_free_rq_map(set->tags[i]);

	return -ENOMEM;
}

/*
 * Allocate the request maps associated with this tag_set.  Note that this
 * may reduce the depth asked for, if memory is tight.  set->queue_depth
 * will be updated to reflect the allocated depth.
 */
static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set)
{
	unsigned int depth;
	int err;

	depth = set->queue_depth;
	do {
		err = __blk_mq_alloc_rq_maps(set);
		if (!err)
			break;

		set->queue_depth >>= 1;
		if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) {
			err = -ENOMEM;
			break;
		}
	} while (set->queue_depth);

	if (!set->queue_depth || err) {
		pr_err("blk-mq: failed to allocate request map\n");
		return -ENOMEM;
	}

	if (depth != set->queue_depth)
		pr_info("blk-mq: reduced tag depth (%u -> %u)\n",
			depth, set->queue_depth);

	return 0;
}

static int blk_mq_update_queue_map(struct blk_mq_tag_set *set)
{
	if (set->ops->map_queues)
		return set->ops->map_queues(set);
	else
		return blk_mq_map_queues(set);
}
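
/*
 * Example (illustrative sketch, not part of the original file): a driver may
 * provide its own ->map_queues instead of relying on the default
 * blk_mq_map_queues().  "my_map_queues" is hypothetical; whatever the
 * callback writes into set->mq_map is what blk_mq_map_swqueue() later uses
 * to bind each software queue to a hardware queue.
 *
 *	static int my_map_queues(struct blk_mq_tag_set *set)
 *	{
 *		unsigned int cpu;
 *
 *		for_each_possible_cpu(cpu)
 *			set->mq_map[cpu] = cpu % set->nr_hw_queues;
 *		return 0;
 *	}
 */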

/*
 * Alloc a tag set to be associated with one or more request queues.
 * May fail with EINVAL for various error conditions.  May adjust the
 * requested depth down, if it is too large.  In that case, the set
 * value will be stored in set->queue_depth.
 */
int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set)
{
	int ret;

	BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS);

	if (!set->nr_hw_queues)
		return -EINVAL;
	if (!set->queue_depth)
		return -EINVAL;
	if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN)
		return -EINVAL;

	if (!set->ops->queue_rq)
		return -EINVAL;

	if (set->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_info("blk-mq: reduced tag depth to %u\n",
			BLK_MQ_MAX_DEPTH);
		set->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	/*
	 * If a crashdump is active, then we are potentially in a very
	 * memory constrained environment. Limit us to 1 queue and
	 * 64 tags to prevent using too much memory.
	 */
	if (is_kdump_kernel()) {
		set->nr_hw_queues = 1;
		set->queue_depth = min(64U, set->queue_depth);
	}
	/*
	 * There is no use for more h/w queues than cpus.
	 */
	if (set->nr_hw_queues > nr_cpu_ids)
		set->nr_hw_queues = nr_cpu_ids;

	set->tags = kzalloc_node(nr_cpu_ids * sizeof(struct blk_mq_tags *),
				 GFP_KERNEL, set->numa_node);
	if (!set->tags)
		return -ENOMEM;

	ret = -ENOMEM;
	set->mq_map = kzalloc_node(sizeof(*set->mq_map) * nr_cpu_ids,
				   GFP_KERNEL, set->numa_node);
	if (!set->mq_map)
		goto out_free_tags;

	ret = blk_mq_update_queue_map(set);
	if (ret)
		goto out_free_mq_map;

	ret = blk_mq_alloc_rq_maps(set);
	if (ret)
		goto out_free_mq_map;

	mutex_init(&set->tag_list_lock);
	INIT_LIST_HEAD(&set->tag_list);

	return 0;

out_free_mq_map:
	kfree(set->mq_map);
	set->mq_map = NULL;
out_free_tags:
	kfree(set->tags);
	set->tags = NULL;
	return ret;
}
EXPORT_SYMBOL(blk_mq_alloc_tag_set);

void blk_mq_free_tag_set(struct blk_mq_tag_set *set)
{
	int i;

	for (i = 0; i < nr_cpu_ids; i++)
		blk_mq_free_map_and_requests(set, i);

	kfree(set->mq_map);
	set->mq_map = NULL;

	kfree(set->tags);
	set->tags = NULL;
}
EXPORT_SYMBOL(blk_mq_free_tag_set);
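
/*
 * Example (illustrative sketch, not part of the original file): how a driver
 * typically pairs tag set setup and teardown.  All "my_*" identifiers are
 * hypothetical; the fields and calls are the real API.
 *
 *	my_dev->tag_set.ops		= &my_mq_ops;
 *	my_dev->tag_set.nr_hw_queues	= 1;
 *	my_dev->tag_set.queue_depth	= 64;
 *	my_dev->tag_set.numa_node	= NUMA_NO_NODE;
 *	my_dev->tag_set.cmd_size	= sizeof(struct my_cmd);
 *	my_dev->tag_set.flags		= BLK_MQ_F_SHOULD_MERGE;
 *
 *	ret = blk_mq_alloc_tag_set(&my_dev->tag_set);
 *	if (ret)
 *		return ret;
 *
 *	...
 *
 *	blk_cleanup_queue(my_dev->queue);	// queues go away first
 *	blk_mq_free_tag_set(&my_dev->tag_set);
 */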

int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i, ret;

	if (!set)
		return -EINVAL;

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	ret = 0;
	queue_for_each_hw_ctx(q, hctx, i) {
		if (!hctx->tags)
			continue;
		/*
		 * If we're using an MQ scheduler, just update the scheduler
		 * queue depth. This is similar to what the old code would do.
		 */
		if (!hctx->sched_tags) {
			ret = blk_mq_tag_update_depth(hctx, &hctx->tags,
						      min(nr, set->queue_depth),
						      false);
		} else {
			ret = blk_mq_tag_update_depth(hctx, &hctx->sched_tags,
						      nr, true);
		}
		if (ret)
			break;
	}

	if (!ret)
		q->nr_requests = nr;

	blk_mq_unfreeze_queue(q);
	blk_mq_start_stopped_hw_queues(q, true);

	return ret;
}

void blk_mq_update_nr_hw_queues(struct blk_mq_tag_set *set, int nr_hw_queues)
{
	struct request_queue *q;

	lockdep_assert_held(&set->tag_list_lock);

	if (nr_hw_queues > nr_cpu_ids)
		nr_hw_queues = nr_cpu_ids;
	if (nr_hw_queues < 1 || nr_hw_queues == set->nr_hw_queues)
		return;

	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_freeze_queue(q);

	set->nr_hw_queues = nr_hw_queues;
	blk_mq_update_queue_map(set);
	list_for_each_entry(q, &set->tag_list, tag_set_list) {
		blk_mq_realloc_hw_ctxs(set, q);
		blk_mq_queue_reinit(q, cpu_online_mask);
	}

	list_for_each_entry(q, &set->tag_list, tag_set_list)
		blk_mq_unfreeze_queue(q);
}
EXPORT_SYMBOL_GPL(blk_mq_update_nr_hw_queues);

/* Enable polling stats and return whether they were already enabled. */
static bool blk_poll_stats_enable(struct request_queue *q)
{
	if (test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
	    test_and_set_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags))
		return true;
	blk_stat_add_callback(q, q->poll_cb);
	return false;
}

static void blk_mq_poll_stats_start(struct request_queue *q)
{
	/*
	 * We don't arm the callback if polling stats are not enabled or the
	 * callback is already active.
	 */
	if (!test_bit(QUEUE_FLAG_POLL_STATS, &q->queue_flags) ||
	    blk_stat_is_active(q->poll_cb))
		return;

	blk_stat_activate_msecs(q->poll_cb, 100);
}

static void blk_mq_poll_stats_fn(struct blk_stat_callback *cb)
{
	struct request_queue *q = cb->data;
	int bucket;

	for (bucket = 0; bucket < BLK_MQ_POLL_STATS_BKTS; bucket++) {
		if (cb->stat[bucket].nr_samples)
			q->poll_stat[bucket] = cb->stat[bucket];
	}
}
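
/*
 * Worked example (illustrative, not from the original file) for the estimate
 * computed by blk_mq_poll_nsecs() below: if the stats bucket for a request's
 * size and direction reports a mean completion time of 8000 ns, the function
 * returns (8000 + 1) / 2 = 4000 ns, so the hybrid poll path sleeps for
 * roughly half the expected latency before switching to busy polling.
 */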

static unsigned long blk_mq_poll_nsecs(struct request_queue *q,
				       struct blk_mq_hw_ctx *hctx,
				       struct request *rq)
{
	unsigned long ret = 0;
	int bucket;

	/*
	 * If stats collection isn't on, don't sleep but turn it on for
	 * future users
	 */
	if (!blk_poll_stats_enable(q))
		return 0;

	/*
	 * As an optimistic guess, use half of the mean service time
	 * for this type of request. We can (and should) make this smarter.
	 * For instance, if the completion latencies are tight, we can
	 * get closer than just half the mean. This is especially
	 * important on devices where the completion latencies are longer
	 * than ~10 usec. We do use the stats for the relevant IO size
	 * if available which does lead to better estimates.
	 */
	bucket = blk_mq_poll_stats_bkt(rq);
	if (bucket < 0)
		return ret;

	if (q->poll_stat[bucket].nr_samples)
		ret = (q->poll_stat[bucket].mean + 1) / 2;

	return ret;
}

static bool blk_mq_poll_hybrid_sleep(struct request_queue *q,
				     struct blk_mq_hw_ctx *hctx,
				     struct request *rq)
{
	struct hrtimer_sleeper hs;
	enum hrtimer_mode mode;
	unsigned int nsecs;
	ktime_t kt;

	if (test_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags))
		return false;

	/*
	 * poll_nsec can be:
	 *
	 * -1:	don't ever hybrid sleep
	 *  0:	use half of prev avg
	 * >0:	use this specific value
	 */
	if (q->poll_nsec == -1)
		return false;
	else if (q->poll_nsec > 0)
		nsecs = q->poll_nsec;
	else
		nsecs = blk_mq_poll_nsecs(q, hctx, rq);

	if (!nsecs)
		return false;

	set_bit(REQ_ATOM_POLL_SLEPT, &rq->atomic_flags);

	/*
	 * This will be replaced with the stats tracking code, using
	 * 'avg_completion_time / 2' as the pre-sleep target.
	 */
	kt = nsecs;

	mode = HRTIMER_MODE_REL;
	hrtimer_init_on_stack(&hs.timer, CLOCK_MONOTONIC, mode);
	hrtimer_set_expires(&hs.timer, kt);

	hrtimer_init_sleeper(&hs, current);
	do {
		if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags))
			break;
		set_current_state(TASK_UNINTERRUPTIBLE);
		hrtimer_start_expires(&hs.timer, mode);
		if (hs.task)
			io_schedule();
		hrtimer_cancel(&hs.timer);
		mode = HRTIMER_MODE_ABS;
	} while (hs.task && !signal_pending(current));

	__set_current_state(TASK_RUNNING);
	destroy_hrtimer_on_stack(&hs.timer);
	return true;
}

static bool __blk_mq_poll(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	struct request_queue *q = hctx->queue;
	long state;

	/*
	 * If we sleep, have the caller restart the poll loop to reset
	 * the state. Like for the other success return cases, the
	 * caller is responsible for checking if the IO completed. If
	 * the IO isn't complete, we'll get called again and will go
	 * straight to the busy poll loop.
	 */
	if (blk_mq_poll_hybrid_sleep(q, hctx, rq))
		return true;

	hctx->poll_considered++;

	state = current->state;
	while (!need_resched()) {
		int ret;

		hctx->poll_invoked++;

		ret = q->mq_ops->poll(hctx, rq->tag);
		if (ret > 0) {
			hctx->poll_success++;
			set_current_state(TASK_RUNNING);
			return true;
		}

		if (signal_pending_state(state, current))
			set_current_state(TASK_RUNNING);

		if (current->state == TASK_RUNNING)
			return true;
		if (ret < 0)
			break;
		cpu_relax();
	}

	return false;
}

bool blk_mq_poll(struct request_queue *q, blk_qc_t cookie)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_plug *plug;
	struct request *rq;

	if (!q->mq_ops || !q->mq_ops->poll || !blk_qc_t_valid(cookie) ||
	    !test_bit(QUEUE_FLAG_POLL, &q->queue_flags))
		return false;

	plug = current->plug;
	if (plug)
		blk_flush_plug_list(plug, false);

	hctx = q->queue_hw_ctx[blk_qc_t_to_queue_num(cookie)];
	if (!blk_qc_t_is_internal(cookie))
		rq = blk_mq_tag_to_rq(hctx->tags, blk_qc_t_to_tag(cookie));
	else {
		rq = blk_mq_tag_to_rq(hctx->sched_tags, blk_qc_t_to_tag(cookie));
		/*
		 * With scheduling, if the request has completed, we'll
		 * get a NULL return here, as we clear the sched tag when
		 * that happens. The request still remains valid, like always,
		 * so we should be safe with just the NULL check.
		 */
		if (!rq)
			return false;
	}

	return __blk_mq_poll(hctx, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_poll);

void blk_mq_disable_hotplug(void)
{
	mutex_lock(&all_q_mutex);
}

void blk_mq_enable_hotplug(void)
{
	mutex_unlock(&all_q_mutex);
}

static int __init blk_mq_init(void)
{
	cpuhp_setup_state_multi(CPUHP_BLK_MQ_DEAD, "block/mq:dead", NULL,
				blk_mq_hctx_notify_dead);

	cpuhp_setup_state_nocalls(CPUHP_BLK_MQ_PREPARE, "block/mq:prepare",
				  blk_mq_queue_reinit_prepare,
				  blk_mq_queue_reinit_dead);
	return 0;
}
subsys_initcall(blk_mq_init);
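
/*
 * Example (illustrative sketch, not part of the original file): how a
 * synchronous submitter typically drives blk_mq_poll() above.  The cookie is
 * whatever the submission path returned; "my_bio" and "done" are
 * hypothetical.
 *
 *	blk_qc_t cookie = submit_bio(my_bio);
 *
 *	while (!done) {
 *		if (!blk_mq_poll(q, cookie))
 *			break;		// not pollable or nothing completed;
 *					// fall back to sleeping
 *	}
 *
 * This mirrors what the direct I/O paths do when the caller asked for
 * polled completions (IOCB_HIPRI).
 */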