1 /* 2 * Block multiqueue core code 3 * 4 * Copyright (C) 2013-2014 Jens Axboe 5 * Copyright (C) 2013-2014 Christoph Hellwig 6 */ 7 #include <linux/kernel.h> 8 #include <linux/module.h> 9 #include <linux/backing-dev.h> 10 #include <linux/bio.h> 11 #include <linux/blkdev.h> 12 #include <linux/mm.h> 13 #include <linux/init.h> 14 #include <linux/slab.h> 15 #include <linux/workqueue.h> 16 #include <linux/smp.h> 17 #include <linux/llist.h> 18 #include <linux/list_sort.h> 19 #include <linux/cpu.h> 20 #include <linux/cache.h> 21 #include <linux/sched/sysctl.h> 22 #include <linux/delay.h> 23 #include <linux/crash_dump.h> 24 25 #include <trace/events/block.h> 26 27 #include <linux/blk-mq.h> 28 #include "blk.h" 29 #include "blk-mq.h" 30 #include "blk-mq-tag.h" 31 32 static DEFINE_MUTEX(all_q_mutex); 33 static LIST_HEAD(all_q_list); 34 35 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx); 36 static void blk_mq_run_queues(struct request_queue *q); 37 38 /* 39 * Check if any of the ctx's have pending work in this hardware queue 40 */ 41 static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx) 42 { 43 unsigned int i; 44 45 for (i = 0; i < hctx->ctx_map.map_size; i++) 46 if (hctx->ctx_map.map[i].word) 47 return true; 48 49 return false; 50 } 51 52 static inline struct blk_align_bitmap *get_bm(struct blk_mq_hw_ctx *hctx, 53 struct blk_mq_ctx *ctx) 54 { 55 return &hctx->ctx_map.map[ctx->index_hw / hctx->ctx_map.bits_per_word]; 56 } 57 58 #define CTX_TO_BIT(hctx, ctx) \ 59 ((ctx)->index_hw & ((hctx)->ctx_map.bits_per_word - 1)) 60 61 /* 62 * Mark this ctx as having pending work in this hardware queue 63 */ 64 static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx, 65 struct blk_mq_ctx *ctx) 66 { 67 struct blk_align_bitmap *bm = get_bm(hctx, ctx); 68 69 if (!test_bit(CTX_TO_BIT(hctx, ctx), &bm->word)) 70 set_bit(CTX_TO_BIT(hctx, ctx), &bm->word); 71 } 72 73 static void blk_mq_hctx_clear_pending(struct blk_mq_hw_ctx *hctx, 74 struct blk_mq_ctx *ctx) 75 { 76 struct blk_align_bitmap *bm = get_bm(hctx, ctx); 77 78 clear_bit(CTX_TO_BIT(hctx, ctx), &bm->word); 79 } 80 81 static int blk_mq_queue_enter(struct request_queue *q) 82 { 83 while (true) { 84 int ret; 85 86 if (percpu_ref_tryget_live(&q->mq_usage_counter)) 87 return 0; 88 89 ret = wait_event_interruptible(q->mq_freeze_wq, 90 !q->mq_freeze_depth || blk_queue_dying(q)); 91 if (blk_queue_dying(q)) 92 return -ENODEV; 93 if (ret) 94 return ret; 95 } 96 } 97 98 static void blk_mq_queue_exit(struct request_queue *q) 99 { 100 percpu_ref_put(&q->mq_usage_counter); 101 } 102 103 static void blk_mq_usage_counter_release(struct percpu_ref *ref) 104 { 105 struct request_queue *q = 106 container_of(ref, struct request_queue, mq_usage_counter); 107 108 wake_up_all(&q->mq_freeze_wq); 109 } 110 111 void blk_mq_freeze_queue_start(struct request_queue *q) 112 { 113 bool freeze; 114 115 spin_lock_irq(q->queue_lock); 116 freeze = !q->mq_freeze_depth++; 117 spin_unlock_irq(q->queue_lock); 118 119 if (freeze) { 120 percpu_ref_kill(&q->mq_usage_counter); 121 blk_mq_run_queues(q); 122 } 123 } 124 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue_start); 125 126 static void blk_mq_freeze_queue_wait(struct request_queue *q) 127 { 128 wait_event(q->mq_freeze_wq, percpu_ref_is_zero(&q->mq_usage_counter)); 129 } 130 131 /* 132 * Guarantee no request is in use, so we can change any data structure of 133 * the queue afterward. 134 */ 135 void blk_mq_freeze_queue(struct request_queue *q) 136 { 137 blk_mq_freeze_queue_start(q); 138 blk_mq_freeze_queue_wait(q); 139 } 140 EXPORT_SYMBOL_GPL(blk_mq_freeze_queue); 141 142 void blk_mq_unfreeze_queue(struct request_queue *q) 143 { 144 bool wake; 145 146 spin_lock_irq(q->queue_lock); 147 wake = !--q->mq_freeze_depth; 148 WARN_ON_ONCE(q->mq_freeze_depth < 0); 149 spin_unlock_irq(q->queue_lock); 150 if (wake) { 151 percpu_ref_reinit(&q->mq_usage_counter); 152 wake_up_all(&q->mq_freeze_wq); 153 } 154 } 155 EXPORT_SYMBOL_GPL(blk_mq_unfreeze_queue); 156 157 void blk_mq_wake_waiters(struct request_queue *q) 158 { 159 struct blk_mq_hw_ctx *hctx; 160 unsigned int i; 161 162 queue_for_each_hw_ctx(q, hctx, i) 163 if (blk_mq_hw_queue_mapped(hctx)) 164 blk_mq_tag_wakeup_all(hctx->tags, true); 165 166 /* 167 * If we are called because the queue has now been marked as 168 * dying, we need to ensure that processes currently waiting on 169 * the queue are notified as well. 170 */ 171 wake_up_all(&q->mq_freeze_wq); 172 } 173 174 bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx) 175 { 176 return blk_mq_has_free_tags(hctx->tags); 177 } 178 EXPORT_SYMBOL(blk_mq_can_queue); 179 180 static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx, 181 struct request *rq, unsigned int rw_flags) 182 { 183 if (blk_queue_io_stat(q)) 184 rw_flags |= REQ_IO_STAT; 185 186 INIT_LIST_HEAD(&rq->queuelist); 187 /* csd/requeue_work/fifo_time is initialized before use */ 188 rq->q = q; 189 rq->mq_ctx = ctx; 190 rq->cmd_flags |= rw_flags; 191 /* do not touch atomic flags, it needs atomic ops against the timer */ 192 rq->cpu = -1; 193 INIT_HLIST_NODE(&rq->hash); 194 RB_CLEAR_NODE(&rq->rb_node); 195 rq->rq_disk = NULL; 196 rq->part = NULL; 197 rq->start_time = jiffies; 198 #ifdef CONFIG_BLK_CGROUP 199 rq->rl = NULL; 200 set_start_time_ns(rq); 201 rq->io_start_time_ns = 0; 202 #endif 203 rq->nr_phys_segments = 0; 204 #if defined(CONFIG_BLK_DEV_INTEGRITY) 205 rq->nr_integrity_segments = 0; 206 #endif 207 rq->special = NULL; 208 /* tag was already set */ 209 rq->errors = 0; 210 211 rq->cmd = rq->__cmd; 212 213 rq->extra_len = 0; 214 rq->sense_len = 0; 215 rq->resid_len = 0; 216 rq->sense = NULL; 217 218 INIT_LIST_HEAD(&rq->timeout_list); 219 rq->timeout = 0; 220 221 rq->end_io = NULL; 222 rq->end_io_data = NULL; 223 rq->next_rq = NULL; 224 225 ctx->rq_dispatched[rw_is_sync(rw_flags)]++; 226 } 227 228 static struct request * 229 __blk_mq_alloc_request(struct blk_mq_alloc_data *data, int rw) 230 { 231 struct request *rq; 232 unsigned int tag; 233 234 tag = blk_mq_get_tag(data); 235 if (tag != BLK_MQ_TAG_FAIL) { 236 rq = data->hctx->tags->rqs[tag]; 237 238 if (blk_mq_tag_busy(data->hctx)) { 239 rq->cmd_flags = REQ_MQ_INFLIGHT; 240 atomic_inc(&data->hctx->nr_active); 241 } 242 243 rq->tag = tag; 244 blk_mq_rq_ctx_init(data->q, data->ctx, rq, rw); 245 return rq; 246 } 247 248 return NULL; 249 } 250 251 struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp, 252 bool reserved) 253 { 254 struct blk_mq_ctx *ctx; 255 struct blk_mq_hw_ctx *hctx; 256 struct request *rq; 257 struct blk_mq_alloc_data alloc_data; 258 int ret; 259 260 ret = blk_mq_queue_enter(q); 261 if (ret) 262 return ERR_PTR(ret); 263 264 ctx = blk_mq_get_ctx(q); 265 hctx = q->mq_ops->map_queue(q, ctx->cpu); 266 blk_mq_set_alloc_data(&alloc_data, q, gfp & ~__GFP_WAIT, 267 reserved, ctx, hctx); 268 269 rq = __blk_mq_alloc_request(&alloc_data, rw); 270 if (!rq && (gfp & __GFP_WAIT)) { 271 __blk_mq_run_hw_queue(hctx); 272 blk_mq_put_ctx(ctx); 273 274 ctx = blk_mq_get_ctx(q); 275 hctx = q->mq_ops->map_queue(q, ctx->cpu); 276 blk_mq_set_alloc_data(&alloc_data, q, gfp, reserved, ctx, 277 hctx); 278 rq = __blk_mq_alloc_request(&alloc_data, rw); 279 ctx = alloc_data.ctx; 280 } 281 blk_mq_put_ctx(ctx); 282 if (!rq) { 283 blk_mq_queue_exit(q); 284 return ERR_PTR(-EWOULDBLOCK); 285 } 286 return rq; 287 } 288 EXPORT_SYMBOL(blk_mq_alloc_request); 289 290 static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx, 291 struct blk_mq_ctx *ctx, struct request *rq) 292 { 293 const int tag = rq->tag; 294 struct request_queue *q = rq->q; 295 296 if (rq->cmd_flags & REQ_MQ_INFLIGHT) 297 atomic_dec(&hctx->nr_active); 298 rq->cmd_flags = 0; 299 300 clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 301 blk_mq_put_tag(hctx, tag, &ctx->last_tag); 302 blk_mq_queue_exit(q); 303 } 304 305 void blk_mq_free_hctx_request(struct blk_mq_hw_ctx *hctx, struct request *rq) 306 { 307 struct blk_mq_ctx *ctx = rq->mq_ctx; 308 309 ctx->rq_completed[rq_is_sync(rq)]++; 310 __blk_mq_free_request(hctx, ctx, rq); 311 312 } 313 EXPORT_SYMBOL_GPL(blk_mq_free_hctx_request); 314 315 void blk_mq_free_request(struct request *rq) 316 { 317 struct blk_mq_hw_ctx *hctx; 318 struct request_queue *q = rq->q; 319 320 hctx = q->mq_ops->map_queue(q, rq->mq_ctx->cpu); 321 blk_mq_free_hctx_request(hctx, rq); 322 } 323 EXPORT_SYMBOL_GPL(blk_mq_free_request); 324 325 inline void __blk_mq_end_request(struct request *rq, int error) 326 { 327 blk_account_io_done(rq); 328 329 if (rq->end_io) { 330 rq->end_io(rq, error); 331 } else { 332 if (unlikely(blk_bidi_rq(rq))) 333 blk_mq_free_request(rq->next_rq); 334 blk_mq_free_request(rq); 335 } 336 } 337 EXPORT_SYMBOL(__blk_mq_end_request); 338 339 void blk_mq_end_request(struct request *rq, int error) 340 { 341 if (blk_update_request(rq, error, blk_rq_bytes(rq))) 342 BUG(); 343 __blk_mq_end_request(rq, error); 344 } 345 EXPORT_SYMBOL(blk_mq_end_request); 346 347 static void __blk_mq_complete_request_remote(void *data) 348 { 349 struct request *rq = data; 350 351 rq->q->softirq_done_fn(rq); 352 } 353 354 static void blk_mq_ipi_complete_request(struct request *rq) 355 { 356 struct blk_mq_ctx *ctx = rq->mq_ctx; 357 bool shared = false; 358 int cpu; 359 360 if (!test_bit(QUEUE_FLAG_SAME_COMP, &rq->q->queue_flags)) { 361 rq->q->softirq_done_fn(rq); 362 return; 363 } 364 365 cpu = get_cpu(); 366 if (!test_bit(QUEUE_FLAG_SAME_FORCE, &rq->q->queue_flags)) 367 shared = cpus_share_cache(cpu, ctx->cpu); 368 369 if (cpu != ctx->cpu && !shared && cpu_online(ctx->cpu)) { 370 rq->csd.func = __blk_mq_complete_request_remote; 371 rq->csd.info = rq; 372 rq->csd.flags = 0; 373 smp_call_function_single_async(ctx->cpu, &rq->csd); 374 } else { 375 rq->q->softirq_done_fn(rq); 376 } 377 put_cpu(); 378 } 379 380 void __blk_mq_complete_request(struct request *rq) 381 { 382 struct request_queue *q = rq->q; 383 384 if (!q->softirq_done_fn) 385 blk_mq_end_request(rq, rq->errors); 386 else 387 blk_mq_ipi_complete_request(rq); 388 } 389 390 /** 391 * blk_mq_complete_request - end I/O on a request 392 * @rq: the request being processed 393 * 394 * Description: 395 * Ends all I/O on a request. It does not handle partial completions. 396 * The actual completion happens out-of-order, through a IPI handler. 397 **/ 398 void blk_mq_complete_request(struct request *rq) 399 { 400 struct request_queue *q = rq->q; 401 402 if (unlikely(blk_should_fake_timeout(q))) 403 return; 404 if (!blk_mark_rq_complete(rq)) 405 __blk_mq_complete_request(rq); 406 } 407 EXPORT_SYMBOL(blk_mq_complete_request); 408 409 int blk_mq_request_started(struct request *rq) 410 { 411 return test_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 412 } 413 EXPORT_SYMBOL_GPL(blk_mq_request_started); 414 415 void blk_mq_start_request(struct request *rq) 416 { 417 struct request_queue *q = rq->q; 418 419 trace_block_rq_issue(q, rq); 420 421 rq->resid_len = blk_rq_bytes(rq); 422 if (unlikely(blk_bidi_rq(rq))) 423 rq->next_rq->resid_len = blk_rq_bytes(rq->next_rq); 424 425 blk_add_timer(rq); 426 427 /* 428 * Ensure that ->deadline is visible before set the started 429 * flag and clear the completed flag. 430 */ 431 smp_mb__before_atomic(); 432 433 /* 434 * Mark us as started and clear complete. Complete might have been 435 * set if requeue raced with timeout, which then marked it as 436 * complete. So be sure to clear complete again when we start 437 * the request, otherwise we'll ignore the completion event. 438 */ 439 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) 440 set_bit(REQ_ATOM_STARTED, &rq->atomic_flags); 441 if (test_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags)) 442 clear_bit(REQ_ATOM_COMPLETE, &rq->atomic_flags); 443 444 if (q->dma_drain_size && blk_rq_bytes(rq)) { 445 /* 446 * Make sure space for the drain appears. We know we can do 447 * this because max_hw_segments has been adjusted to be one 448 * fewer than the device can handle. 449 */ 450 rq->nr_phys_segments++; 451 } 452 } 453 EXPORT_SYMBOL(blk_mq_start_request); 454 455 static void __blk_mq_requeue_request(struct request *rq) 456 { 457 struct request_queue *q = rq->q; 458 459 trace_block_rq_requeue(q, rq); 460 461 if (test_and_clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 462 if (q->dma_drain_size && blk_rq_bytes(rq)) 463 rq->nr_phys_segments--; 464 } 465 } 466 467 void blk_mq_requeue_request(struct request *rq) 468 { 469 __blk_mq_requeue_request(rq); 470 471 BUG_ON(blk_queued_rq(rq)); 472 blk_mq_add_to_requeue_list(rq, true); 473 } 474 EXPORT_SYMBOL(blk_mq_requeue_request); 475 476 static void blk_mq_requeue_work(struct work_struct *work) 477 { 478 struct request_queue *q = 479 container_of(work, struct request_queue, requeue_work); 480 LIST_HEAD(rq_list); 481 struct request *rq, *next; 482 unsigned long flags; 483 484 spin_lock_irqsave(&q->requeue_lock, flags); 485 list_splice_init(&q->requeue_list, &rq_list); 486 spin_unlock_irqrestore(&q->requeue_lock, flags); 487 488 list_for_each_entry_safe(rq, next, &rq_list, queuelist) { 489 if (!(rq->cmd_flags & REQ_SOFTBARRIER)) 490 continue; 491 492 rq->cmd_flags &= ~REQ_SOFTBARRIER; 493 list_del_init(&rq->queuelist); 494 blk_mq_insert_request(rq, true, false, false); 495 } 496 497 while (!list_empty(&rq_list)) { 498 rq = list_entry(rq_list.next, struct request, queuelist); 499 list_del_init(&rq->queuelist); 500 blk_mq_insert_request(rq, false, false, false); 501 } 502 503 /* 504 * Use the start variant of queue running here, so that running 505 * the requeue work will kick stopped queues. 506 */ 507 blk_mq_start_hw_queues(q); 508 } 509 510 void blk_mq_add_to_requeue_list(struct request *rq, bool at_head) 511 { 512 struct request_queue *q = rq->q; 513 unsigned long flags; 514 515 /* 516 * We abuse this flag that is otherwise used by the I/O scheduler to 517 * request head insertation from the workqueue. 518 */ 519 BUG_ON(rq->cmd_flags & REQ_SOFTBARRIER); 520 521 spin_lock_irqsave(&q->requeue_lock, flags); 522 if (at_head) { 523 rq->cmd_flags |= REQ_SOFTBARRIER; 524 list_add(&rq->queuelist, &q->requeue_list); 525 } else { 526 list_add_tail(&rq->queuelist, &q->requeue_list); 527 } 528 spin_unlock_irqrestore(&q->requeue_lock, flags); 529 } 530 EXPORT_SYMBOL(blk_mq_add_to_requeue_list); 531 532 void blk_mq_cancel_requeue_work(struct request_queue *q) 533 { 534 cancel_work_sync(&q->requeue_work); 535 } 536 EXPORT_SYMBOL_GPL(blk_mq_cancel_requeue_work); 537 538 void blk_mq_kick_requeue_list(struct request_queue *q) 539 { 540 kblockd_schedule_work(&q->requeue_work); 541 } 542 EXPORT_SYMBOL(blk_mq_kick_requeue_list); 543 544 void blk_mq_abort_requeue_list(struct request_queue *q) 545 { 546 unsigned long flags; 547 LIST_HEAD(rq_list); 548 549 spin_lock_irqsave(&q->requeue_lock, flags); 550 list_splice_init(&q->requeue_list, &rq_list); 551 spin_unlock_irqrestore(&q->requeue_lock, flags); 552 553 while (!list_empty(&rq_list)) { 554 struct request *rq; 555 556 rq = list_first_entry(&rq_list, struct request, queuelist); 557 list_del_init(&rq->queuelist); 558 rq->errors = -EIO; 559 blk_mq_end_request(rq, rq->errors); 560 } 561 } 562 EXPORT_SYMBOL(blk_mq_abort_requeue_list); 563 564 static inline bool is_flush_request(struct request *rq, 565 struct blk_flush_queue *fq, unsigned int tag) 566 { 567 return ((rq->cmd_flags & REQ_FLUSH_SEQ) && 568 fq->flush_rq->tag == tag); 569 } 570 571 struct request *blk_mq_tag_to_rq(struct blk_mq_tags *tags, unsigned int tag) 572 { 573 struct request *rq = tags->rqs[tag]; 574 /* mq_ctx of flush rq is always cloned from the corresponding req */ 575 struct blk_flush_queue *fq = blk_get_flush_queue(rq->q, rq->mq_ctx); 576 577 if (!is_flush_request(rq, fq, tag)) 578 return rq; 579 580 return fq->flush_rq; 581 } 582 EXPORT_SYMBOL(blk_mq_tag_to_rq); 583 584 struct blk_mq_timeout_data { 585 unsigned long next; 586 unsigned int next_set; 587 }; 588 589 void blk_mq_rq_timed_out(struct request *req, bool reserved) 590 { 591 struct blk_mq_ops *ops = req->q->mq_ops; 592 enum blk_eh_timer_return ret = BLK_EH_RESET_TIMER; 593 594 /* 595 * We know that complete is set at this point. If STARTED isn't set 596 * anymore, then the request isn't active and the "timeout" should 597 * just be ignored. This can happen due to the bitflag ordering. 598 * Timeout first checks if STARTED is set, and if it is, assumes 599 * the request is active. But if we race with completion, then 600 * we both flags will get cleared. So check here again, and ignore 601 * a timeout event with a request that isn't active. 602 */ 603 if (!test_bit(REQ_ATOM_STARTED, &req->atomic_flags)) 604 return; 605 606 if (ops->timeout) 607 ret = ops->timeout(req, reserved); 608 609 switch (ret) { 610 case BLK_EH_HANDLED: 611 __blk_mq_complete_request(req); 612 break; 613 case BLK_EH_RESET_TIMER: 614 blk_add_timer(req); 615 blk_clear_rq_complete(req); 616 break; 617 case BLK_EH_NOT_HANDLED: 618 break; 619 default: 620 printk(KERN_ERR "block: bad eh return: %d\n", ret); 621 break; 622 } 623 } 624 625 static void blk_mq_check_expired(struct blk_mq_hw_ctx *hctx, 626 struct request *rq, void *priv, bool reserved) 627 { 628 struct blk_mq_timeout_data *data = priv; 629 630 if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags)) { 631 /* 632 * If a request wasn't started before the queue was 633 * marked dying, kill it here or it'll go unnoticed. 634 */ 635 if (unlikely(blk_queue_dying(rq->q))) { 636 rq->errors = -EIO; 637 blk_mq_complete_request(rq); 638 } 639 return; 640 } 641 if (rq->cmd_flags & REQ_NO_TIMEOUT) 642 return; 643 644 if (time_after_eq(jiffies, rq->deadline)) { 645 if (!blk_mark_rq_complete(rq)) 646 blk_mq_rq_timed_out(rq, reserved); 647 } else if (!data->next_set || time_after(data->next, rq->deadline)) { 648 data->next = rq->deadline; 649 data->next_set = 1; 650 } 651 } 652 653 static void blk_mq_rq_timer(unsigned long priv) 654 { 655 struct request_queue *q = (struct request_queue *)priv; 656 struct blk_mq_timeout_data data = { 657 .next = 0, 658 .next_set = 0, 659 }; 660 struct blk_mq_hw_ctx *hctx; 661 int i; 662 663 queue_for_each_hw_ctx(q, hctx, i) { 664 /* 665 * If not software queues are currently mapped to this 666 * hardware queue, there's nothing to check 667 */ 668 if (!blk_mq_hw_queue_mapped(hctx)) 669 continue; 670 671 blk_mq_tag_busy_iter(hctx, blk_mq_check_expired, &data); 672 } 673 674 if (data.next_set) { 675 data.next = blk_rq_timeout(round_jiffies_up(data.next)); 676 mod_timer(&q->timeout, data.next); 677 } else { 678 queue_for_each_hw_ctx(q, hctx, i) 679 blk_mq_tag_idle(hctx); 680 } 681 } 682 683 /* 684 * Reverse check our software queue for entries that we could potentially 685 * merge with. Currently includes a hand-wavy stop count of 8, to not spend 686 * too much time checking for merges. 687 */ 688 static bool blk_mq_attempt_merge(struct request_queue *q, 689 struct blk_mq_ctx *ctx, struct bio *bio) 690 { 691 struct request *rq; 692 int checked = 8; 693 694 list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) { 695 int el_ret; 696 697 if (!checked--) 698 break; 699 700 if (!blk_rq_merge_ok(rq, bio)) 701 continue; 702 703 el_ret = blk_try_merge(rq, bio); 704 if (el_ret == ELEVATOR_BACK_MERGE) { 705 if (bio_attempt_back_merge(q, rq, bio)) { 706 ctx->rq_merged++; 707 return true; 708 } 709 break; 710 } else if (el_ret == ELEVATOR_FRONT_MERGE) { 711 if (bio_attempt_front_merge(q, rq, bio)) { 712 ctx->rq_merged++; 713 return true; 714 } 715 break; 716 } 717 } 718 719 return false; 720 } 721 722 /* 723 * Process software queues that have been marked busy, splicing them 724 * to the for-dispatch 725 */ 726 static void flush_busy_ctxs(struct blk_mq_hw_ctx *hctx, struct list_head *list) 727 { 728 struct blk_mq_ctx *ctx; 729 int i; 730 731 for (i = 0; i < hctx->ctx_map.map_size; i++) { 732 struct blk_align_bitmap *bm = &hctx->ctx_map.map[i]; 733 unsigned int off, bit; 734 735 if (!bm->word) 736 continue; 737 738 bit = 0; 739 off = i * hctx->ctx_map.bits_per_word; 740 do { 741 bit = find_next_bit(&bm->word, bm->depth, bit); 742 if (bit >= bm->depth) 743 break; 744 745 ctx = hctx->ctxs[bit + off]; 746 clear_bit(bit, &bm->word); 747 spin_lock(&ctx->lock); 748 list_splice_tail_init(&ctx->rq_list, list); 749 spin_unlock(&ctx->lock); 750 751 bit++; 752 } while (1); 753 } 754 } 755 756 /* 757 * Run this hardware queue, pulling any software queues mapped to it in. 758 * Note that this function currently has various problems around ordering 759 * of IO. In particular, we'd like FIFO behaviour on handling existing 760 * items on the hctx->dispatch list. Ignore that for now. 761 */ 762 static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx) 763 { 764 struct request_queue *q = hctx->queue; 765 struct request *rq; 766 LIST_HEAD(rq_list); 767 LIST_HEAD(driver_list); 768 struct list_head *dptr; 769 int queued; 770 771 WARN_ON(!cpumask_test_cpu(raw_smp_processor_id(), hctx->cpumask)); 772 773 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state))) 774 return; 775 776 hctx->run++; 777 778 /* 779 * Touch any software queue that has pending entries. 780 */ 781 flush_busy_ctxs(hctx, &rq_list); 782 783 /* 784 * If we have previous entries on our dispatch list, grab them 785 * and stuff them at the front for more fair dispatch. 786 */ 787 if (!list_empty_careful(&hctx->dispatch)) { 788 spin_lock(&hctx->lock); 789 if (!list_empty(&hctx->dispatch)) 790 list_splice_init(&hctx->dispatch, &rq_list); 791 spin_unlock(&hctx->lock); 792 } 793 794 /* 795 * Start off with dptr being NULL, so we start the first request 796 * immediately, even if we have more pending. 797 */ 798 dptr = NULL; 799 800 /* 801 * Now process all the entries, sending them to the driver. 802 */ 803 queued = 0; 804 while (!list_empty(&rq_list)) { 805 struct blk_mq_queue_data bd; 806 int ret; 807 808 rq = list_first_entry(&rq_list, struct request, queuelist); 809 list_del_init(&rq->queuelist); 810 811 bd.rq = rq; 812 bd.list = dptr; 813 bd.last = list_empty(&rq_list); 814 815 ret = q->mq_ops->queue_rq(hctx, &bd); 816 switch (ret) { 817 case BLK_MQ_RQ_QUEUE_OK: 818 queued++; 819 continue; 820 case BLK_MQ_RQ_QUEUE_BUSY: 821 list_add(&rq->queuelist, &rq_list); 822 __blk_mq_requeue_request(rq); 823 break; 824 default: 825 pr_err("blk-mq: bad return on queue: %d\n", ret); 826 case BLK_MQ_RQ_QUEUE_ERROR: 827 rq->errors = -EIO; 828 blk_mq_end_request(rq, rq->errors); 829 break; 830 } 831 832 if (ret == BLK_MQ_RQ_QUEUE_BUSY) 833 break; 834 835 /* 836 * We've done the first request. If we have more than 1 837 * left in the list, set dptr to defer issue. 838 */ 839 if (!dptr && rq_list.next != rq_list.prev) 840 dptr = &driver_list; 841 } 842 843 if (!queued) 844 hctx->dispatched[0]++; 845 else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1))) 846 hctx->dispatched[ilog2(queued) + 1]++; 847 848 /* 849 * Any items that need requeuing? Stuff them into hctx->dispatch, 850 * that is where we will continue on next queue run. 851 */ 852 if (!list_empty(&rq_list)) { 853 spin_lock(&hctx->lock); 854 list_splice(&rq_list, &hctx->dispatch); 855 spin_unlock(&hctx->lock); 856 } 857 } 858 859 /* 860 * It'd be great if the workqueue API had a way to pass 861 * in a mask and had some smarts for more clever placement. 862 * For now we just round-robin here, switching for every 863 * BLK_MQ_CPU_WORK_BATCH queued items. 864 */ 865 static int blk_mq_hctx_next_cpu(struct blk_mq_hw_ctx *hctx) 866 { 867 if (hctx->queue->nr_hw_queues == 1) 868 return WORK_CPU_UNBOUND; 869 870 if (--hctx->next_cpu_batch <= 0) { 871 int cpu = hctx->next_cpu, next_cpu; 872 873 next_cpu = cpumask_next(hctx->next_cpu, hctx->cpumask); 874 if (next_cpu >= nr_cpu_ids) 875 next_cpu = cpumask_first(hctx->cpumask); 876 877 hctx->next_cpu = next_cpu; 878 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 879 880 return cpu; 881 } 882 883 return hctx->next_cpu; 884 } 885 886 void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async) 887 { 888 if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state) || 889 !blk_mq_hw_queue_mapped(hctx))) 890 return; 891 892 if (!async) { 893 int cpu = get_cpu(); 894 if (cpumask_test_cpu(cpu, hctx->cpumask)) { 895 __blk_mq_run_hw_queue(hctx); 896 put_cpu(); 897 return; 898 } 899 900 put_cpu(); 901 } 902 903 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 904 &hctx->run_work, 0); 905 } 906 907 static void blk_mq_run_queues(struct request_queue *q) 908 { 909 struct blk_mq_hw_ctx *hctx; 910 int i; 911 912 queue_for_each_hw_ctx(q, hctx, i) { 913 if ((!blk_mq_hctx_has_pending(hctx) && 914 list_empty_careful(&hctx->dispatch)) || 915 test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 916 continue; 917 918 blk_mq_run_hw_queue(hctx, false); 919 } 920 } 921 922 void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx) 923 { 924 cancel_delayed_work(&hctx->run_work); 925 cancel_delayed_work(&hctx->delay_work); 926 set_bit(BLK_MQ_S_STOPPED, &hctx->state); 927 } 928 EXPORT_SYMBOL(blk_mq_stop_hw_queue); 929 930 void blk_mq_stop_hw_queues(struct request_queue *q) 931 { 932 struct blk_mq_hw_ctx *hctx; 933 int i; 934 935 queue_for_each_hw_ctx(q, hctx, i) 936 blk_mq_stop_hw_queue(hctx); 937 } 938 EXPORT_SYMBOL(blk_mq_stop_hw_queues); 939 940 void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx) 941 { 942 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 943 944 blk_mq_run_hw_queue(hctx, false); 945 } 946 EXPORT_SYMBOL(blk_mq_start_hw_queue); 947 948 void blk_mq_start_hw_queues(struct request_queue *q) 949 { 950 struct blk_mq_hw_ctx *hctx; 951 int i; 952 953 queue_for_each_hw_ctx(q, hctx, i) 954 blk_mq_start_hw_queue(hctx); 955 } 956 EXPORT_SYMBOL(blk_mq_start_hw_queues); 957 958 void blk_mq_start_stopped_hw_queues(struct request_queue *q, bool async) 959 { 960 struct blk_mq_hw_ctx *hctx; 961 int i; 962 963 queue_for_each_hw_ctx(q, hctx, i) { 964 if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state)) 965 continue; 966 967 clear_bit(BLK_MQ_S_STOPPED, &hctx->state); 968 blk_mq_run_hw_queue(hctx, async); 969 } 970 } 971 EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues); 972 973 static void blk_mq_run_work_fn(struct work_struct *work) 974 { 975 struct blk_mq_hw_ctx *hctx; 976 977 hctx = container_of(work, struct blk_mq_hw_ctx, run_work.work); 978 979 __blk_mq_run_hw_queue(hctx); 980 } 981 982 static void blk_mq_delay_work_fn(struct work_struct *work) 983 { 984 struct blk_mq_hw_ctx *hctx; 985 986 hctx = container_of(work, struct blk_mq_hw_ctx, delay_work.work); 987 988 if (test_and_clear_bit(BLK_MQ_S_STOPPED, &hctx->state)) 989 __blk_mq_run_hw_queue(hctx); 990 } 991 992 void blk_mq_delay_queue(struct blk_mq_hw_ctx *hctx, unsigned long msecs) 993 { 994 if (unlikely(!blk_mq_hw_queue_mapped(hctx))) 995 return; 996 997 kblockd_schedule_delayed_work_on(blk_mq_hctx_next_cpu(hctx), 998 &hctx->delay_work, msecs_to_jiffies(msecs)); 999 } 1000 EXPORT_SYMBOL(blk_mq_delay_queue); 1001 1002 static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx, 1003 struct request *rq, bool at_head) 1004 { 1005 struct blk_mq_ctx *ctx = rq->mq_ctx; 1006 1007 trace_block_rq_insert(hctx->queue, rq); 1008 1009 if (at_head) 1010 list_add(&rq->queuelist, &ctx->rq_list); 1011 else 1012 list_add_tail(&rq->queuelist, &ctx->rq_list); 1013 1014 blk_mq_hctx_mark_pending(hctx, ctx); 1015 } 1016 1017 void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue, 1018 bool async) 1019 { 1020 struct request_queue *q = rq->q; 1021 struct blk_mq_hw_ctx *hctx; 1022 struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx; 1023 1024 current_ctx = blk_mq_get_ctx(q); 1025 if (!cpu_online(ctx->cpu)) 1026 rq->mq_ctx = ctx = current_ctx; 1027 1028 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1029 1030 spin_lock(&ctx->lock); 1031 __blk_mq_insert_request(hctx, rq, at_head); 1032 spin_unlock(&ctx->lock); 1033 1034 if (run_queue) 1035 blk_mq_run_hw_queue(hctx, async); 1036 1037 blk_mq_put_ctx(current_ctx); 1038 } 1039 1040 static void blk_mq_insert_requests(struct request_queue *q, 1041 struct blk_mq_ctx *ctx, 1042 struct list_head *list, 1043 int depth, 1044 bool from_schedule) 1045 1046 { 1047 struct blk_mq_hw_ctx *hctx; 1048 struct blk_mq_ctx *current_ctx; 1049 1050 trace_block_unplug(q, depth, !from_schedule); 1051 1052 current_ctx = blk_mq_get_ctx(q); 1053 1054 if (!cpu_online(ctx->cpu)) 1055 ctx = current_ctx; 1056 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1057 1058 /* 1059 * preemption doesn't flush plug list, so it's possible ctx->cpu is 1060 * offline now 1061 */ 1062 spin_lock(&ctx->lock); 1063 while (!list_empty(list)) { 1064 struct request *rq; 1065 1066 rq = list_first_entry(list, struct request, queuelist); 1067 list_del_init(&rq->queuelist); 1068 rq->mq_ctx = ctx; 1069 __blk_mq_insert_request(hctx, rq, false); 1070 } 1071 spin_unlock(&ctx->lock); 1072 1073 blk_mq_run_hw_queue(hctx, from_schedule); 1074 blk_mq_put_ctx(current_ctx); 1075 } 1076 1077 static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b) 1078 { 1079 struct request *rqa = container_of(a, struct request, queuelist); 1080 struct request *rqb = container_of(b, struct request, queuelist); 1081 1082 return !(rqa->mq_ctx < rqb->mq_ctx || 1083 (rqa->mq_ctx == rqb->mq_ctx && 1084 blk_rq_pos(rqa) < blk_rq_pos(rqb))); 1085 } 1086 1087 void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule) 1088 { 1089 struct blk_mq_ctx *this_ctx; 1090 struct request_queue *this_q; 1091 struct request *rq; 1092 LIST_HEAD(list); 1093 LIST_HEAD(ctx_list); 1094 unsigned int depth; 1095 1096 list_splice_init(&plug->mq_list, &list); 1097 1098 list_sort(NULL, &list, plug_ctx_cmp); 1099 1100 this_q = NULL; 1101 this_ctx = NULL; 1102 depth = 0; 1103 1104 while (!list_empty(&list)) { 1105 rq = list_entry_rq(list.next); 1106 list_del_init(&rq->queuelist); 1107 BUG_ON(!rq->q); 1108 if (rq->mq_ctx != this_ctx) { 1109 if (this_ctx) { 1110 blk_mq_insert_requests(this_q, this_ctx, 1111 &ctx_list, depth, 1112 from_schedule); 1113 } 1114 1115 this_ctx = rq->mq_ctx; 1116 this_q = rq->q; 1117 depth = 0; 1118 } 1119 1120 depth++; 1121 list_add_tail(&rq->queuelist, &ctx_list); 1122 } 1123 1124 /* 1125 * If 'this_ctx' is set, we know we have entries to complete 1126 * on 'ctx_list'. Do those. 1127 */ 1128 if (this_ctx) { 1129 blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth, 1130 from_schedule); 1131 } 1132 } 1133 1134 static void blk_mq_bio_to_request(struct request *rq, struct bio *bio) 1135 { 1136 init_request_from_bio(rq, bio); 1137 1138 if (blk_do_io_stat(rq)) 1139 blk_account_io_start(rq, 1); 1140 } 1141 1142 static inline bool hctx_allow_merges(struct blk_mq_hw_ctx *hctx) 1143 { 1144 return (hctx->flags & BLK_MQ_F_SHOULD_MERGE) && 1145 !blk_queue_nomerges(hctx->queue); 1146 } 1147 1148 static inline bool blk_mq_merge_queue_io(struct blk_mq_hw_ctx *hctx, 1149 struct blk_mq_ctx *ctx, 1150 struct request *rq, struct bio *bio) 1151 { 1152 if (!hctx_allow_merges(hctx)) { 1153 blk_mq_bio_to_request(rq, bio); 1154 spin_lock(&ctx->lock); 1155 insert_rq: 1156 __blk_mq_insert_request(hctx, rq, false); 1157 spin_unlock(&ctx->lock); 1158 return false; 1159 } else { 1160 struct request_queue *q = hctx->queue; 1161 1162 spin_lock(&ctx->lock); 1163 if (!blk_mq_attempt_merge(q, ctx, bio)) { 1164 blk_mq_bio_to_request(rq, bio); 1165 goto insert_rq; 1166 } 1167 1168 spin_unlock(&ctx->lock); 1169 __blk_mq_free_request(hctx, ctx, rq); 1170 return true; 1171 } 1172 } 1173 1174 struct blk_map_ctx { 1175 struct blk_mq_hw_ctx *hctx; 1176 struct blk_mq_ctx *ctx; 1177 }; 1178 1179 static struct request *blk_mq_map_request(struct request_queue *q, 1180 struct bio *bio, 1181 struct blk_map_ctx *data) 1182 { 1183 struct blk_mq_hw_ctx *hctx; 1184 struct blk_mq_ctx *ctx; 1185 struct request *rq; 1186 int rw = bio_data_dir(bio); 1187 struct blk_mq_alloc_data alloc_data; 1188 1189 if (unlikely(blk_mq_queue_enter(q))) { 1190 bio_endio(bio, -EIO); 1191 return NULL; 1192 } 1193 1194 ctx = blk_mq_get_ctx(q); 1195 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1196 1197 if (rw_is_sync(bio->bi_rw)) 1198 rw |= REQ_SYNC; 1199 1200 trace_block_getrq(q, bio, rw); 1201 blk_mq_set_alloc_data(&alloc_data, q, GFP_ATOMIC, false, ctx, 1202 hctx); 1203 rq = __blk_mq_alloc_request(&alloc_data, rw); 1204 if (unlikely(!rq)) { 1205 __blk_mq_run_hw_queue(hctx); 1206 blk_mq_put_ctx(ctx); 1207 trace_block_sleeprq(q, bio, rw); 1208 1209 ctx = blk_mq_get_ctx(q); 1210 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1211 blk_mq_set_alloc_data(&alloc_data, q, 1212 __GFP_WAIT|GFP_ATOMIC, false, ctx, hctx); 1213 rq = __blk_mq_alloc_request(&alloc_data, rw); 1214 ctx = alloc_data.ctx; 1215 hctx = alloc_data.hctx; 1216 } 1217 1218 hctx->queued++; 1219 data->hctx = hctx; 1220 data->ctx = ctx; 1221 return rq; 1222 } 1223 1224 /* 1225 * Multiple hardware queue variant. This will not use per-process plugs, 1226 * but will attempt to bypass the hctx queueing if we can go straight to 1227 * hardware for SYNC IO. 1228 */ 1229 static void blk_mq_make_request(struct request_queue *q, struct bio *bio) 1230 { 1231 const int is_sync = rw_is_sync(bio->bi_rw); 1232 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1233 struct blk_map_ctx data; 1234 struct request *rq; 1235 1236 blk_queue_bounce(q, &bio); 1237 1238 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1239 bio_endio(bio, -EIO); 1240 return; 1241 } 1242 1243 rq = blk_mq_map_request(q, bio, &data); 1244 if (unlikely(!rq)) 1245 return; 1246 1247 if (unlikely(is_flush_fua)) { 1248 blk_mq_bio_to_request(rq, bio); 1249 blk_insert_flush(rq); 1250 goto run_queue; 1251 } 1252 1253 /* 1254 * If the driver supports defer issued based on 'last', then 1255 * queue it up like normal since we can potentially save some 1256 * CPU this way. 1257 */ 1258 if (is_sync && !(data.hctx->flags & BLK_MQ_F_DEFER_ISSUE)) { 1259 struct blk_mq_queue_data bd = { 1260 .rq = rq, 1261 .list = NULL, 1262 .last = 1 1263 }; 1264 int ret; 1265 1266 blk_mq_bio_to_request(rq, bio); 1267 1268 /* 1269 * For OK queue, we are done. For error, kill it. Any other 1270 * error (busy), just add it to our list as we previously 1271 * would have done 1272 */ 1273 ret = q->mq_ops->queue_rq(data.hctx, &bd); 1274 if (ret == BLK_MQ_RQ_QUEUE_OK) 1275 goto done; 1276 else { 1277 __blk_mq_requeue_request(rq); 1278 1279 if (ret == BLK_MQ_RQ_QUEUE_ERROR) { 1280 rq->errors = -EIO; 1281 blk_mq_end_request(rq, rq->errors); 1282 goto done; 1283 } 1284 } 1285 } 1286 1287 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1288 /* 1289 * For a SYNC request, send it to the hardware immediately. For 1290 * an ASYNC request, just ensure that we run it later on. The 1291 * latter allows for merging opportunities and more efficient 1292 * dispatching. 1293 */ 1294 run_queue: 1295 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1296 } 1297 done: 1298 blk_mq_put_ctx(data.ctx); 1299 } 1300 1301 /* 1302 * Single hardware queue variant. This will attempt to use any per-process 1303 * plug for merging and IO deferral. 1304 */ 1305 static void blk_sq_make_request(struct request_queue *q, struct bio *bio) 1306 { 1307 const int is_sync = rw_is_sync(bio->bi_rw); 1308 const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA); 1309 unsigned int use_plug, request_count = 0; 1310 struct blk_map_ctx data; 1311 struct request *rq; 1312 1313 /* 1314 * If we have multiple hardware queues, just go directly to 1315 * one of those for sync IO. 1316 */ 1317 use_plug = !is_flush_fua && !is_sync; 1318 1319 blk_queue_bounce(q, &bio); 1320 1321 if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) { 1322 bio_endio(bio, -EIO); 1323 return; 1324 } 1325 1326 if (use_plug && !blk_queue_nomerges(q) && 1327 blk_attempt_plug_merge(q, bio, &request_count)) 1328 return; 1329 1330 rq = blk_mq_map_request(q, bio, &data); 1331 if (unlikely(!rq)) 1332 return; 1333 1334 if (unlikely(is_flush_fua)) { 1335 blk_mq_bio_to_request(rq, bio); 1336 blk_insert_flush(rq); 1337 goto run_queue; 1338 } 1339 1340 /* 1341 * A task plug currently exists. Since this is completely lockless, 1342 * utilize that to temporarily store requests until the task is 1343 * either done or scheduled away. 1344 */ 1345 if (use_plug) { 1346 struct blk_plug *plug = current->plug; 1347 1348 if (plug) { 1349 blk_mq_bio_to_request(rq, bio); 1350 if (list_empty(&plug->mq_list)) 1351 trace_block_plug(q); 1352 else if (request_count >= BLK_MAX_REQUEST_COUNT) { 1353 blk_flush_plug_list(plug, false); 1354 trace_block_plug(q); 1355 } 1356 list_add_tail(&rq->queuelist, &plug->mq_list); 1357 blk_mq_put_ctx(data.ctx); 1358 return; 1359 } 1360 } 1361 1362 if (!blk_mq_merge_queue_io(data.hctx, data.ctx, rq, bio)) { 1363 /* 1364 * For a SYNC request, send it to the hardware immediately. For 1365 * an ASYNC request, just ensure that we run it later on. The 1366 * latter allows for merging opportunities and more efficient 1367 * dispatching. 1368 */ 1369 run_queue: 1370 blk_mq_run_hw_queue(data.hctx, !is_sync || is_flush_fua); 1371 } 1372 1373 blk_mq_put_ctx(data.ctx); 1374 } 1375 1376 /* 1377 * Default mapping to a software queue, since we use one per CPU. 1378 */ 1379 struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu) 1380 { 1381 return q->queue_hw_ctx[q->mq_map[cpu]]; 1382 } 1383 EXPORT_SYMBOL(blk_mq_map_queue); 1384 1385 static void blk_mq_free_rq_map(struct blk_mq_tag_set *set, 1386 struct blk_mq_tags *tags, unsigned int hctx_idx) 1387 { 1388 struct page *page; 1389 1390 if (tags->rqs && set->ops->exit_request) { 1391 int i; 1392 1393 for (i = 0; i < tags->nr_tags; i++) { 1394 if (!tags->rqs[i]) 1395 continue; 1396 set->ops->exit_request(set->driver_data, tags->rqs[i], 1397 hctx_idx, i); 1398 tags->rqs[i] = NULL; 1399 } 1400 } 1401 1402 while (!list_empty(&tags->page_list)) { 1403 page = list_first_entry(&tags->page_list, struct page, lru); 1404 list_del_init(&page->lru); 1405 __free_pages(page, page->private); 1406 } 1407 1408 kfree(tags->rqs); 1409 1410 blk_mq_free_tags(tags); 1411 } 1412 1413 static size_t order_to_size(unsigned int order) 1414 { 1415 return (size_t)PAGE_SIZE << order; 1416 } 1417 1418 static struct blk_mq_tags *blk_mq_init_rq_map(struct blk_mq_tag_set *set, 1419 unsigned int hctx_idx) 1420 { 1421 struct blk_mq_tags *tags; 1422 unsigned int i, j, entries_per_page, max_order = 4; 1423 size_t rq_size, left; 1424 1425 tags = blk_mq_init_tags(set->queue_depth, set->reserved_tags, 1426 set->numa_node, 1427 BLK_MQ_FLAG_TO_ALLOC_POLICY(set->flags)); 1428 if (!tags) 1429 return NULL; 1430 1431 INIT_LIST_HEAD(&tags->page_list); 1432 1433 tags->rqs = kzalloc_node(set->queue_depth * sizeof(struct request *), 1434 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, 1435 set->numa_node); 1436 if (!tags->rqs) { 1437 blk_mq_free_tags(tags); 1438 return NULL; 1439 } 1440 1441 /* 1442 * rq_size is the size of the request plus driver payload, rounded 1443 * to the cacheline size 1444 */ 1445 rq_size = round_up(sizeof(struct request) + set->cmd_size, 1446 cache_line_size()); 1447 left = rq_size * set->queue_depth; 1448 1449 for (i = 0; i < set->queue_depth; ) { 1450 int this_order = max_order; 1451 struct page *page; 1452 int to_do; 1453 void *p; 1454 1455 while (left < order_to_size(this_order - 1) && this_order) 1456 this_order--; 1457 1458 do { 1459 page = alloc_pages_node(set->numa_node, 1460 GFP_KERNEL | __GFP_NOWARN | __GFP_NORETRY, 1461 this_order); 1462 if (page) 1463 break; 1464 if (!this_order--) 1465 break; 1466 if (order_to_size(this_order) < rq_size) 1467 break; 1468 } while (1); 1469 1470 if (!page) 1471 goto fail; 1472 1473 page->private = this_order; 1474 list_add_tail(&page->lru, &tags->page_list); 1475 1476 p = page_address(page); 1477 entries_per_page = order_to_size(this_order) / rq_size; 1478 to_do = min(entries_per_page, set->queue_depth - i); 1479 left -= to_do * rq_size; 1480 for (j = 0; j < to_do; j++) { 1481 tags->rqs[i] = p; 1482 tags->rqs[i]->atomic_flags = 0; 1483 tags->rqs[i]->cmd_flags = 0; 1484 if (set->ops->init_request) { 1485 if (set->ops->init_request(set->driver_data, 1486 tags->rqs[i], hctx_idx, i, 1487 set->numa_node)) { 1488 tags->rqs[i] = NULL; 1489 goto fail; 1490 } 1491 } 1492 1493 p += rq_size; 1494 i++; 1495 } 1496 } 1497 1498 return tags; 1499 1500 fail: 1501 blk_mq_free_rq_map(set, tags, hctx_idx); 1502 return NULL; 1503 } 1504 1505 static void blk_mq_free_bitmap(struct blk_mq_ctxmap *bitmap) 1506 { 1507 kfree(bitmap->map); 1508 } 1509 1510 static int blk_mq_alloc_bitmap(struct blk_mq_ctxmap *bitmap, int node) 1511 { 1512 unsigned int bpw = 8, total, num_maps, i; 1513 1514 bitmap->bits_per_word = bpw; 1515 1516 num_maps = ALIGN(nr_cpu_ids, bpw) / bpw; 1517 bitmap->map = kzalloc_node(num_maps * sizeof(struct blk_align_bitmap), 1518 GFP_KERNEL, node); 1519 if (!bitmap->map) 1520 return -ENOMEM; 1521 1522 bitmap->map_size = num_maps; 1523 1524 total = nr_cpu_ids; 1525 for (i = 0; i < num_maps; i++) { 1526 bitmap->map[i].depth = min(total, bitmap->bits_per_word); 1527 total -= bitmap->map[i].depth; 1528 } 1529 1530 return 0; 1531 } 1532 1533 static int blk_mq_hctx_cpu_offline(struct blk_mq_hw_ctx *hctx, int cpu) 1534 { 1535 struct request_queue *q = hctx->queue; 1536 struct blk_mq_ctx *ctx; 1537 LIST_HEAD(tmp); 1538 1539 /* 1540 * Move ctx entries to new CPU, if this one is going away. 1541 */ 1542 ctx = __blk_mq_get_ctx(q, cpu); 1543 1544 spin_lock(&ctx->lock); 1545 if (!list_empty(&ctx->rq_list)) { 1546 list_splice_init(&ctx->rq_list, &tmp); 1547 blk_mq_hctx_clear_pending(hctx, ctx); 1548 } 1549 spin_unlock(&ctx->lock); 1550 1551 if (list_empty(&tmp)) 1552 return NOTIFY_OK; 1553 1554 ctx = blk_mq_get_ctx(q); 1555 spin_lock(&ctx->lock); 1556 1557 while (!list_empty(&tmp)) { 1558 struct request *rq; 1559 1560 rq = list_first_entry(&tmp, struct request, queuelist); 1561 rq->mq_ctx = ctx; 1562 list_move_tail(&rq->queuelist, &ctx->rq_list); 1563 } 1564 1565 hctx = q->mq_ops->map_queue(q, ctx->cpu); 1566 blk_mq_hctx_mark_pending(hctx, ctx); 1567 1568 spin_unlock(&ctx->lock); 1569 1570 blk_mq_run_hw_queue(hctx, true); 1571 blk_mq_put_ctx(ctx); 1572 return NOTIFY_OK; 1573 } 1574 1575 static int blk_mq_hctx_cpu_online(struct blk_mq_hw_ctx *hctx, int cpu) 1576 { 1577 struct request_queue *q = hctx->queue; 1578 struct blk_mq_tag_set *set = q->tag_set; 1579 1580 if (set->tags[hctx->queue_num]) 1581 return NOTIFY_OK; 1582 1583 set->tags[hctx->queue_num] = blk_mq_init_rq_map(set, hctx->queue_num); 1584 if (!set->tags[hctx->queue_num]) 1585 return NOTIFY_STOP; 1586 1587 hctx->tags = set->tags[hctx->queue_num]; 1588 return NOTIFY_OK; 1589 } 1590 1591 static int blk_mq_hctx_notify(void *data, unsigned long action, 1592 unsigned int cpu) 1593 { 1594 struct blk_mq_hw_ctx *hctx = data; 1595 1596 if (action == CPU_DEAD || action == CPU_DEAD_FROZEN) 1597 return blk_mq_hctx_cpu_offline(hctx, cpu); 1598 else if (action == CPU_ONLINE || action == CPU_ONLINE_FROZEN) 1599 return blk_mq_hctx_cpu_online(hctx, cpu); 1600 1601 return NOTIFY_OK; 1602 } 1603 1604 static void blk_mq_exit_hctx(struct request_queue *q, 1605 struct blk_mq_tag_set *set, 1606 struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) 1607 { 1608 unsigned flush_start_tag = set->queue_depth; 1609 1610 blk_mq_tag_idle(hctx); 1611 1612 if (set->ops->exit_request) 1613 set->ops->exit_request(set->driver_data, 1614 hctx->fq->flush_rq, hctx_idx, 1615 flush_start_tag + hctx_idx); 1616 1617 if (set->ops->exit_hctx) 1618 set->ops->exit_hctx(hctx, hctx_idx); 1619 1620 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1621 blk_free_flush_queue(hctx->fq); 1622 kfree(hctx->ctxs); 1623 blk_mq_free_bitmap(&hctx->ctx_map); 1624 } 1625 1626 static void blk_mq_exit_hw_queues(struct request_queue *q, 1627 struct blk_mq_tag_set *set, int nr_queue) 1628 { 1629 struct blk_mq_hw_ctx *hctx; 1630 unsigned int i; 1631 1632 queue_for_each_hw_ctx(q, hctx, i) { 1633 if (i == nr_queue) 1634 break; 1635 blk_mq_exit_hctx(q, set, hctx, i); 1636 } 1637 } 1638 1639 static void blk_mq_free_hw_queues(struct request_queue *q, 1640 struct blk_mq_tag_set *set) 1641 { 1642 struct blk_mq_hw_ctx *hctx; 1643 unsigned int i; 1644 1645 queue_for_each_hw_ctx(q, hctx, i) 1646 free_cpumask_var(hctx->cpumask); 1647 } 1648 1649 static int blk_mq_init_hctx(struct request_queue *q, 1650 struct blk_mq_tag_set *set, 1651 struct blk_mq_hw_ctx *hctx, unsigned hctx_idx) 1652 { 1653 int node; 1654 unsigned flush_start_tag = set->queue_depth; 1655 1656 node = hctx->numa_node; 1657 if (node == NUMA_NO_NODE) 1658 node = hctx->numa_node = set->numa_node; 1659 1660 INIT_DELAYED_WORK(&hctx->run_work, blk_mq_run_work_fn); 1661 INIT_DELAYED_WORK(&hctx->delay_work, blk_mq_delay_work_fn); 1662 spin_lock_init(&hctx->lock); 1663 INIT_LIST_HEAD(&hctx->dispatch); 1664 hctx->queue = q; 1665 hctx->queue_num = hctx_idx; 1666 hctx->flags = set->flags; 1667 1668 blk_mq_init_cpu_notifier(&hctx->cpu_notifier, 1669 blk_mq_hctx_notify, hctx); 1670 blk_mq_register_cpu_notifier(&hctx->cpu_notifier); 1671 1672 hctx->tags = set->tags[hctx_idx]; 1673 1674 /* 1675 * Allocate space for all possible cpus to avoid allocation at 1676 * runtime 1677 */ 1678 hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *), 1679 GFP_KERNEL, node); 1680 if (!hctx->ctxs) 1681 goto unregister_cpu_notifier; 1682 1683 if (blk_mq_alloc_bitmap(&hctx->ctx_map, node)) 1684 goto free_ctxs; 1685 1686 hctx->nr_ctx = 0; 1687 1688 if (set->ops->init_hctx && 1689 set->ops->init_hctx(hctx, set->driver_data, hctx_idx)) 1690 goto free_bitmap; 1691 1692 hctx->fq = blk_alloc_flush_queue(q, hctx->numa_node, set->cmd_size); 1693 if (!hctx->fq) 1694 goto exit_hctx; 1695 1696 if (set->ops->init_request && 1697 set->ops->init_request(set->driver_data, 1698 hctx->fq->flush_rq, hctx_idx, 1699 flush_start_tag + hctx_idx, node)) 1700 goto free_fq; 1701 1702 return 0; 1703 1704 free_fq: 1705 kfree(hctx->fq); 1706 exit_hctx: 1707 if (set->ops->exit_hctx) 1708 set->ops->exit_hctx(hctx, hctx_idx); 1709 free_bitmap: 1710 blk_mq_free_bitmap(&hctx->ctx_map); 1711 free_ctxs: 1712 kfree(hctx->ctxs); 1713 unregister_cpu_notifier: 1714 blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier); 1715 1716 return -1; 1717 } 1718 1719 static int blk_mq_init_hw_queues(struct request_queue *q, 1720 struct blk_mq_tag_set *set) 1721 { 1722 struct blk_mq_hw_ctx *hctx; 1723 unsigned int i; 1724 1725 /* 1726 * Initialize hardware queues 1727 */ 1728 queue_for_each_hw_ctx(q, hctx, i) { 1729 if (blk_mq_init_hctx(q, set, hctx, i)) 1730 break; 1731 } 1732 1733 if (i == q->nr_hw_queues) 1734 return 0; 1735 1736 /* 1737 * Init failed 1738 */ 1739 blk_mq_exit_hw_queues(q, set, i); 1740 1741 return 1; 1742 } 1743 1744 static void blk_mq_init_cpu_queues(struct request_queue *q, 1745 unsigned int nr_hw_queues) 1746 { 1747 unsigned int i; 1748 1749 for_each_possible_cpu(i) { 1750 struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i); 1751 struct blk_mq_hw_ctx *hctx; 1752 1753 memset(__ctx, 0, sizeof(*__ctx)); 1754 __ctx->cpu = i; 1755 spin_lock_init(&__ctx->lock); 1756 INIT_LIST_HEAD(&__ctx->rq_list); 1757 __ctx->queue = q; 1758 1759 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1760 if (!cpu_online(i)) 1761 continue; 1762 1763 hctx = q->mq_ops->map_queue(q, i); 1764 cpumask_set_cpu(i, hctx->cpumask); 1765 hctx->nr_ctx++; 1766 1767 /* 1768 * Set local node, IFF we have more than one hw queue. If 1769 * not, we remain on the home node of the device 1770 */ 1771 if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE) 1772 hctx->numa_node = cpu_to_node(i); 1773 } 1774 } 1775 1776 static void blk_mq_map_swqueue(struct request_queue *q) 1777 { 1778 unsigned int i; 1779 struct blk_mq_hw_ctx *hctx; 1780 struct blk_mq_ctx *ctx; 1781 1782 queue_for_each_hw_ctx(q, hctx, i) { 1783 cpumask_clear(hctx->cpumask); 1784 hctx->nr_ctx = 0; 1785 } 1786 1787 /* 1788 * Map software to hardware queues 1789 */ 1790 queue_for_each_ctx(q, ctx, i) { 1791 /* If the cpu isn't online, the cpu is mapped to first hctx */ 1792 if (!cpu_online(i)) 1793 continue; 1794 1795 hctx = q->mq_ops->map_queue(q, i); 1796 cpumask_set_cpu(i, hctx->cpumask); 1797 ctx->index_hw = hctx->nr_ctx; 1798 hctx->ctxs[hctx->nr_ctx++] = ctx; 1799 } 1800 1801 queue_for_each_hw_ctx(q, hctx, i) { 1802 /* 1803 * If no software queues are mapped to this hardware queue, 1804 * disable it and free the request entries. 1805 */ 1806 if (!hctx->nr_ctx) { 1807 struct blk_mq_tag_set *set = q->tag_set; 1808 1809 if (set->tags[i]) { 1810 blk_mq_free_rq_map(set, set->tags[i], i); 1811 set->tags[i] = NULL; 1812 hctx->tags = NULL; 1813 } 1814 continue; 1815 } 1816 1817 /* 1818 * Initialize batch roundrobin counts 1819 */ 1820 hctx->next_cpu = cpumask_first(hctx->cpumask); 1821 hctx->next_cpu_batch = BLK_MQ_CPU_WORK_BATCH; 1822 } 1823 } 1824 1825 static void blk_mq_update_tag_set_depth(struct blk_mq_tag_set *set) 1826 { 1827 struct blk_mq_hw_ctx *hctx; 1828 struct request_queue *q; 1829 bool shared; 1830 int i; 1831 1832 if (set->tag_list.next == set->tag_list.prev) 1833 shared = false; 1834 else 1835 shared = true; 1836 1837 list_for_each_entry(q, &set->tag_list, tag_set_list) { 1838 blk_mq_freeze_queue(q); 1839 1840 queue_for_each_hw_ctx(q, hctx, i) { 1841 if (shared) 1842 hctx->flags |= BLK_MQ_F_TAG_SHARED; 1843 else 1844 hctx->flags &= ~BLK_MQ_F_TAG_SHARED; 1845 } 1846 blk_mq_unfreeze_queue(q); 1847 } 1848 } 1849 1850 static void blk_mq_del_queue_tag_set(struct request_queue *q) 1851 { 1852 struct blk_mq_tag_set *set = q->tag_set; 1853 1854 mutex_lock(&set->tag_list_lock); 1855 list_del_init(&q->tag_set_list); 1856 blk_mq_update_tag_set_depth(set); 1857 mutex_unlock(&set->tag_list_lock); 1858 } 1859 1860 static void blk_mq_add_queue_tag_set(struct blk_mq_tag_set *set, 1861 struct request_queue *q) 1862 { 1863 q->tag_set = set; 1864 1865 mutex_lock(&set->tag_list_lock); 1866 list_add_tail(&q->tag_set_list, &set->tag_list); 1867 blk_mq_update_tag_set_depth(set); 1868 mutex_unlock(&set->tag_list_lock); 1869 } 1870 1871 /* 1872 * It is the actual release handler for mq, but we do it from 1873 * request queue's release handler for avoiding use-after-free 1874 * and headache because q->mq_kobj shouldn't have been introduced, 1875 * but we can't group ctx/kctx kobj without it. 1876 */ 1877 void blk_mq_release(struct request_queue *q) 1878 { 1879 struct blk_mq_hw_ctx *hctx; 1880 unsigned int i; 1881 1882 /* hctx kobj stays in hctx */ 1883 queue_for_each_hw_ctx(q, hctx, i) 1884 kfree(hctx); 1885 1886 kfree(q->queue_hw_ctx); 1887 1888 /* ctx kobj stays in queue_ctx */ 1889 free_percpu(q->queue_ctx); 1890 } 1891 1892 struct request_queue *blk_mq_init_queue(struct blk_mq_tag_set *set) 1893 { 1894 struct blk_mq_hw_ctx **hctxs; 1895 struct blk_mq_ctx __percpu *ctx; 1896 struct request_queue *q; 1897 unsigned int *map; 1898 int i; 1899 1900 ctx = alloc_percpu(struct blk_mq_ctx); 1901 if (!ctx) 1902 return ERR_PTR(-ENOMEM); 1903 1904 hctxs = kmalloc_node(set->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL, 1905 set->numa_node); 1906 1907 if (!hctxs) 1908 goto err_percpu; 1909 1910 map = blk_mq_make_queue_map(set); 1911 if (!map) 1912 goto err_map; 1913 1914 for (i = 0; i < set->nr_hw_queues; i++) { 1915 int node = blk_mq_hw_queue_to_node(map, i); 1916 1917 hctxs[i] = kzalloc_node(sizeof(struct blk_mq_hw_ctx), 1918 GFP_KERNEL, node); 1919 if (!hctxs[i]) 1920 goto err_hctxs; 1921 1922 if (!zalloc_cpumask_var_node(&hctxs[i]->cpumask, GFP_KERNEL, 1923 node)) 1924 goto err_hctxs; 1925 1926 atomic_set(&hctxs[i]->nr_active, 0); 1927 hctxs[i]->numa_node = node; 1928 hctxs[i]->queue_num = i; 1929 } 1930 1931 q = blk_alloc_queue_node(GFP_KERNEL, set->numa_node); 1932 if (!q) 1933 goto err_hctxs; 1934 1935 /* 1936 * Init percpu_ref in atomic mode so that it's faster to shutdown. 1937 * See blk_register_queue() for details. 1938 */ 1939 if (percpu_ref_init(&q->mq_usage_counter, blk_mq_usage_counter_release, 1940 PERCPU_REF_INIT_ATOMIC, GFP_KERNEL)) 1941 goto err_map; 1942 1943 setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q); 1944 blk_queue_rq_timeout(q, 30000); 1945 1946 q->nr_queues = nr_cpu_ids; 1947 q->nr_hw_queues = set->nr_hw_queues; 1948 q->mq_map = map; 1949 1950 q->queue_ctx = ctx; 1951 q->queue_hw_ctx = hctxs; 1952 1953 q->mq_ops = set->ops; 1954 q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT; 1955 1956 if (!(set->flags & BLK_MQ_F_SG_MERGE)) 1957 q->queue_flags |= 1 << QUEUE_FLAG_NO_SG_MERGE; 1958 1959 q->sg_reserved_size = INT_MAX; 1960 1961 INIT_WORK(&q->requeue_work, blk_mq_requeue_work); 1962 INIT_LIST_HEAD(&q->requeue_list); 1963 spin_lock_init(&q->requeue_lock); 1964 1965 if (q->nr_hw_queues > 1) 1966 blk_queue_make_request(q, blk_mq_make_request); 1967 else 1968 blk_queue_make_request(q, blk_sq_make_request); 1969 1970 if (set->timeout) 1971 blk_queue_rq_timeout(q, set->timeout); 1972 1973 /* 1974 * Do this after blk_queue_make_request() overrides it... 1975 */ 1976 q->nr_requests = set->queue_depth; 1977 1978 if (set->ops->complete) 1979 blk_queue_softirq_done(q, set->ops->complete); 1980 1981 blk_mq_init_cpu_queues(q, set->nr_hw_queues); 1982 1983 if (blk_mq_init_hw_queues(q, set)) 1984 goto err_hw; 1985 1986 mutex_lock(&all_q_mutex); 1987 list_add_tail(&q->all_q_node, &all_q_list); 1988 mutex_unlock(&all_q_mutex); 1989 1990 blk_mq_add_queue_tag_set(set, q); 1991 1992 blk_mq_map_swqueue(q); 1993 1994 return q; 1995 1996 err_hw: 1997 blk_cleanup_queue(q); 1998 err_hctxs: 1999 kfree(map); 2000 for (i = 0; i < set->nr_hw_queues; i++) { 2001 if (!hctxs[i]) 2002 break; 2003 free_cpumask_var(hctxs[i]->cpumask); 2004 kfree(hctxs[i]); 2005 } 2006 err_map: 2007 kfree(hctxs); 2008 err_percpu: 2009 free_percpu(ctx); 2010 return ERR_PTR(-ENOMEM); 2011 } 2012 EXPORT_SYMBOL(blk_mq_init_queue); 2013 2014 void blk_mq_free_queue(struct request_queue *q) 2015 { 2016 struct blk_mq_tag_set *set = q->tag_set; 2017 2018 blk_mq_del_queue_tag_set(q); 2019 2020 blk_mq_exit_hw_queues(q, set, set->nr_hw_queues); 2021 blk_mq_free_hw_queues(q, set); 2022 2023 percpu_ref_exit(&q->mq_usage_counter); 2024 2025 kfree(q->mq_map); 2026 2027 q->mq_map = NULL; 2028 2029 mutex_lock(&all_q_mutex); 2030 list_del_init(&q->all_q_node); 2031 mutex_unlock(&all_q_mutex); 2032 } 2033 2034 /* Basically redo blk_mq_init_queue with queue frozen */ 2035 static void blk_mq_queue_reinit(struct request_queue *q) 2036 { 2037 WARN_ON_ONCE(!q->mq_freeze_depth); 2038 2039 blk_mq_sysfs_unregister(q); 2040 2041 blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues); 2042 2043 /* 2044 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe 2045 * we should change hctx numa_node according to new topology (this 2046 * involves free and re-allocate memory, worthy doing?) 2047 */ 2048 2049 blk_mq_map_swqueue(q); 2050 2051 blk_mq_sysfs_register(q); 2052 } 2053 2054 static int blk_mq_queue_reinit_notify(struct notifier_block *nb, 2055 unsigned long action, void *hcpu) 2056 { 2057 struct request_queue *q; 2058 2059 /* 2060 * Before new mappings are established, hotadded cpu might already 2061 * start handling requests. This doesn't break anything as we map 2062 * offline CPUs to first hardware queue. We will re-init the queue 2063 * below to get optimal settings. 2064 */ 2065 if (action != CPU_DEAD && action != CPU_DEAD_FROZEN && 2066 action != CPU_ONLINE && action != CPU_ONLINE_FROZEN) 2067 return NOTIFY_OK; 2068 2069 mutex_lock(&all_q_mutex); 2070 2071 /* 2072 * We need to freeze and reinit all existing queues. Freezing 2073 * involves synchronous wait for an RCU grace period and doing it 2074 * one by one may take a long time. Start freezing all queues in 2075 * one swoop and then wait for the completions so that freezing can 2076 * take place in parallel. 2077 */ 2078 list_for_each_entry(q, &all_q_list, all_q_node) 2079 blk_mq_freeze_queue_start(q); 2080 list_for_each_entry(q, &all_q_list, all_q_node) 2081 blk_mq_freeze_queue_wait(q); 2082 2083 list_for_each_entry(q, &all_q_list, all_q_node) 2084 blk_mq_queue_reinit(q); 2085 2086 list_for_each_entry(q, &all_q_list, all_q_node) 2087 blk_mq_unfreeze_queue(q); 2088 2089 mutex_unlock(&all_q_mutex); 2090 return NOTIFY_OK; 2091 } 2092 2093 static int __blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 2094 { 2095 int i; 2096 2097 for (i = 0; i < set->nr_hw_queues; i++) { 2098 set->tags[i] = blk_mq_init_rq_map(set, i); 2099 if (!set->tags[i]) 2100 goto out_unwind; 2101 } 2102 2103 return 0; 2104 2105 out_unwind: 2106 while (--i >= 0) 2107 blk_mq_free_rq_map(set, set->tags[i], i); 2108 2109 return -ENOMEM; 2110 } 2111 2112 /* 2113 * Allocate the request maps associated with this tag_set. Note that this 2114 * may reduce the depth asked for, if memory is tight. set->queue_depth 2115 * will be updated to reflect the allocated depth. 2116 */ 2117 static int blk_mq_alloc_rq_maps(struct blk_mq_tag_set *set) 2118 { 2119 unsigned int depth; 2120 int err; 2121 2122 depth = set->queue_depth; 2123 do { 2124 err = __blk_mq_alloc_rq_maps(set); 2125 if (!err) 2126 break; 2127 2128 set->queue_depth >>= 1; 2129 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) { 2130 err = -ENOMEM; 2131 break; 2132 } 2133 } while (set->queue_depth); 2134 2135 if (!set->queue_depth || err) { 2136 pr_err("blk-mq: failed to allocate request map\n"); 2137 return -ENOMEM; 2138 } 2139 2140 if (depth != set->queue_depth) 2141 pr_info("blk-mq: reduced tag depth (%u -> %u)\n", 2142 depth, set->queue_depth); 2143 2144 return 0; 2145 } 2146 2147 /* 2148 * Alloc a tag set to be associated with one or more request queues. 2149 * May fail with EINVAL for various error conditions. May adjust the 2150 * requested depth down, if if it too large. In that case, the set 2151 * value will be stored in set->queue_depth. 2152 */ 2153 int blk_mq_alloc_tag_set(struct blk_mq_tag_set *set) 2154 { 2155 BUILD_BUG_ON(BLK_MQ_MAX_DEPTH > 1 << BLK_MQ_UNIQUE_TAG_BITS); 2156 2157 if (!set->nr_hw_queues) 2158 return -EINVAL; 2159 if (!set->queue_depth) 2160 return -EINVAL; 2161 if (set->queue_depth < set->reserved_tags + BLK_MQ_TAG_MIN) 2162 return -EINVAL; 2163 2164 if (!set->nr_hw_queues || !set->ops->queue_rq || !set->ops->map_queue) 2165 return -EINVAL; 2166 2167 if (set->queue_depth > BLK_MQ_MAX_DEPTH) { 2168 pr_info("blk-mq: reduced tag depth to %u\n", 2169 BLK_MQ_MAX_DEPTH); 2170 set->queue_depth = BLK_MQ_MAX_DEPTH; 2171 } 2172 2173 /* 2174 * If a crashdump is active, then we are potentially in a very 2175 * memory constrained environment. Limit us to 1 queue and 2176 * 64 tags to prevent using too much memory. 2177 */ 2178 if (is_kdump_kernel()) { 2179 set->nr_hw_queues = 1; 2180 set->queue_depth = min(64U, set->queue_depth); 2181 } 2182 2183 set->tags = kmalloc_node(set->nr_hw_queues * 2184 sizeof(struct blk_mq_tags *), 2185 GFP_KERNEL, set->numa_node); 2186 if (!set->tags) 2187 return -ENOMEM; 2188 2189 if (blk_mq_alloc_rq_maps(set)) 2190 goto enomem; 2191 2192 mutex_init(&set->tag_list_lock); 2193 INIT_LIST_HEAD(&set->tag_list); 2194 2195 return 0; 2196 enomem: 2197 kfree(set->tags); 2198 set->tags = NULL; 2199 return -ENOMEM; 2200 } 2201 EXPORT_SYMBOL(blk_mq_alloc_tag_set); 2202 2203 void blk_mq_free_tag_set(struct blk_mq_tag_set *set) 2204 { 2205 int i; 2206 2207 for (i = 0; i < set->nr_hw_queues; i++) { 2208 if (set->tags[i]) 2209 blk_mq_free_rq_map(set, set->tags[i], i); 2210 } 2211 2212 kfree(set->tags); 2213 set->tags = NULL; 2214 } 2215 EXPORT_SYMBOL(blk_mq_free_tag_set); 2216 2217 int blk_mq_update_nr_requests(struct request_queue *q, unsigned int nr) 2218 { 2219 struct blk_mq_tag_set *set = q->tag_set; 2220 struct blk_mq_hw_ctx *hctx; 2221 int i, ret; 2222 2223 if (!set || nr > set->queue_depth) 2224 return -EINVAL; 2225 2226 ret = 0; 2227 queue_for_each_hw_ctx(q, hctx, i) { 2228 ret = blk_mq_tag_update_depth(hctx->tags, nr); 2229 if (ret) 2230 break; 2231 } 2232 2233 if (!ret) 2234 q->nr_requests = nr; 2235 2236 return ret; 2237 } 2238 2239 void blk_mq_disable_hotplug(void) 2240 { 2241 mutex_lock(&all_q_mutex); 2242 } 2243 2244 void blk_mq_enable_hotplug(void) 2245 { 2246 mutex_unlock(&all_q_mutex); 2247 } 2248 2249 static int __init blk_mq_init(void) 2250 { 2251 blk_mq_cpu_init(); 2252 2253 hotcpu_notifier(blk_mq_queue_reinit_notify, 0); 2254 2255 return 0; 2256 } 2257 subsys_initcall(blk_mq_init); 2258