#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/delay.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);

static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
					   unsigned int cpu)
{
	return per_cpu_ptr(q->queue_ctx, cpu);
}

/*
 * This assumes per-cpu software queues. They could be per-node as well,
 * for instance; for now this is hardcoded as-is. Note that we don't care
 * about preemption, since we know the ctx's are persistent. This does mean
 * that we can't rely on ctx always matching the currently running CPU.
 */
static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
	return __blk_mq_get_ctx(q, get_cpu());
}

static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
{
	put_cpu();
}

/*
 * Check if any of the ctx's have pending work in this hardware queue
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	unsigned int i;

	for (i = 0; i < hctx->nr_ctx_map; i++)
		if (hctx->ctx_map[i])
			return true;

	return false;
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	if (!test_bit(ctx->index_hw, hctx->ctx_map))
		set_bit(ctx->index_hw, hctx->ctx_map);
}

static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
					      gfp_t gfp, bool reserved)
{
	struct request *rq;
	unsigned int tag;

	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
	if (tag != BLK_MQ_TAG_FAIL) {
		rq = hctx->rqs[tag];
		rq->tag = tag;

		return rq;
	}

	return NULL;
}

static int blk_mq_queue_enter(struct request_queue *q)
{
	int ret;

	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
	smp_wmb();
	/* we have problems freezing the queue if it's still initializing */
	if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
		return 0;

	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);

	spin_lock_irq(q->queue_lock);
	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
		!blk_queue_bypass(q) || blk_queue_dying(q),
		*q->queue_lock);
	/* inc usage with the lock held, so a concurrent freeze_queue can't run */
	if (!ret && !blk_queue_dying(q))
		__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
	else if (blk_queue_dying(q))
		ret = -ENODEV;
	spin_unlock_irq(q->queue_lock);

	return ret;
}

static void blk_mq_queue_exit(struct request_queue *q)
{
	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
}

static void __blk_mq_drain_queue(struct request_queue *q)
{
	while (true) {
		s64 count;

		spin_lock_irq(q->queue_lock);
		count = percpu_counter_sum(&q->mq_usage_counter);
		spin_unlock_irq(q->queue_lock);

		if (count == 0)
			break;
		blk_mq_run_queues(q, false);
		msleep(10);
	}
}
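
/*
 * Illustrative note (no new code): the drain above works because every
 * request holds a reference on mq_usage_counter for its whole lifetime.
 * blk_mq_make_request() and blk_mq_alloc_request() take the reference via
 * blk_mq_queue_enter(), and __blk_mq_free_request() drops it again through
 * blk_mq_queue_exit(), roughly:
 *
 *	if (blk_mq_queue_enter(q))	// counter++, or fail if dying
 *		return;
 *	...				// request is allocated and queued
 *	blk_mq_free_request(rq);	// counter--, lets the drain finish
 */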

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
static void blk_mq_freeze_queue(struct request_queue *q)
{
	bool drain;

	spin_lock_irq(q->queue_lock);
	drain = !q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);
	spin_unlock_irq(q->queue_lock);

	if (drain)
		__blk_mq_drain_queue(q);
}

void blk_mq_drain_queue(struct request_queue *q)
{
	__blk_mq_drain_queue(q);
}

static void blk_mq_unfreeze_queue(struct request_queue *q)
{
	bool wake = false;

	spin_lock_irq(q->queue_lock);
	if (!--q->bypass_depth) {
		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
		wake = true;
	}
	WARN_ON_ONCE(q->bypass_depth < 0);
	spin_unlock_irq(q->queue_lock);
	if (wake)
		wake_up_all(&q->mq_freeze_wq);
}

bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
	return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);

static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
			       struct request *rq, unsigned int rw_flags)
{
	if (blk_queue_io_stat(q))
		rw_flags |= REQ_IO_STAT;

	rq->mq_ctx = ctx;
	rq->cmd_flags = rw_flags;
	rq->start_time = jiffies;
	set_start_time_ns(rq);
	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
}

static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
						   int rw, gfp_t gfp,
						   bool reserved)
{
	struct request *rq;

	do {
		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);

		rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
		if (rq) {
			blk_mq_rq_ctx_init(q, ctx, rq, rw);
			break;
		}

		blk_mq_put_ctx(ctx);
		if (!(gfp & __GFP_WAIT))
			break;

		__blk_mq_run_hw_queue(hctx);
		blk_mq_wait_for_tags(hctx->tags);
	} while (1);

	return rq;
}

struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
{
	struct request *rq;

	if (blk_mq_queue_enter(q))
		return NULL;

	rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
	if (rq)
		blk_mq_put_ctx(rq->mq_ctx);
	return rq;
}

struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
					      gfp_t gfp)
{
	struct request *rq;

	if (blk_mq_queue_enter(q))
		return NULL;

	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
	if (rq)
		blk_mq_put_ctx(rq->mq_ctx);
	return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_reserved_request);

/*
 * Re-init and set pdu, if we have it
 */
void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	blk_rq_init(hctx->queue, rq);

	if (hctx->cmd_size)
		rq->special = blk_mq_rq_to_pdu(rq);
}

static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx, struct request *rq)
{
	const int tag = rq->tag;
	struct request_queue *q = rq->q;

	blk_mq_rq_init(hctx, rq);
	blk_mq_put_tag(hctx->tags, tag);

	blk_mq_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx;
	struct request_queue *q = rq->q;

	ctx->rq_completed[rq_is_sync(rq)]++;

	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	__blk_mq_free_request(hctx, ctx, rq);
}
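
/*
 * Illustrative driver-side completion flow (hypothetical "my_drv", not part
 * of this file): a driver typically calls blk_mq_complete_request() from its
 * interrupt handler and finishes the request from its reg->ops->complete
 * callback, which blk_mq_init_queue() installs via blk_queue_softirq_done():
 *
 *	static void my_drv_complete(struct request *rq)
 *	{
 *		blk_mq_end_io(rq, rq->errors);
 *	}
 */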

bool
blk_mq_end_io_partial(struct request *rq, int error, unsigned int nr_bytes)
{
	if (blk_update_request(rq, error, nr_bytes))
		return true;

	blk_account_io_done(rq);

	if (rq->end_io)
		rq->end_io(rq, error);
	else
		blk_mq_free_request(rq);
	return false;
}
EXPORT_SYMBOL(blk_mq_end_io_partial);

static void __blk_mq_complete_request_remote(void *data)
{
	struct request *rq = data;

	rq->q->softirq_done_fn(rq);
}

void __blk_mq_complete_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	int cpu;

	if (!ctx->ipi_redirect) {
		rq->q->softirq_done_fn(rq);
		return;
	}

	cpu = get_cpu();
	if (cpu != ctx->cpu && cpu_online(ctx->cpu)) {
		rq->csd.func = __blk_mq_complete_request_remote;
		rq->csd.info = rq;
		rq->csd.flags = 0;
		smp_call_function_single_async(ctx->cpu, &rq->csd);
	} else {
		rq->q->softirq_done_fn(rq);
	}
	put_cpu();
}

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:		the request being processed
 *
 * Description:
 *	Ends all I/O on a request. It does not handle partial completions.
 *	The actual completion happens out-of-order, through an IPI handler.
 **/
void blk_mq_complete_request(struct request *rq)
{
	if (unlikely(blk_should_fake_timeout(rq->q)))
		return;
	if (!blk_mark_rq_complete(rq))
		__blk_mq_complete_request(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);

static void blk_mq_start_request(struct request *rq, bool last)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(q, rq);

	/*
	 * Just mark start time and set the started bit. Due to memory
	 * ordering, we know we'll see the correct deadline as long as
	 * REQ_ATOM_STARTED is seen.
	 */
	rq->deadline = jiffies + q->rq_timeout;
	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);

	if (q->dma_drain_size && blk_rq_bytes(rq)) {
		/*
		 * Make sure space for the drain appears. We know we can do
		 * this because max_hw_segments has been adjusted to be one
		 * fewer than the device can handle.
		 */
		rq->nr_phys_segments++;
	}

	/*
	 * Flag the last request in the series so that drivers know when IO
	 * should be kicked off, if they don't do it on a per-request basis.
	 *
	 * Note: the flag isn't the only condition for kicking off IO. If the
	 * drive is busy, the last request might not have the bit set.
	 */
	if (last)
		rq->cmd_flags |= REQ_END;
}

static void blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_requeue(q, rq);
	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);

	rq->cmd_flags &= ~REQ_END;

	if (q->dma_drain_size && blk_rq_bytes(rq))
		rq->nr_phys_segments--;
}

struct blk_mq_timeout_data {
	struct blk_mq_hw_ctx *hctx;
	unsigned long *next;
	unsigned int *next_set;
};

static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
{
	struct blk_mq_timeout_data *data = __data;
	struct blk_mq_hw_ctx *hctx = data->hctx;
	unsigned int tag;

	/*
	 * It may not be in flight yet (this is where the REQ_ATOM_STARTED
	 * flag comes in). The requests are statically allocated, so we know
	 * it's always safe to access the memory associated with a bit offset
	 * into ->rqs[].
	 */
	tag = 0;
	do {
		struct request *rq;

		tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
		if (tag >= hctx->queue_depth)
			break;

		rq = hctx->rqs[tag++];

		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
			continue;

		blk_rq_check_expired(rq, data->next, data->next_set);
	} while (1);
}

static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
					unsigned long *next,
					unsigned int *next_set)
{
	struct blk_mq_timeout_data data = {
		.hctx		= hctx,
		.next		= next,
		.next_set	= next_set,
	};

	/*
	 * Ask the tagging code to iterate busy requests, so we can
	 * check them for timeout.
	 */
	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
}

static void blk_mq_rq_timer(unsigned long data)
{
	struct request_queue *q = (struct request_queue *) data;
	struct blk_mq_hw_ctx *hctx;
	unsigned long next = 0;
	int i, next_set = 0;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);

	if (next_set)
		mod_timer(&q->timeout, round_jiffies_up(next));
}

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		int el_ret;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		el_ret = blk_try_merge(rq, bio);
		if (el_ret == ELEVATOR_BACK_MERGE) {
			if (bio_attempt_back_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
			if (bio_attempt_front_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		}
	}

	return false;
}

void blk_mq_add_timer(struct request *rq)
{
	__blk_add_timer(rq, NULL);
}

/*
 * Run this hardware queue, pulling any software queues mapped to it in.
 * Note that this function currently has various problems around ordering
 * of IO. In particular, we'd like FIFO behaviour on handling existing
 * items on the hctx->dispatch list. Ignore that for now.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_ctx *ctx;
	struct request *rq;
	LIST_HEAD(rq_list);
	int bit, queued;

	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	hctx->run++;

	/*
	 * Touch any software queue that has pending entries.
	 */
	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
		clear_bit(bit, hctx->ctx_map);
		ctx = hctx->ctxs[bit];
		BUG_ON(bit != ctx->index_hw);

		spin_lock(&ctx->lock);
		list_splice_tail_init(&ctx->rq_list, &rq_list);
		spin_unlock(&ctx->lock);
	}

	/*
	 * If we have previous entries on our dispatch list, grab them
	 * and stuff them at the front for more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Delete and return all entries from our dispatch list
	 */
	queued = 0;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	while (!list_empty(&rq_list)) {
		int ret;

		rq = list_first_entry(&rq_list, struct request, queuelist);
		list_del_init(&rq->queuelist);

		blk_mq_start_request(rq, list_empty(&rq_list));

		ret = q->mq_ops->queue_rq(hctx, rq);
		switch (ret) {
		case BLK_MQ_RQ_QUEUE_OK:
			queued++;
			continue;
		case BLK_MQ_RQ_QUEUE_BUSY:
			/*
			 * FIXME: we should have a mechanism to stop the queue
			 * like blk_stop_queue, otherwise we will waste cpu
			 * time
			 */
			list_add(&rq->queuelist, &rq_list);
			blk_mq_requeue_request(rq);
			break;
		default:
			pr_err("blk-mq: bad return on queue: %d\n", ret);
		case BLK_MQ_RQ_QUEUE_ERROR:
			rq->errors = -EIO;
			blk_mq_end_io(rq, rq->errors);
			break;
		}

		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
			break;
	}

	if (!queued)
		hctx->dispatched[0]++;
	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
		hctx->dispatched[ilog2(queued) + 1]++;

	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(&rq_list)) {
		spin_lock(&hctx->lock);
		list_splice(&rq_list, &hctx->dispatch);
		spin_unlock(&hctx->lock);
	}
}

void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	if (!async)
		__blk_mq_run_hw_queue(hctx);
	else {
		struct request_queue *q = hctx->queue;

		kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
	}
}

void blk_mq_run_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if ((!blk_mq_hctx_has_pending(hctx) &&
		    list_empty_careful(&hctx->dispatch)) ||
		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_run_queues);

void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	cancel_delayed_work(&hctx->delayed_work);
	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);

void blk_mq_stop_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);

void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
	__blk_mq_run_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_stopped_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
		blk_mq_run_hw_queue(hctx, true);
	}
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);
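
/*
 * Illustrative use of the stop/start helpers above (hypothetical "my_drv"):
 * a driver whose hardware ring is full typically stops the hw queue and
 * reports BUSY from ->queue_rq(), then restarts once completions make room:
 *
 *	// in ->queue_rq(), when the device ring is full:
 *	blk_mq_stop_hw_queue(hctx);
 *	return BLK_MQ_RQ_QUEUE_BUSY;
 *
 *	// in the completion path, once space is available again:
 *	blk_mq_start_stopped_hw_queues(q);
 */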

static void blk_mq_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
	__blk_mq_run_hw_queue(hctx);
}

static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
				    struct request *rq, bool at_head)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;

	trace_block_rq_insert(hctx->queue, rq);

	if (at_head)
		list_add(&rq->queuelist, &ctx->rq_list);
	else
		list_add_tail(&rq->queuelist, &ctx->rq_list);
	blk_mq_hctx_mark_pending(hctx, ctx);

	/*
	 * We do this early, to ensure we are on the right CPU.
	 */
	blk_mq_add_timer(rq);
}

void blk_mq_insert_request(struct request *rq, bool at_head, bool run_queue,
			   bool async)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx = rq->mq_ctx, *current_ctx;

	current_ctx = blk_mq_get_ctx(q);
	if (!cpu_online(ctx->cpu))
		rq->mq_ctx = ctx = current_ctx;

	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA) &&
	    !(rq->cmd_flags & (REQ_FLUSH_SEQ))) {
		blk_insert_flush(rq);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

	blk_mq_put_ctx(current_ctx);

	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

static void blk_mq_insert_requests(struct request_queue *q,
				   struct blk_mq_ctx *ctx,
				   struct list_head *list,
				   int depth,
				   bool from_schedule)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *current_ctx;

	trace_block_unplug(q, depth, !from_schedule);

	current_ctx = blk_mq_get_ctx(q);

	if (!cpu_online(ctx->cpu))
		ctx = current_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/*
	 * preemption doesn't flush the plug list, so it's possible that
	 * ctx->cpu is offline now
	 */
	spin_lock(&ctx->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		rq->mq_ctx = ctx;
		__blk_mq_insert_request(hctx, rq, false);
	}
	spin_unlock(&ctx->lock);

	blk_mq_put_ctx(current_ctx);

	blk_mq_run_hw_queue(hctx, from_schedule);
}

static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return !(rqa->mq_ctx < rqb->mq_ctx ||
		 (rqa->mq_ctx == rqb->mq_ctx &&
		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
}

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct blk_mq_ctx *this_ctx;
	struct request_queue *this_q;
	struct request *rq;
	LIST_HEAD(list);
	LIST_HEAD(ctx_list);
	unsigned int depth;

	list_splice_init(&plug->mq_list, &list);

	list_sort(NULL, &list, plug_ctx_cmp);

	this_q = NULL;
	this_ctx = NULL;
	depth = 0;

	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		if (rq->mq_ctx != this_ctx) {
			if (this_ctx) {
				blk_mq_insert_requests(this_q, this_ctx,
						       &ctx_list, depth,
						       from_schedule);
			}

			this_ctx = rq->mq_ctx;
			this_q = rq->q;
			depth = 0;
		}

		depth++;
		list_add_tail(&rq->queuelist, &ctx_list);
	}

	/*
	 * If 'this_ctx' is set, we know we have entries to complete
	 * on 'ctx_list'. Do those.
	 */
	if (this_ctx) {
		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				       from_schedule);
	}
}
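
/*
 * Illustrative flow (no new code): blk_mq_make_request() below parks
 * requests on current->plug->mq_list; when the task unplugs or schedules,
 * blk_flush_plug_list() hands that list to blk_mq_flush_plug_list() above,
 * which sorts it by (ctx, sector) via plug_ctx_cmp() and then issues one
 * blk_mq_insert_requests() call per consecutive run of the same ctx.
 */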

static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
	init_request_from_bio(rq, bio);
	blk_account_io_start(rq, 1);
}

static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	const int is_sync = rw_is_sync(bio->bi_rw);
	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
	int rw = bio_data_dir(bio);
	struct request *rq;
	unsigned int use_plug, request_count = 0;

	/*
	 * If we have multiple hardware queues, just go directly to
	 * one of those for sync IO.
	 */
	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);

	blk_queue_bounce(q, &bio);

	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		bio_endio(bio, -EIO);
		return;
	}

	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
		return;

	if (blk_mq_queue_enter(q)) {
		bio_endio(bio, -EIO);
		return;
	}

	ctx = blk_mq_get_ctx(q);
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	if (is_sync)
		rw |= REQ_SYNC;
	trace_block_getrq(q, bio, rw);
	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
	if (likely(rq))
		blk_mq_rq_ctx_init(q, ctx, rq, rw);
	else {
		blk_mq_put_ctx(ctx);
		trace_block_sleeprq(q, bio, rw);
		rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
						 false);
		ctx = rq->mq_ctx;
		hctx = q->mq_ops->map_queue(q, ctx->cpu);
	}

	hctx->queued++;

	if (unlikely(is_flush_fua)) {
		blk_mq_bio_to_request(rq, bio);
		blk_mq_put_ctx(ctx);
		blk_insert_flush(rq);
		goto run_queue;
	}

	/*
	 * A task plug currently exists. Since this is completely lockless,
	 * utilize that to temporarily store requests until the task is
	 * either done or scheduled away.
	 */
	if (use_plug) {
		struct blk_plug *plug = current->plug;

		if (plug) {
			blk_mq_bio_to_request(rq, bio);
			if (list_empty(&plug->mq_list))
				trace_block_plug(q);
			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
			list_add_tail(&rq->queuelist, &plug->mq_list);
			blk_mq_put_ctx(ctx);
			return;
		}
	}

	spin_lock(&ctx->lock);

	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
	    blk_mq_attempt_merge(q, ctx, bio))
		__blk_mq_free_request(hctx, ctx, rq);
	else {
		blk_mq_bio_to_request(rq, bio);
		__blk_mq_insert_request(hctx, rq, false);
	}

	spin_unlock(&ctx->lock);
	blk_mq_put_ctx(ctx);

	/*
	 * For a SYNC request, send it to the hardware immediately. For an
	 * ASYNC request, just ensure that we run it later on. The latter
	 * allows for merging opportunities and more efficient dispatching.
	 */
run_queue:
	blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
}
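
/*
 * Illustrative example of the CPU-to-queue mapping consumed below (values
 * are hypothetical): on a 4-CPU machine registered with nr_hw_queues == 2,
 * blk_mq_make_queue_map() might produce
 *
 *	q->mq_map[] = { 0, 0, 1, 1 };
 *
 * so blk_mq_map_queue(q, 2) returns q->queue_hw_ctx[1], and the per-CPU
 * software queues of CPUs 0-1 funnel into hardware queue 0.
 */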

/*
 * Default mapping to a software queue, since we use one per CPU.
 */
struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
{
	return q->queue_hw_ctx[q->mq_map[cpu]];
}
EXPORT_SYMBOL(blk_mq_map_queue);

struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
						   unsigned int hctx_index)
{
	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
			    GFP_KERNEL | __GFP_ZERO, reg->numa_node);
}
EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);

void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
				 unsigned int hctx_index)
{
	kfree(hctx);
}
EXPORT_SYMBOL(blk_mq_free_single_hw_queue);

static void blk_mq_hctx_notify(void *data, unsigned long action,
			       unsigned int cpu)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct request_queue *q = hctx->queue;
	struct blk_mq_ctx *ctx;
	LIST_HEAD(tmp);

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return;

	/*
	 * Move ctx entries to new CPU, if this one is going away.
	 */
	ctx = __blk_mq_get_ctx(q, cpu);

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_list)) {
		list_splice_init(&ctx->rq_list, &tmp);
		clear_bit(ctx->index_hw, hctx->ctx_map);
	}
	spin_unlock(&ctx->lock);

	if (list_empty(&tmp))
		return;

	ctx = blk_mq_get_ctx(q);
	spin_lock(&ctx->lock);

	while (!list_empty(&tmp)) {
		struct request *rq;

		rq = list_first_entry(&tmp, struct request, queuelist);
		rq->mq_ctx = ctx;
		list_move_tail(&rq->queuelist, &ctx->rq_list);
	}

	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	blk_mq_hctx_mark_pending(hctx, ctx);

	spin_unlock(&ctx->lock);
	blk_mq_put_ctx(ctx);

	blk_mq_run_hw_queue(hctx, true);
}

static int blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
				   int (*init)(void *, struct blk_mq_hw_ctx *,
					       struct request *, unsigned int),
				   void *data)
{
	unsigned int i;
	int ret = 0;

	for (i = 0; i < hctx->queue_depth; i++) {
		struct request *rq = hctx->rqs[i];

		ret = init(data, hctx, rq, i);
		if (ret)
			break;
	}

	return ret;
}

int blk_mq_init_commands(struct request_queue *q,
			 int (*init)(void *, struct blk_mq_hw_ctx *,
				     struct request *, unsigned int),
			 void *data)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;
	int ret = 0;

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_init_hw_commands(hctx, init, data);
		if (ret)
			break;
	}

	return ret;
}
EXPORT_SYMBOL(blk_mq_init_commands);

static void blk_mq_free_hw_commands(struct blk_mq_hw_ctx *hctx,
				    void (*free)(void *, struct blk_mq_hw_ctx *,
						 struct request *, unsigned int),
				    void *data)
{
	unsigned int i;

	for (i = 0; i < hctx->queue_depth; i++) {
		struct request *rq = hctx->rqs[i];

		free(data, hctx, rq, i);
	}
}

void blk_mq_free_commands(struct request_queue *q,
			  void (*free)(void *, struct blk_mq_hw_ctx *,
				       struct request *, unsigned int),
			  void *data)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_free_hw_commands(hctx, free, data);
}
EXPORT_SYMBOL(blk_mq_free_commands);
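
/*
 * Illustrative driver-side use of blk_mq_init_commands() (hypothetical
 * "my_drv"): the init callback runs once per preallocated request and
 * usually just wires up the per-request PDU that sits behind the request:
 *
 *	static int my_drv_init_cmd(void *data, struct blk_mq_hw_ctx *hctx,
 *				   struct request *rq, unsigned int idx)
 *	{
 *		struct my_drv_cmd *cmd = blk_mq_rq_to_pdu(rq);
 *
 *		cmd->dev = data;
 *		return 0;
 *	}
 *
 *	blk_mq_init_commands(q, my_drv_init_cmd, my_dev);
 */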

static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
{
	struct page *page;

	while (!list_empty(&hctx->page_list)) {
		page = list_first_entry(&hctx->page_list, struct page, lru);
		list_del_init(&page->lru);
		__free_pages(page, page->private);
	}

	kfree(hctx->rqs);

	if (hctx->tags)
		blk_mq_free_tags(hctx->tags);
}

static size_t order_to_size(unsigned int order)
{
	size_t ret = PAGE_SIZE;

	while (order--)
		ret *= 2;

	return ret;
}

static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
			      unsigned int reserved_tags, int node)
{
	unsigned int i, j, entries_per_page, max_order = 4;
	size_t rq_size, left;

	INIT_LIST_HEAD(&hctx->page_list);

	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
				 GFP_KERNEL, node);
	if (!hctx->rqs)
		return -ENOMEM;

	/*
	 * rq_size is the size of the request plus driver payload, rounded
	 * to the cacheline size
	 */
	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
			   cache_line_size());
	left = rq_size * hctx->queue_depth;

	for (i = 0; i < hctx->queue_depth;) {
		int this_order = max_order;
		struct page *page;
		int to_do;
		void *p;

		while (left < order_to_size(this_order - 1) && this_order)
			this_order--;

		do {
			page = alloc_pages_node(node, GFP_KERNEL, this_order);
			if (page)
				break;
			if (!this_order--)
				break;
			if (order_to_size(this_order) < rq_size)
				break;
		} while (1);

		if (!page)
			break;

		page->private = this_order;
		list_add_tail(&page->lru, &hctx->page_list);

		p = page_address(page);
		entries_per_page = order_to_size(this_order) / rq_size;
		to_do = min(entries_per_page, hctx->queue_depth - i);
		left -= to_do * rq_size;
		for (j = 0; j < to_do; j++) {
			hctx->rqs[i] = p;
			blk_mq_rq_init(hctx, hctx->rqs[i]);
			p += rq_size;
			i++;
		}
	}

	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
		goto err_rq_map;
	else if (i != hctx->queue_depth) {
		hctx->queue_depth = i;
		pr_warn("%s: queue depth set to %u because of low memory\n",
			__func__, i);
	}

	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
	if (!hctx->tags) {
err_rq_map:
		blk_mq_free_rq_map(hctx);
		return -ENOMEM;
	}

	return 0;
}

static int blk_mq_init_hw_queues(struct request_queue *q,
				 struct blk_mq_reg *reg, void *driver_data)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i, j;

	/*
	 * Initialize hardware queues
	 */
	queue_for_each_hw_ctx(q, hctx, i) {
		unsigned int num_maps;
		int node;

		node = hctx->numa_node;
		if (node == NUMA_NO_NODE)
			node = hctx->numa_node = reg->numa_node;

		INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
		spin_lock_init(&hctx->lock);
		INIT_LIST_HEAD(&hctx->dispatch);
		hctx->queue = q;
		hctx->queue_num = i;
		hctx->flags = reg->flags;
		hctx->queue_depth = reg->queue_depth;
		hctx->cmd_size = reg->cmd_size;

		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
					 blk_mq_hctx_notify, hctx);
		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);

		if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
			break;

		/*
		 * Allocate space for all possible CPUs to avoid allocation
		 * at runtime
		 */
		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
					  GFP_KERNEL, node);
		if (!hctx->ctxs)
			break;

		num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
		hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
					     GFP_KERNEL, node);
		if (!hctx->ctx_map)
			break;

		hctx->nr_ctx_map = num_maps;
		hctx->nr_ctx = 0;

		if (reg->ops->init_hctx &&
		    reg->ops->init_hctx(hctx, driver_data, i))
			break;
	}

	if (i == q->nr_hw_queues)
		return 0;

	/*
	 * Init failed
	 */
	queue_for_each_hw_ctx(q, hctx, j) {
		if (i == j)
			break;

		if (reg->ops->exit_hctx)
			reg->ops->exit_hctx(hctx, j);

		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		blk_mq_free_rq_map(hctx);
		kfree(hctx->ctxs);
	}

	return 1;
}

static void blk_mq_init_cpu_queues(struct request_queue *q,
				   unsigned int nr_hw_queues)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
		struct blk_mq_hw_ctx *hctx;

		memset(__ctx, 0, sizeof(*__ctx));
		__ctx->cpu = i;
		spin_lock_init(&__ctx->lock);
		INIT_LIST_HEAD(&__ctx->rq_list);
		__ctx->queue = q;

		/* If the cpu isn't online, it is mapped to the first hctx */
		hctx = q->mq_ops->map_queue(q, i);
		hctx->nr_ctx++;

		if (!cpu_online(i))
			continue;

		/*
		 * Set local node, IFF we have more than one hw queue. If
		 * not, we remain on the home node of the device
		 */
		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
			hctx->numa_node = cpu_to_node(i);
	}
}

static void blk_mq_map_swqueue(struct request_queue *q)
{
	unsigned int i;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;

	queue_for_each_hw_ctx(q, hctx, i) {
		hctx->nr_ctx = 0;
	}

	/*
	 * Map software to hardware queues
	 */
	queue_for_each_ctx(q, ctx, i) {
		/* If the cpu isn't online, it is mapped to the first hctx */
		hctx = q->mq_ops->map_queue(q, i);
		ctx->index_hw = hctx->nr_ctx;
		hctx->ctxs[hctx->nr_ctx++] = ctx;
	}
}

struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
					void *driver_data)
{
	struct blk_mq_hw_ctx **hctxs;
	struct blk_mq_ctx *ctx;
	struct request_queue *q;
	int i;

	if (!reg->nr_hw_queues ||
	    !reg->ops->queue_rq || !reg->ops->map_queue ||
	    !reg->ops->alloc_hctx || !reg->ops->free_hctx)
		return ERR_PTR(-EINVAL);

	if (!reg->queue_depth)
		reg->queue_depth = BLK_MQ_MAX_DEPTH;
	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_err("blk-mq: queue depth too large (%u)\n", reg->queue_depth);
		reg->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
		return ERR_PTR(-EINVAL);

	ctx = alloc_percpu(struct blk_mq_ctx);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
			     reg->numa_node);

	if (!hctxs)
		goto err_percpu;

	for (i = 0; i < reg->nr_hw_queues; i++) {
		hctxs[i] = reg->ops->alloc_hctx(reg, i);
		if (!hctxs[i])
			goto err_hctxs;

		hctxs[i]->numa_node = NUMA_NO_NODE;
		hctxs[i]->queue_num = i;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
	if (!q)
		goto err_hctxs;

	q->mq_map = blk_mq_make_queue_map(reg);
	if (!q->mq_map)
		goto err_map;

	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
	blk_queue_rq_timeout(q, 30000);

	q->nr_queues = nr_cpu_ids;
	q->nr_hw_queues = reg->nr_hw_queues;

	q->queue_ctx = ctx;
	q->queue_hw_ctx = hctxs;

	q->mq_ops = reg->ops;
	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

	q->sg_reserved_size = INT_MAX;

	blk_queue_make_request(q, blk_mq_make_request);
	blk_queue_rq_timed_out(q, reg->ops->timeout);
	if (reg->timeout)
		blk_queue_rq_timeout(q, reg->timeout);

	if (reg->ops->complete)
		blk_queue_softirq_done(q, reg->ops->complete);

	blk_mq_init_flush(q);
	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);

	q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
				       cache_line_size()), GFP_KERNEL);
	if (!q->flush_rq)
		goto err_hw;

	if (blk_mq_init_hw_queues(q, reg, driver_data))
		goto err_flush_rq;

	blk_mq_map_swqueue(q);

	mutex_lock(&all_q_mutex);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return q;

err_flush_rq:
	kfree(q->flush_rq);
err_hw:
	kfree(q->mq_map);
err_map:
	blk_cleanup_queue(q);
err_hctxs:
	for (i = 0; i < reg->nr_hw_queues; i++) {
		if (!hctxs[i])
			break;
		reg->ops->free_hctx(hctxs[i], i);
	}
	kfree(hctxs);
err_percpu:
	free_percpu(ctx);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_queue);

void blk_mq_free_queue(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		kfree(hctx->ctx_map);
		kfree(hctx->ctxs);
		blk_mq_free_rq_map(hctx);
		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		if (q->mq_ops->exit_hctx)
			q->mq_ops->exit_hctx(hctx, i);
		q->mq_ops->free_hctx(hctx, i);
	}

	free_percpu(q->queue_ctx);
	kfree(q->queue_hw_ctx);
	kfree(q->mq_map);

	q->queue_ctx = NULL;
	q->queue_hw_ctx = NULL;
	q->mq_map = NULL;

	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);
}

/* Basically redo blk_mq_init_queue with the queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q)
{
	blk_mq_freeze_queue(q);

	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);

	/*
	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
	 * we should change hctx numa_node according to the new topology (this
	 * involves freeing and re-allocating memory; is it worth doing?)
	 */

	blk_mq_map_swqueue(q);

	blk_mq_unfreeze_queue(q);
}

static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
				      unsigned long action, void *hcpu)
{
	struct request_queue *q;

	/*
	 * Before the new mapping is established, a hotadded cpu might already
	 * have started handling requests. This doesn't break anything, as we
	 * map offline CPUs to the first hardware queue. We will re-init the
	 * queue below to get optimal settings.
	 */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
		return NOTIFY_OK;

	mutex_lock(&all_q_mutex);
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_queue_reinit(q);
	mutex_unlock(&all_q_mutex);
	return NOTIFY_OK;
}

void blk_mq_disable_hotplug(void)
{
	mutex_lock(&all_q_mutex);
}

void blk_mq_enable_hotplug(void)
{
	mutex_unlock(&all_q_mutex);
}

static int __init blk_mq_init(void)
{
	blk_mq_cpu_init();

	/* Must be called after percpu_counter_hotcpu_callback() */
	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);

	return 0;
}
subsys_initcall(blk_mq_init);
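
/*
 * Illustrative end-to-end registration sketch (hypothetical "my_drv"; not
 * part of this file): a driver describes its queues in a struct blk_mq_reg
 * and hands it to blk_mq_init_queue(), reusing the default single-queue
 * helpers exported above where it has no special needs:
 *
 *	static struct blk_mq_ops my_drv_mq_ops = {
 *		.queue_rq	= my_drv_queue_rq,
 *		.map_queue	= blk_mq_map_queue,
 *		.alloc_hctx	= blk_mq_alloc_single_hw_queue,
 *		.free_hctx	= blk_mq_free_single_hw_queue,
 *		.complete	= my_drv_complete,
 *	};
 *
 *	static struct blk_mq_reg my_drv_mq_reg = {
 *		.ops		= &my_drv_mq_ops,
 *		.nr_hw_queues	= 1,
 *		.queue_depth	= 64,
 *		.cmd_size	= sizeof(struct my_drv_cmd),
 *		.numa_node	= NUMA_NO_NODE,
 *	};
 *
 *	q = blk_mq_init_queue(&my_drv_mq_reg, my_dev);
 *	if (IS_ERR(q))
 *		return PTR_ERR(q);
 */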