#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/delay.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);

static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
					   unsigned int cpu)
{
	return per_cpu_ptr(q->queue_ctx, cpu);
}

/*
 * This assumes per-cpu software queueing queues. They could be per-node
 * as well, for instance. For now this is hardcoded as-is. Note that we don't
 * care about preemption, since we know the ctx's are persistent. This does
 * mean that we can't rely on ctx always matching the currently running CPU.
 */
static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
	return __blk_mq_get_ctx(q, get_cpu());
}

static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
{
	put_cpu();
}

/*
 * Check if any of the ctx's have pending work in this hardware queue
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	unsigned int i;

	for (i = 0; i < hctx->nr_ctx_map; i++)
		if (hctx->ctx_map[i])
			return true;

	return false;
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	if (!test_bit(ctx->index_hw, hctx->ctx_map))
		set_bit(ctx->index_hw, hctx->ctx_map);
}

static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp,
				       bool reserved)
{
	struct request *rq;
	unsigned int tag;

	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
	if (tag != BLK_MQ_TAG_FAIL) {
		rq = hctx->rqs[tag];
		rq->tag = tag;

		return rq;
	}

	return NULL;
}

static int blk_mq_queue_enter(struct request_queue *q)
{
	int ret;

	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
	smp_wmb();
	/* we have problems freezing the queue if it is still initializing */
	if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
		return 0;

	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);

	spin_lock_irq(q->queue_lock);
	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
		!blk_queue_bypass(q) || blk_queue_dying(q),
		*q->queue_lock);
	/* inc usage with the lock held to avoid a concurrent freeze_queue here */
	if (!ret && !blk_queue_dying(q))
		__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
	else if (blk_queue_dying(q))
		ret = -ENODEV;
	spin_unlock_irq(q->queue_lock);

	return ret;
}

static void blk_mq_queue_exit(struct request_queue *q)
{
	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
}

static void __blk_mq_drain_queue(struct request_queue *q)
{
	while (true) {
		s64 count;

		spin_lock_irq(q->queue_lock);
		count = percpu_counter_sum(&q->mq_usage_counter);
		spin_unlock_irq(q->queue_lock);

		if (count == 0)
			break;
		blk_mq_run_queues(q, false);
		msleep(10);
	}
}

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
static void blk_mq_freeze_queue(struct request_queue *q)
{
	bool drain;

	spin_lock_irq(q->queue_lock);
	drain = !q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);
	spin_unlock_irq(q->queue_lock);

	if (drain)
		__blk_mq_drain_queue(q);
}

void blk_mq_drain_queue(struct request_queue *q)
{
	__blk_mq_drain_queue(q);
}

static void blk_mq_unfreeze_queue(struct request_queue *q)
{
	bool wake = false;

	spin_lock_irq(q->queue_lock);
	if (!--q->bypass_depth) {
		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
		wake = true;
	}
	WARN_ON_ONCE(q->bypass_depth < 0);
	spin_unlock_irq(q->queue_lock);
	if (wake)
		wake_up_all(&q->mq_freeze_wq);
}

bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
	return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);

static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
			       struct request *rq, unsigned int rw_flags)
{
	if (blk_queue_io_stat(q))
		rw_flags |= REQ_IO_STAT;

	rq->mq_ctx = ctx;
	rq->cmd_flags = rw_flags;
	rq->start_time = jiffies;
	set_start_time_ns(rq);
	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
}

static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
					      gfp_t gfp, bool reserved)
{
	return blk_mq_alloc_rq(hctx, gfp, reserved);
}

static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
						   int rw, gfp_t gfp,
						   bool reserved)
{
	struct request *rq;

	do {
		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);

		rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
		if (rq) {
			blk_mq_rq_ctx_init(q, ctx, rq, rw);
			break;
		}

		blk_mq_put_ctx(ctx);
		if (!(gfp & __GFP_WAIT))
			break;

		__blk_mq_run_hw_queue(hctx);
		blk_mq_wait_for_tags(hctx->tags);
	} while (1);

	return rq;
}

struct request *blk_mq_alloc_request(struct request_queue *q, int rw, gfp_t gfp)
{
	struct request *rq;

	if (blk_mq_queue_enter(q))
		return NULL;

	rq = blk_mq_alloc_request_pinned(q, rw, gfp, false);
	if (rq)
		blk_mq_put_ctx(rq->mq_ctx);
	return rq;
}

struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
					      gfp_t gfp)
{
	struct request *rq;

	if (blk_mq_queue_enter(q))
		return NULL;

	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
	if (rq)
		blk_mq_put_ctx(rq->mq_ctx);
	return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_reserved_request);

/*
 * Re-init and set pdu, if we have it
 */
void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	blk_rq_init(hctx->queue, rq);

	if (hctx->cmd_size)
		rq->special = blk_mq_rq_to_pdu(rq);
}

static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx, struct request *rq)
{
	const int tag = rq->tag;
	struct request_queue *q = rq->q;

	blk_mq_rq_init(hctx, rq);
	blk_mq_put_tag(hctx->tags, tag);

	blk_mq_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx;
	struct request_queue *q = rq->q;

	ctx->rq_completed[rq_is_sync(rq)]++;

	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	__blk_mq_free_request(hctx, ctx, rq);
}

static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = -EIO;

	if (unlikely(rq->cmd_flags & REQ_QUIET))
		set_bit(BIO_QUIET, &bio->bi_flags);

	/* don't actually finish bio if it's part of flush sequence */
	if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
		bio_endio(bio, error);
}

void blk_mq_end_io(struct request *rq, int error)
{
	struct bio *bio = rq->bio;
	unsigned int bytes = 0;

	trace_block_rq_complete(rq->q, rq);

	while (bio) {
		struct bio *next = bio->bi_next;

		bio->bi_next = NULL;
		bytes += bio->bi_iter.bi_size;
		blk_mq_bio_endio(rq, bio, error);
		bio = next;
	}

	blk_account_io_completion(rq, bytes);

	blk_account_io_done(rq);

	if (rq->end_io)
		rq->end_io(rq, error);
	else
		blk_mq_free_request(rq);
}
EXPORT_SYMBOL(blk_mq_end_io);

static void __blk_mq_complete_request_remote(void *data)
{
	struct request *rq = data;

	rq->q->softirq_done_fn(rq);
}

void __blk_mq_complete_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	int cpu;

	if (!ctx->ipi_redirect) {
		rq->q->softirq_done_fn(rq);
		return;
	}

	cpu = get_cpu();
	if (cpu != ctx->cpu && cpu_online(ctx->cpu)) {
		rq->csd.func = __blk_mq_complete_request_remote;
		rq->csd.info = rq;
		rq->csd.flags = 0;
		__smp_call_function_single(ctx->cpu, &rq->csd, 0);
	} else {
		rq->q->softirq_done_fn(rq);
	}
	put_cpu();
}

/**
 * blk_mq_complete_request - end I/O on a request
 * @rq:		the request being processed
 *
 * Description:
 *	Ends all I/O on a request. It does not handle partial completions.
 *	The actual completion happens out-of-order, through an IPI handler.
 **/
void blk_mq_complete_request(struct request *rq)
{
	if (unlikely(blk_should_fake_timeout(rq->q)))
		return;
	if (!blk_mark_rq_complete(rq))
		__blk_mq_complete_request(rq);
}
EXPORT_SYMBOL(blk_mq_complete_request);

static void blk_mq_start_request(struct request *rq, bool last)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(q, rq);

	/*
	 * Just mark start time and set the started bit. Due to memory
	 * ordering, we know we'll see the correct deadline as long as
	 * REQ_ATOM_STARTED is seen.
	 */
	rq->deadline = jiffies + q->rq_timeout;
	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);

	if (q->dma_drain_size && blk_rq_bytes(rq)) {
		/*
		 * Make sure space for the drain appears. We know we can do
		 * this because max_hw_segments has been adjusted to be one
		 * fewer than the device can handle.
		 */
		rq->nr_phys_segments++;
	}

	/*
	 * Flag the last request in the series so that drivers know when IO
	 * should be kicked off, if they don't do it on a per-request basis.
	 *
	 * Note: the flag isn't the only condition on which drivers should
	 * kick off IO. If the drive is busy, the last request might not have
	 * the bit set.
	 */
	if (last)
		rq->cmd_flags |= REQ_END;
}

static void blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_requeue(q, rq);
	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);

	rq->cmd_flags &= ~REQ_END;

	if (q->dma_drain_size && blk_rq_bytes(rq))
		rq->nr_phys_segments--;
}

struct blk_mq_timeout_data {
	struct blk_mq_hw_ctx *hctx;
	unsigned long *next;
	unsigned int *next_set;
};

static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
{
	struct blk_mq_timeout_data *data = __data;
	struct blk_mq_hw_ctx *hctx = data->hctx;
	unsigned int tag;

	/* It may not be in flight yet (this is where
	 * the REQ_ATOM_STARTED flag comes in). The requests are
	 * statically allocated, so we know it's always safe to access the
	 * memory associated with a bit offset into ->rqs[].
	 */
	tag = 0;
	do {
		struct request *rq;

		tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
		if (tag >= hctx->queue_depth)
			break;

		rq = hctx->rqs[tag++];

		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
			continue;

		blk_rq_check_expired(rq, data->next, data->next_set);
	} while (1);
}

static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
					unsigned long *next,
					unsigned int *next_set)
{
	struct blk_mq_timeout_data data = {
		.hctx		= hctx,
		.next		= next,
		.next_set	= next_set,
	};

	/*
	 * Ask the tagging code to iterate busy requests, so we can
	 * check them for timeout.
	 */
	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
}

static void blk_mq_rq_timer(unsigned long data)
{
	struct request_queue *q = (struct request_queue *) data;
	struct blk_mq_hw_ctx *hctx;
	unsigned long next = 0;
	int i, next_set = 0;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);

	if (next_set)
		mod_timer(&q->timeout, round_jiffies_up(next));
}

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		int el_ret;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		el_ret = blk_try_merge(rq, bio);
		if (el_ret == ELEVATOR_BACK_MERGE) {
			if (bio_attempt_back_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
			if (bio_attempt_front_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		}
	}

	return false;
}

void blk_mq_add_timer(struct request *rq)
{
	__blk_add_timer(rq, NULL);
}

/*
 * Run this hardware queue, pulling any software queues mapped to it in.
 * Note that this function currently has various problems around ordering
 * of IO. In particular, we'd like FIFO behaviour on handling existing
 * items on the hctx->dispatch list. Ignore that for now.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_ctx *ctx;
	struct request *rq;
	LIST_HEAD(rq_list);
	int bit, queued;

	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	hctx->run++;

	/*
	 * Touch any software queue that has pending entries.
	 */
	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
		clear_bit(bit, hctx->ctx_map);
		ctx = hctx->ctxs[bit];
		BUG_ON(bit != ctx->index_hw);

		spin_lock(&ctx->lock);
		list_splice_tail_init(&ctx->rq_list, &rq_list);
		spin_unlock(&ctx->lock);
	}

	/*
	 * If we have previous entries on our dispatch list, grab them
	 * and stuff them at the front for more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Delete and return all entries from our dispatch list
	 */
	queued = 0;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	while (!list_empty(&rq_list)) {
		int ret;

		rq = list_first_entry(&rq_list, struct request, queuelist);
		list_del_init(&rq->queuelist);

		blk_mq_start_request(rq, list_empty(&rq_list));

		ret = q->mq_ops->queue_rq(hctx, rq);
		switch (ret) {
		case BLK_MQ_RQ_QUEUE_OK:
			queued++;
			continue;
		case BLK_MQ_RQ_QUEUE_BUSY:
			/*
			 * FIXME: we should have a mechanism to stop the queue
			 * like blk_stop_queue, otherwise we will waste cpu
			 * time
			 */
			list_add(&rq->queuelist, &rq_list);
			blk_mq_requeue_request(rq);
			break;
		default:
			pr_err("blk-mq: bad return on queue: %d\n", ret);
		case BLK_MQ_RQ_QUEUE_ERROR:
			rq->errors = -EIO;
			blk_mq_end_io(rq, rq->errors);
			break;
		}

		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
			break;
	}

	if (!queued)
		hctx->dispatched[0]++;
	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
		hctx->dispatched[ilog2(queued) + 1]++;

	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(&rq_list)) {
		spin_lock(&hctx->lock);
		list_splice(&rq_list, &hctx->dispatch);
		spin_unlock(&hctx->lock);
	}
}

void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	if (!async)
		__blk_mq_run_hw_queue(hctx);
	else {
		struct request_queue *q = hctx->queue;

		kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
	}
}

void blk_mq_run_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if ((!blk_mq_hctx_has_pending(hctx) &&
		    list_empty_careful(&hctx->dispatch)) ||
		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_run_queues);

void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	cancel_delayed_work(&hctx->delayed_work);
	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);

void blk_mq_stop_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);

void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
	__blk_mq_run_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_stopped_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
		blk_mq_run_hw_queue(hctx, true);
	}
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);

static void blk_mq_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
	__blk_mq_run_hw_queue(hctx);
}

static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
				    struct request *rq, bool at_head)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;

	trace_block_rq_insert(hctx->queue, rq);

	if (at_head)
		list_add(&rq->queuelist, &ctx->rq_list);
	else
		list_add_tail(&rq->queuelist, &ctx->rq_list);
	blk_mq_hctx_mark_pending(hctx, ctx);

	/*
	 * We do this early, to ensure we are on the right CPU.
	 */
	blk_mq_add_timer(rq);
}

void blk_mq_insert_request(struct request_queue *q, struct request *rq,
			   bool at_head, bool run_queue)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx, *current_ctx;

	ctx = rq->mq_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
		blk_insert_flush(rq);
	} else {
		current_ctx = blk_mq_get_ctx(q);

		if (!cpu_online(ctx->cpu)) {
			ctx = current_ctx;
			hctx = q->mq_ops->map_queue(q, ctx->cpu);
			rq->mq_ctx = ctx;
		}
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);

		blk_mq_put_ctx(current_ctx);
	}

	if (run_queue)
		__blk_mq_run_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_insert_request);

/*
 * This is a special version of blk_mq_insert_request to bypass FLUSH request
 * check. Should only be used internally.
 */
void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx, *current_ctx;

	current_ctx = blk_mq_get_ctx(q);

	ctx = rq->mq_ctx;
	if (!cpu_online(ctx->cpu)) {
		ctx = current_ctx;
		rq->mq_ctx = ctx;
	}
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/* ctx->cpu might be offline */
	spin_lock(&ctx->lock);
	__blk_mq_insert_request(hctx, rq, false);
	spin_unlock(&ctx->lock);

	blk_mq_put_ctx(current_ctx);

	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

static void blk_mq_insert_requests(struct request_queue *q,
				   struct blk_mq_ctx *ctx,
				   struct list_head *list,
				   int depth,
				   bool from_schedule)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *current_ctx;

	trace_block_unplug(q, depth, !from_schedule);

	current_ctx = blk_mq_get_ctx(q);

	if (!cpu_online(ctx->cpu))
		ctx = current_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/*
	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
	 * offline now
	 */
	spin_lock(&ctx->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		rq->mq_ctx = ctx;
		__blk_mq_insert_request(hctx, rq, false);
	}
	spin_unlock(&ctx->lock);

	blk_mq_put_ctx(current_ctx);

	blk_mq_run_hw_queue(hctx, from_schedule);
}

static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return !(rqa->mq_ctx < rqb->mq_ctx ||
		 (rqa->mq_ctx == rqb->mq_ctx &&
		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
}

void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct blk_mq_ctx *this_ctx;
	struct request_queue *this_q;
	struct request *rq;
	LIST_HEAD(list);
	LIST_HEAD(ctx_list);
	unsigned int depth;

	list_splice_init(&plug->mq_list, &list);

	list_sort(NULL, &list, plug_ctx_cmp);

	this_q = NULL;
	this_ctx = NULL;
	depth = 0;

	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		if (rq->mq_ctx != this_ctx) {
			if (this_ctx) {
				blk_mq_insert_requests(this_q, this_ctx,
							&ctx_list, depth,
							from_schedule);
			}

			this_ctx = rq->mq_ctx;
			this_q = rq->q;
			depth = 0;
		}

		depth++;
		list_add_tail(&rq->queuelist, &ctx_list);
	}

	/*
	 * If 'this_ctx' is set, we know we have entries to complete
	 * on 'ctx_list'. Do those.
	 */
	if (this_ctx) {
		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				       from_schedule);
	}
}

static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
	init_request_from_bio(rq, bio);
	blk_account_io_start(rq, 1);
}

static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	const int is_sync = rw_is_sync(bio->bi_rw);
	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
	int rw = bio_data_dir(bio);
	struct request *rq;
	unsigned int use_plug, request_count = 0;

	/*
	 * If we have multiple hardware queues, just go directly to
	 * one of those for sync IO.
	 */
	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);

	blk_queue_bounce(q, &bio);

	if (bio_integrity_enabled(bio) && bio_integrity_prep(bio)) {
		bio_endio(bio, -EIO);
		return;
	}

	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
		return;

	if (blk_mq_queue_enter(q)) {
		bio_endio(bio, -EIO);
		return;
	}

	ctx = blk_mq_get_ctx(q);
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	trace_block_getrq(q, bio, rw);
	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
	if (likely(rq))
		blk_mq_rq_ctx_init(q, ctx, rq, rw);
	else {
		blk_mq_put_ctx(ctx);
		trace_block_sleeprq(q, bio, rw);
		rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
						 false);
		ctx = rq->mq_ctx;
		hctx = q->mq_ops->map_queue(q, ctx->cpu);
	}

	hctx->queued++;

	if (unlikely(is_flush_fua)) {
		blk_mq_bio_to_request(rq, bio);
		blk_mq_put_ctx(ctx);
		blk_insert_flush(rq);
		goto run_queue;
	}

	/*
	 * A task plug currently exists. Since this is completely lockless,
	 * utilize that to temporarily store requests until the task is
	 * either done or scheduled away.
	 */
	if (use_plug) {
		struct blk_plug *plug = current->plug;

		if (plug) {
			blk_mq_bio_to_request(rq, bio);
			if (list_empty(&plug->mq_list))
				trace_block_plug(q);
			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
			list_add_tail(&rq->queuelist, &plug->mq_list);
			blk_mq_put_ctx(ctx);
			return;
		}
	}

	spin_lock(&ctx->lock);

	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
	    blk_mq_attempt_merge(q, ctx, bio))
		__blk_mq_free_request(hctx, ctx, rq);
	else {
		blk_mq_bio_to_request(rq, bio);
		__blk_mq_insert_request(hctx, rq, false);
	}

	spin_unlock(&ctx->lock);
	blk_mq_put_ctx(ctx);

	/*
	 * For a SYNC request, send it to the hardware immediately. For an
	 * ASYNC request, just ensure that we run it later on. The latter
	 * allows for merging opportunities and more efficient dispatching.
	 */
run_queue:
	blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
}

/*
 * Default mapping to a software queue, since we use one per CPU.
 */
struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
{
	return q->queue_hw_ctx[q->mq_map[cpu]];
}
EXPORT_SYMBOL(blk_mq_map_queue);

struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
						   unsigned int hctx_index)
{
	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
				GFP_KERNEL | __GFP_ZERO, reg->numa_node);
}
EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);

void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
				 unsigned int hctx_index)
{
	kfree(hctx);
}
EXPORT_SYMBOL(blk_mq_free_single_hw_queue);

static void blk_mq_hctx_notify(void *data, unsigned long action,
			       unsigned int cpu)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct blk_mq_ctx *ctx;
	LIST_HEAD(tmp);

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return;

	/*
	 * Move ctx entries to new CPU, if this one is going away.
	 */
	ctx = __blk_mq_get_ctx(hctx->queue, cpu);

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_list)) {
		list_splice_init(&ctx->rq_list, &tmp);
		clear_bit(ctx->index_hw, hctx->ctx_map);
	}
	spin_unlock(&ctx->lock);

	if (list_empty(&tmp))
		return;

	ctx = blk_mq_get_ctx(hctx->queue);
	spin_lock(&ctx->lock);

	while (!list_empty(&tmp)) {
		struct request *rq;

		rq = list_first_entry(&tmp, struct request, queuelist);
		rq->mq_ctx = ctx;
		list_move_tail(&rq->queuelist, &ctx->rq_list);
	}

	blk_mq_hctx_mark_pending(hctx, ctx);

	spin_unlock(&ctx->lock);
	blk_mq_put_ctx(ctx);
}

static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
				    void (*init)(void *, struct blk_mq_hw_ctx *,
						 struct request *, unsigned int),
				    void *data)
{
	unsigned int i;

	for (i = 0; i < hctx->queue_depth; i++) {
		struct request *rq = hctx->rqs[i];

		init(data, hctx, rq, i);
	}
}

void blk_mq_init_commands(struct request_queue *q,
			  void (*init)(void *, struct blk_mq_hw_ctx *,
					struct request *, unsigned int),
			  void *data)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_init_hw_commands(hctx, init, data);
}
EXPORT_SYMBOL(blk_mq_init_commands);

static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
{
	struct page *page;

	while (!list_empty(&hctx->page_list)) {
		page = list_first_entry(&hctx->page_list, struct page, lru);
		list_del_init(&page->lru);
		__free_pages(page, page->private);
	}

	kfree(hctx->rqs);

	if (hctx->tags)
		blk_mq_free_tags(hctx->tags);
}

static size_t order_to_size(unsigned int order)
{
	size_t ret = PAGE_SIZE;

	while (order--)
		ret *= 2;

	return ret;
}

static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
			      unsigned int reserved_tags, int node)
{
	unsigned int i, j, entries_per_page, max_order = 4;
	size_t rq_size, left;

	INIT_LIST_HEAD(&hctx->page_list);

	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
					GFP_KERNEL, node);
	if (!hctx->rqs)
		return -ENOMEM;

	/*
	 * rq_size is the size of the request plus driver payload, rounded
	 * to the cacheline size
	 */
	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
				cache_line_size());
	left = rq_size * hctx->queue_depth;

	for (i = 0; i < hctx->queue_depth;) {
		int this_order = max_order;
		struct page *page;
		int to_do;
		void *p;

		while (left < order_to_size(this_order - 1) && this_order)
			this_order--;

		do {
			page = alloc_pages_node(node, GFP_KERNEL, this_order);
			if (page)
				break;
			if (!this_order--)
				break;
			if (order_to_size(this_order) < rq_size)
				break;
		} while (1);

		if (!page)
			break;

		page->private = this_order;
		list_add_tail(&page->lru, &hctx->page_list);

		p = page_address(page);
		entries_per_page = order_to_size(this_order) / rq_size;
		to_do = min(entries_per_page, hctx->queue_depth - i);
		left -= to_do * rq_size;
		for (j = 0; j < to_do; j++) {
			hctx->rqs[i] = p;
			blk_mq_rq_init(hctx, hctx->rqs[i]);
			p += rq_size;
			i++;
		}
	}

	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
		goto err_rq_map;
	else if (i != hctx->queue_depth) {
		hctx->queue_depth = i;
		pr_warn("%s: queue depth set to %u because of low memory\n",
					__func__, i);
	}

	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
	if (!hctx->tags) {
err_rq_map:
		blk_mq_free_rq_map(hctx);
		return -ENOMEM;
	}

	return 0;
}

static int blk_mq_init_hw_queues(struct request_queue *q,
				 struct blk_mq_reg *reg, void *driver_data)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i, j;

	/*
	 * Initialize hardware queues
	 */
	queue_for_each_hw_ctx(q, hctx, i) {
		unsigned int num_maps;
		int node;

		node = hctx->numa_node;
		if (node == NUMA_NO_NODE)
			node = hctx->numa_node = reg->numa_node;

		INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
		spin_lock_init(&hctx->lock);
		INIT_LIST_HEAD(&hctx->dispatch);
		hctx->queue = q;
		hctx->queue_num = i;
		hctx->flags = reg->flags;
		hctx->queue_depth = reg->queue_depth;
		hctx->cmd_size = reg->cmd_size;

		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
						blk_mq_hctx_notify, hctx);
		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);

		if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
			break;

		/*
		 * Allocate space for all possible cpus to avoid allocation
		 * at runtime
		 */
		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
						GFP_KERNEL, node);
		if (!hctx->ctxs)
			break;

		num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
		hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
						GFP_KERNEL, node);
		if (!hctx->ctx_map)
			break;

		hctx->nr_ctx_map = num_maps;
		hctx->nr_ctx = 0;

		if (reg->ops->init_hctx &&
		    reg->ops->init_hctx(hctx, driver_data, i))
			break;
	}

	if (i == q->nr_hw_queues)
		return 0;

	/*
	 * Init failed
	 */
	queue_for_each_hw_ctx(q, hctx, j) {
		if (i == j)
			break;

		if (reg->ops->exit_hctx)
			reg->ops->exit_hctx(hctx, j);

		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		blk_mq_free_rq_map(hctx);
		kfree(hctx->ctxs);
	}

	return 1;
}

static void blk_mq_init_cpu_queues(struct request_queue *q,
				   unsigned int nr_hw_queues)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
		struct blk_mq_hw_ctx *hctx;

		memset(__ctx, 0, sizeof(*__ctx));
		__ctx->cpu = i;
		spin_lock_init(&__ctx->lock);
		INIT_LIST_HEAD(&__ctx->rq_list);
		__ctx->queue = q;

		/* If the cpu isn't online, the cpu is mapped to first hctx */
		hctx = q->mq_ops->map_queue(q, i);
		hctx->nr_ctx++;

		if (!cpu_online(i))
			continue;

		/*
		 * Set local node, IFF we have more than one hw queue. If
		 * not, we remain on the home node of the device
		 */
		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
			hctx->numa_node = cpu_to_node(i);
	}
}

static void blk_mq_map_swqueue(struct request_queue *q)
{
	unsigned int i;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;

	queue_for_each_hw_ctx(q, hctx, i) {
		hctx->nr_ctx = 0;
	}

	/*
	 * Map software to hardware queues
	 */
	queue_for_each_ctx(q, ctx, i) {
		/* If the cpu isn't online, the cpu is mapped to first hctx */
		hctx = q->mq_ops->map_queue(q, i);
		ctx->index_hw = hctx->nr_ctx;
		hctx->ctxs[hctx->nr_ctx++] = ctx;
	}
}

struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
					void *driver_data)
{
	struct blk_mq_hw_ctx **hctxs;
	struct blk_mq_ctx *ctx;
	struct request_queue *q;
	int i;

	if (!reg->nr_hw_queues ||
	    !reg->ops->queue_rq || !reg->ops->map_queue ||
	    !reg->ops->alloc_hctx || !reg->ops->free_hctx)
		return ERR_PTR(-EINVAL);

	if (!reg->queue_depth)
		reg->queue_depth = BLK_MQ_MAX_DEPTH;
	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_err("blk-mq: queue depth too large (%u)\n", reg->queue_depth);
		reg->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
		return ERR_PTR(-EINVAL);

	ctx = alloc_percpu(struct blk_mq_ctx);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
			reg->numa_node);

	if (!hctxs)
		goto err_percpu;

	for (i = 0; i < reg->nr_hw_queues; i++) {
		hctxs[i] = reg->ops->alloc_hctx(reg, i);
		if (!hctxs[i])
			goto err_hctxs;

		hctxs[i]->numa_node = NUMA_NO_NODE;
		hctxs[i]->queue_num = i;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
	if (!q)
		goto err_hctxs;

	q->mq_map = blk_mq_make_queue_map(reg);
	if (!q->mq_map)
		goto err_map;

	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
	blk_queue_rq_timeout(q, 30000);

	q->nr_queues = nr_cpu_ids;
	q->nr_hw_queues = reg->nr_hw_queues;

	q->queue_ctx = ctx;
	q->queue_hw_ctx = hctxs;

	q->mq_ops = reg->ops;
	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

	q->sg_reserved_size = INT_MAX;

	blk_queue_make_request(q, blk_mq_make_request);
	blk_queue_rq_timed_out(q, reg->ops->timeout);
	if (reg->timeout)
		blk_queue_rq_timeout(q, reg->timeout);

	if (reg->ops->complete)
		blk_queue_softirq_done(q, reg->ops->complete);

	blk_mq_init_flush(q);
	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);

	q->flush_rq = kzalloc(round_up(sizeof(struct request) + reg->cmd_size,
				cache_line_size()), GFP_KERNEL);
	if (!q->flush_rq)
		goto err_hw;

	if (blk_mq_init_hw_queues(q, reg, driver_data))
		goto err_flush_rq;

	blk_mq_map_swqueue(q);

	mutex_lock(&all_q_mutex);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return q;

err_flush_rq:
	kfree(q->flush_rq);
err_hw:
	kfree(q->mq_map);
err_map:
	blk_cleanup_queue(q);
err_hctxs:
	for (i = 0; i < reg->nr_hw_queues; i++) {
		if (!hctxs[i])
			break;
		reg->ops->free_hctx(hctxs[i], i);
	}
	kfree(hctxs);
err_percpu:
	free_percpu(ctx);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_queue);

void blk_mq_free_queue(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		kfree(hctx->ctx_map);
		kfree(hctx->ctxs);
		blk_mq_free_rq_map(hctx);
		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		if (q->mq_ops->exit_hctx)
			q->mq_ops->exit_hctx(hctx, i);
		q->mq_ops->free_hctx(hctx, i);
	}

	free_percpu(q->queue_ctx);
	kfree(q->queue_hw_ctx);
	kfree(q->mq_map);

	q->queue_ctx = NULL;
	q->queue_hw_ctx = NULL;
	q->mq_map = NULL;

	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);
}

/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q)
{
	blk_mq_freeze_queue(q);

	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);

	/*
	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
	 * we should change hctx numa_node according to the new topology (this
	 * involves freeing and re-allocating memory, worth doing?)
	 */

	blk_mq_map_swqueue(q);

	blk_mq_unfreeze_queue(q);
}

static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
				      unsigned long action, void *hcpu)
{
	struct request_queue *q;

	/*
	 * Before the new mapping is established, a hotadded cpu might already
	 * have started handling requests. This doesn't break anything as we
	 * map offline CPUs to the first hardware queue. We will re-init the
	 * queue below to get optimal settings.
	 */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
		return NOTIFY_OK;

	mutex_lock(&all_q_mutex);
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_queue_reinit(q);
	mutex_unlock(&all_q_mutex);
	return NOTIFY_OK;
}

static int __init blk_mq_init(void)
{
	blk_mq_cpu_init();

	/* Must be called after percpu_counter_hotcpu_callback() */
	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);

	return 0;
}
subsys_initcall(blk_mq_init);
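
/*
 * Illustrative sketch (not part of this file): roughly how a driver could
 * register a single hardware queue through the interface above, using only
 * the callbacks that blk_mq_init_queue() requires. The example_queue_rq()
 * handler, struct example_cmd and example_driver_data are hypothetical
 * driver-side names.
 *
 *	static struct blk_mq_ops example_mq_ops = {
 *		.queue_rq	= example_queue_rq,
 *		.map_queue	= blk_mq_map_queue,
 *		.alloc_hctx	= blk_mq_alloc_single_hw_queue,
 *		.free_hctx	= blk_mq_free_single_hw_queue,
 *	};
 *
 *	static struct blk_mq_reg example_mq_reg = {
 *		.ops		= &example_mq_ops,
 *		.nr_hw_queues	= 1,
 *		.queue_depth	= 64,
 *		.cmd_size	= sizeof(struct example_cmd),
 *		.numa_node	= NUMA_NO_NODE,
 *		.flags		= BLK_MQ_F_SHOULD_MERGE,
 *	};
 *
 *	q = blk_mq_init_queue(&example_mq_reg, example_driver_data);
 *	if (IS_ERR(q))
 *		return PTR_ERR(q);
 */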