#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/backing-dev.h>
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/mm.h>
#include <linux/init.h>
#include <linux/slab.h>
#include <linux/workqueue.h>
#include <linux/smp.h>
#include <linux/llist.h>
#include <linux/list_sort.h>
#include <linux/cpu.h>
#include <linux/cache.h>
#include <linux/sched/sysctl.h>
#include <linux/delay.h>

#include <trace/events/block.h>

#include <linux/blk-mq.h>
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-tag.h"

static DEFINE_MUTEX(all_q_mutex);
static LIST_HEAD(all_q_list);

static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx);

DEFINE_PER_CPU(struct llist_head, ipi_lists);

static struct blk_mq_ctx *__blk_mq_get_ctx(struct request_queue *q,
					   unsigned int cpu)
{
	return per_cpu_ptr(q->queue_ctx, cpu);
}

/*
 * This assumes per-cpu software queues. They could be per-node
 * as well, for instance. For now this is hardcoded as-is. Note that we don't
 * care about preemption, since we know the ctx's are persistent. This does
 * mean that we can't rely on ctx always matching the currently running CPU.
 */
static struct blk_mq_ctx *blk_mq_get_ctx(struct request_queue *q)
{
	return __blk_mq_get_ctx(q, get_cpu());
}

static void blk_mq_put_ctx(struct blk_mq_ctx *ctx)
{
	put_cpu();
}

/*
 * Check if any of the ctx's have pending work in this hardware queue
 */
static bool blk_mq_hctx_has_pending(struct blk_mq_hw_ctx *hctx)
{
	unsigned int i;

	for (i = 0; i < hctx->nr_ctx_map; i++)
		if (hctx->ctx_map[i])
			return true;

	return false;
}

/*
 * Mark this ctx as having pending work in this hardware queue
 */
static void blk_mq_hctx_mark_pending(struct blk_mq_hw_ctx *hctx,
				     struct blk_mq_ctx *ctx)
{
	if (!test_bit(ctx->index_hw, hctx->ctx_map))
		set_bit(ctx->index_hw, hctx->ctx_map);
}

static struct request *blk_mq_alloc_rq(struct blk_mq_hw_ctx *hctx, gfp_t gfp,
				       bool reserved)
{
	struct request *rq;
	unsigned int tag;

	tag = blk_mq_get_tag(hctx->tags, gfp, reserved);
	if (tag != BLK_MQ_TAG_FAIL) {
		rq = hctx->rqs[tag];
		rq->tag = tag;

		return rq;
	}

	return NULL;
}

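/*
 * Take a reference on the queue for a new request. The per-cpu usage
 * counter is bumped optimistically; if the queue has been put into bypass
 * (and has finished initializing), the reference is dropped again and we
 * sleep until the queue is unfrozen.
 */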
static int blk_mq_queue_enter(struct request_queue *q)
{
	int ret;

	__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
	smp_wmb();
	/* we have problems freezing the queue if it's still initializing */
	if (!blk_queue_bypass(q) || !blk_queue_init_done(q))
		return 0;

	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);

	spin_lock_irq(q->queue_lock);
	ret = wait_event_interruptible_lock_irq(q->mq_freeze_wq,
		!blk_queue_bypass(q), *q->queue_lock);
	/* inc usage with the lock held so freeze_queue can't run here */
	if (!ret)
		__percpu_counter_add(&q->mq_usage_counter, 1, 1000000);
	spin_unlock_irq(q->queue_lock);

	return ret;
}

static void blk_mq_queue_exit(struct request_queue *q)
{
	__percpu_counter_add(&q->mq_usage_counter, -1, 1000000);
}

/*
 * Guarantee no request is in use, so we can change any data structure of
 * the queue afterward.
 */
static void blk_mq_freeze_queue(struct request_queue *q)
{
	bool drain;

	spin_lock_irq(q->queue_lock);
	drain = !q->bypass_depth++;
	queue_flag_set(QUEUE_FLAG_BYPASS, q);
	spin_unlock_irq(q->queue_lock);

	if (!drain)
		return;

	while (true) {
		s64 count;

		spin_lock_irq(q->queue_lock);
		count = percpu_counter_sum(&q->mq_usage_counter);
		spin_unlock_irq(q->queue_lock);

		if (count == 0)
			break;
		blk_mq_run_queues(q, false);
		msleep(10);
	}
}

static void blk_mq_unfreeze_queue(struct request_queue *q)
{
	bool wake = false;

	spin_lock_irq(q->queue_lock);
	if (!--q->bypass_depth) {
		queue_flag_clear(QUEUE_FLAG_BYPASS, q);
		wake = true;
	}
	WARN_ON_ONCE(q->bypass_depth < 0);
	spin_unlock_irq(q->queue_lock);
	if (wake)
		wake_up_all(&q->mq_freeze_wq);
}

bool blk_mq_can_queue(struct blk_mq_hw_ctx *hctx)
{
	return blk_mq_has_free_tags(hctx->tags);
}
EXPORT_SYMBOL(blk_mq_can_queue);

static void blk_mq_rq_ctx_init(struct request_queue *q, struct blk_mq_ctx *ctx,
			       struct request *rq, unsigned int rw_flags)
{
	if (blk_queue_io_stat(q))
		rw_flags |= REQ_IO_STAT;

	rq->mq_ctx = ctx;
	rq->cmd_flags = rw_flags;
	ctx->rq_dispatched[rw_is_sync(rw_flags)]++;
}

static struct request *__blk_mq_alloc_request(struct blk_mq_hw_ctx *hctx,
					      gfp_t gfp, bool reserved)
{
	return blk_mq_alloc_rq(hctx, gfp, reserved);
}

static struct request *blk_mq_alloc_request_pinned(struct request_queue *q,
						   int rw, gfp_t gfp,
						   bool reserved)
{
	struct request *rq;

	do {
		struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
		struct blk_mq_hw_ctx *hctx = q->mq_ops->map_queue(q, ctx->cpu);

		rq = __blk_mq_alloc_request(hctx, gfp & ~__GFP_WAIT, reserved);
		if (rq) {
			blk_mq_rq_ctx_init(q, ctx, rq, rw);
			break;
		}

		blk_mq_put_ctx(ctx);
		if (!(gfp & __GFP_WAIT))
			break;

		__blk_mq_run_hw_queue(hctx);
		blk_mq_wait_for_tags(hctx->tags);
	} while (1);

	return rq;
}

struct request *blk_mq_alloc_request(struct request_queue *q, int rw,
				     gfp_t gfp, bool reserved)
{
	struct request *rq;

	if (blk_mq_queue_enter(q))
		return NULL;

	rq = blk_mq_alloc_request_pinned(q, rw, gfp, reserved);
	if (rq)
		blk_mq_put_ctx(rq->mq_ctx);
	return rq;
}

struct request *blk_mq_alloc_reserved_request(struct request_queue *q, int rw,
					      gfp_t gfp)
{
	struct request *rq;

	if (blk_mq_queue_enter(q))
		return NULL;

	rq = blk_mq_alloc_request_pinned(q, rw, gfp, true);
	if (rq)
		blk_mq_put_ctx(rq->mq_ctx);
	return rq;
}
EXPORT_SYMBOL(blk_mq_alloc_reserved_request);

/*
 * Re-init and set pdu, if we have it
 */
static void blk_mq_rq_init(struct blk_mq_hw_ctx *hctx, struct request *rq)
{
	blk_rq_init(hctx->queue, rq);

	if (hctx->cmd_size)
		rq->special = blk_mq_rq_to_pdu(rq);
}

static void __blk_mq_free_request(struct blk_mq_hw_ctx *hctx,
				  struct blk_mq_ctx *ctx, struct request *rq)
{
	const int tag = rq->tag;
	struct request_queue *q = rq->q;

	blk_mq_rq_init(hctx, rq);
	blk_mq_put_tag(hctx->tags, tag);

	blk_mq_queue_exit(q);
}

void blk_mq_free_request(struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx;
	struct request_queue *q = rq->q;

	ctx->rq_completed[rq_is_sync(rq)]++;

	hctx = q->mq_ops->map_queue(q, ctx->cpu);
	__blk_mq_free_request(hctx, ctx, rq);
}

static void blk_mq_bio_endio(struct request *rq, struct bio *bio, int error)
{
	if (error)
		clear_bit(BIO_UPTODATE, &bio->bi_flags);
	else if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		error = -EIO;

	if (unlikely(rq->cmd_flags & REQ_QUIET))
		set_bit(BIO_QUIET, &bio->bi_flags);

	/* don't actually finish bio if it's part of flush sequence */
	if (!(rq->cmd_flags & REQ_FLUSH_SEQ))
		bio_endio(bio, error);
}

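/*
 * Complete all bios attached to the request and account the I/O, then
 * either invoke the request's ->end_io handler or return it to the tag map.
 */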
void blk_mq_complete_request(struct request *rq, int error)
{
	struct bio *bio = rq->bio;
	unsigned int bytes = 0;

	trace_block_rq_complete(rq->q, rq);

	while (bio) {
		struct bio *next = bio->bi_next;

		bio->bi_next = NULL;
		bytes += bio->bi_size;
		blk_mq_bio_endio(rq, bio, error);
		bio = next;
	}

	blk_account_io_completion(rq, bytes);

	blk_account_io_done(rq);

	if (rq->end_io)
		rq->end_io(rq, error);
	else
		blk_mq_free_request(rq);
}

void __blk_mq_end_io(struct request *rq, int error)
{
	if (!blk_mark_rq_complete(rq))
		blk_mq_complete_request(rq, error);
}

#if defined(CONFIG_SMP)

/*
 * Called with interrupts disabled.
 */
static void ipi_end_io(void *data)
{
	struct llist_head *list = &per_cpu(ipi_lists, smp_processor_id());
	struct llist_node *entry, *next;
	struct request *rq;

	entry = llist_del_all(list);

	while (entry) {
		next = entry->next;
		rq = llist_entry(entry, struct request, ll_list);
		__blk_mq_end_io(rq, rq->errors);
		entry = next;
	}
}

static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
			  struct request *rq, const int error)
{
	struct call_single_data *data = &rq->csd;

	rq->errors = error;
	rq->ll_list.next = NULL;

	/*
	 * If the list is non-empty, an existing IPI must already
	 * be "in flight". If that is the case, we need not schedule
	 * a new one.
	 */
	if (llist_add(&rq->ll_list, &per_cpu(ipi_lists, ctx->cpu))) {
		data->func = ipi_end_io;
		data->flags = 0;
		__smp_call_function_single(ctx->cpu, data, 0);
	}

	return true;
}
#else /* CONFIG_SMP */
static int ipi_remote_cpu(struct blk_mq_ctx *ctx, const int cpu,
			  struct request *rq, const int error)
{
	return false;
}
#endif

/*
 * End IO on this request on a multiqueue enabled driver. We'll either do
 * it directly inline, or punt to a local IPI handler on the matching
 * remote CPU.
 */
void blk_mq_end_io(struct request *rq, int error)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	int cpu;

	if (!ctx->ipi_redirect)
		return __blk_mq_end_io(rq, error);

	cpu = get_cpu();

	if (cpu == ctx->cpu || !cpu_online(ctx->cpu) ||
	    !ipi_remote_cpu(ctx, cpu, rq, error))
		__blk_mq_end_io(rq, error);

	put_cpu();
}
EXPORT_SYMBOL(blk_mq_end_io);

static void blk_mq_start_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_issue(q, rq);

	/*
	 * Just mark start time and set the started bit. Due to memory
	 * ordering, we know we'll see the correct deadline as long as
	 * REQ_ATOM_STARTED is seen.
	 */
	rq->deadline = jiffies + q->rq_timeout;
	set_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
}

static void blk_mq_requeue_request(struct request *rq)
{
	struct request_queue *q = rq->q;

	trace_block_rq_requeue(q, rq);
	clear_bit(REQ_ATOM_STARTED, &rq->atomic_flags);
}

/*
 * Request timeout handling: the queue timer walks every hardware queue and
 * has the tag code iterate over all busy tags, checking each started
 * request against its deadline.
 */
struct blk_mq_timeout_data {
	struct blk_mq_hw_ctx *hctx;
	unsigned long *next;
	unsigned int *next_set;
};

static void blk_mq_timeout_check(void *__data, unsigned long *free_tags)
{
	struct blk_mq_timeout_data *data = __data;
	struct blk_mq_hw_ctx *hctx = data->hctx;
	unsigned int tag;

	/*
	 * It may not be in flight yet (this is where the REQ_ATOM_STARTED
	 * flag comes in). The requests are statically allocated, so we know
	 * it's always safe to access the memory associated with a bit offset
	 * into ->rqs[].
	 */
	tag = 0;
	do {
		struct request *rq;

		tag = find_next_zero_bit(free_tags, hctx->queue_depth, tag);
		if (tag >= hctx->queue_depth)
			break;

		rq = hctx->rqs[tag++];

		if (!test_bit(REQ_ATOM_STARTED, &rq->atomic_flags))
			continue;

		blk_rq_check_expired(rq, data->next, data->next_set);
	} while (1);
}

static void blk_mq_hw_ctx_check_timeout(struct blk_mq_hw_ctx *hctx,
					unsigned long *next,
					unsigned int *next_set)
{
	struct blk_mq_timeout_data data = {
		.hctx		= hctx,
		.next		= next,
		.next_set	= next_set,
	};

	/*
	 * Ask the tagging code to iterate busy requests, so we can
	 * check them for timeout.
	 */
	blk_mq_tag_busy_iter(hctx->tags, blk_mq_timeout_check, &data);
}

static void blk_mq_rq_timer(unsigned long data)
{
	struct request_queue *q = (struct request_queue *) data;
	struct blk_mq_hw_ctx *hctx;
	unsigned long next = 0;
	int i, next_set = 0;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_hw_ctx_check_timeout(hctx, &next, &next_set);

	if (next_set)
		mod_timer(&q->timeout, round_jiffies_up(next));
}

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		int el_ret;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		el_ret = blk_try_merge(rq, bio);
		if (el_ret == ELEVATOR_BACK_MERGE) {
			if (bio_attempt_back_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		} else if (el_ret == ELEVATOR_FRONT_MERGE) {
			if (bio_attempt_front_merge(q, rq, bio)) {
				ctx->rq_merged++;
				return true;
			}
			break;
		}
	}

	return false;
}

void blk_mq_add_timer(struct request *rq)
{
	__blk_add_timer(rq, NULL);
}

/*
 * Run this hardware queue, pulling any software queues mapped to it in.
 * Note that this function currently has various problems around ordering
 * of IO. In particular, we'd like FIFO behaviour on handling existing
 * items on the hctx->dispatch list. Ignore that for now.
 */
static void __blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct blk_mq_ctx *ctx;
	struct request *rq;
	LIST_HEAD(rq_list);
	int bit, queued;

	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	hctx->run++;

	/*
	 * Touch any software queue that has pending entries.
	 */
	for_each_set_bit(bit, hctx->ctx_map, hctx->nr_ctx) {
		clear_bit(bit, hctx->ctx_map);
		ctx = hctx->ctxs[bit];
		BUG_ON(bit != ctx->index_hw);

		spin_lock(&ctx->lock);
		list_splice_tail_init(&ctx->rq_list, &rq_list);
		spin_unlock(&ctx->lock);
	}

	/*
	 * If we have previous entries on our dispatch list, grab them
	 * and stuff them at the front for more fair dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Delete and return all entries from our dispatch list
	 */
	queued = 0;

	/*
	 * Now process all the entries, sending them to the driver.
	 */
	while (!list_empty(&rq_list)) {
		int ret;

		rq = list_first_entry(&rq_list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		blk_mq_start_request(rq);

		/*
		 * Last request in the series. Flag it as such, this
		 * enables drivers to know when IO should be kicked off,
		 * if they don't do it on a per-request basis.
		 *
		 * Note: the flag isn't the only condition on which drivers
		 * should kick off IO. If the driver is busy, the last
		 * request might not have the bit set.
		 */
		if (list_empty(&rq_list))
			rq->cmd_flags |= REQ_END;

		ret = q->mq_ops->queue_rq(hctx, rq);
		switch (ret) {
		case BLK_MQ_RQ_QUEUE_OK:
			queued++;
			continue;
		case BLK_MQ_RQ_QUEUE_BUSY:
			/*
			 * FIXME: we should have a mechanism to stop the queue
			 * like blk_stop_queue, otherwise we will waste cpu
			 * time
			 */
			list_add(&rq->queuelist, &rq_list);
			blk_mq_requeue_request(rq);
			break;
		default:
			pr_err("blk-mq: bad return on queue: %d\n", ret);
			rq->errors = -EIO;
		case BLK_MQ_RQ_QUEUE_ERROR:
			blk_mq_end_io(rq, rq->errors);
			break;
		}

		if (ret == BLK_MQ_RQ_QUEUE_BUSY)
			break;
	}

	if (!queued)
		hctx->dispatched[0]++;
	else if (queued < (1 << (BLK_MQ_MAX_DISPATCH_ORDER - 1)))
		hctx->dispatched[ilog2(queued) + 1]++;

	/*
	 * Any items that need requeuing? Stuff them into hctx->dispatch,
	 * that is where we will continue on next queue run.
	 */
	if (!list_empty(&rq_list)) {
		spin_lock(&hctx->lock);
		list_splice(&rq_list, &hctx->dispatch);
		spin_unlock(&hctx->lock);
	}
}

void blk_mq_run_hw_queue(struct blk_mq_hw_ctx *hctx, bool async)
{
	if (unlikely(test_bit(BLK_MQ_S_STOPPED, &hctx->state)))
		return;

	if (!async)
		__blk_mq_run_hw_queue(hctx);
	else {
		struct request_queue *q = hctx->queue;

		kblockd_schedule_delayed_work(q, &hctx->delayed_work, 0);
	}
}

void blk_mq_run_queues(struct request_queue *q, bool async)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if ((!blk_mq_hctx_has_pending(hctx) &&
		    list_empty_careful(&hctx->dispatch)) ||
		    test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		blk_mq_run_hw_queue(hctx, async);
	}
}
EXPORT_SYMBOL(blk_mq_run_queues);

void blk_mq_stop_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	cancel_delayed_work(&hctx->delayed_work);
	set_bit(BLK_MQ_S_STOPPED, &hctx->state);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queue);

void blk_mq_stop_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_stop_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_stop_hw_queues);

void blk_mq_start_hw_queue(struct blk_mq_hw_ctx *hctx)
{
	clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
	__blk_mq_run_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_start_hw_queue);

void blk_mq_start_stopped_hw_queues(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (!test_bit(BLK_MQ_S_STOPPED, &hctx->state))
			continue;

		clear_bit(BLK_MQ_S_STOPPED, &hctx->state);
		blk_mq_run_hw_queue(hctx, true);
	}
}
EXPORT_SYMBOL(blk_mq_start_stopped_hw_queues);

static void blk_mq_work_fn(struct work_struct *work)
{
	struct blk_mq_hw_ctx *hctx;

	hctx = container_of(work, struct blk_mq_hw_ctx, delayed_work.work);
	__blk_mq_run_hw_queue(hctx);
}

/*
 * Add a request to a software queue and mark that queue as pending in the
 * hardware queue's ctx map. The caller must hold ctx->lock.
 */
static void __blk_mq_insert_request(struct blk_mq_hw_ctx *hctx,
				    struct request *rq)
{
	struct blk_mq_ctx *ctx = rq->mq_ctx;

	trace_block_rq_insert(hctx->queue, rq);

	list_add_tail(&rq->queuelist, &ctx->rq_list);
	blk_mq_hctx_mark_pending(hctx, ctx);

	/*
	 * We do this early, to ensure we are on the right CPU.
	 */
	blk_mq_add_timer(rq);
}

void blk_mq_insert_request(struct request_queue *q, struct request *rq,
			   bool run_queue)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx, *current_ctx;

	ctx = rq->mq_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	if (rq->cmd_flags & (REQ_FLUSH | REQ_FUA)) {
		blk_insert_flush(rq);
	} else {
		current_ctx = blk_mq_get_ctx(q);

		if (!cpu_online(ctx->cpu)) {
			ctx = current_ctx;
			hctx = q->mq_ops->map_queue(q, ctx->cpu);
			rq->mq_ctx = ctx;
		}
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq);
		spin_unlock(&ctx->lock);

		blk_mq_put_ctx(current_ctx);
	}

	if (run_queue)
		__blk_mq_run_hw_queue(hctx);
}
EXPORT_SYMBOL(blk_mq_insert_request);

/*
 * This is a special version of blk_mq_insert_request to bypass FLUSH request
 * check. Should only be used internally.
 */
void blk_mq_run_request(struct request *rq, bool run_queue, bool async)
{
	struct request_queue *q = rq->q;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx, *current_ctx;

	current_ctx = blk_mq_get_ctx(q);

	ctx = rq->mq_ctx;
	if (!cpu_online(ctx->cpu)) {
		ctx = current_ctx;
		rq->mq_ctx = ctx;
	}
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/* ctx->cpu might be offline */
	spin_lock(&ctx->lock);
	__blk_mq_insert_request(hctx, rq);
	spin_unlock(&ctx->lock);

	blk_mq_put_ctx(current_ctx);

	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}

static void blk_mq_insert_requests(struct request_queue *q,
				   struct blk_mq_ctx *ctx,
				   struct list_head *list,
				   int depth,
				   bool from_schedule)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *current_ctx;

	trace_block_unplug(q, depth, !from_schedule);

	current_ctx = blk_mq_get_ctx(q);

	if (!cpu_online(ctx->cpu))
		ctx = current_ctx;
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	/*
	 * preemption doesn't flush plug list, so it's possible ctx->cpu is
	 * offline now
	 */
	spin_lock(&ctx->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		rq->mq_ctx = ctx;
		__blk_mq_insert_request(hctx, rq);
	}
	spin_unlock(&ctx->lock);

	blk_mq_put_ctx(current_ctx);

	blk_mq_run_hw_queue(hctx, from_schedule);
}

static int plug_ctx_cmp(void *priv, struct list_head *a, struct list_head *b)
{
	struct request *rqa = container_of(a, struct request, queuelist);
	struct request *rqb = container_of(b, struct request, queuelist);

	return !(rqa->mq_ctx < rqb->mq_ctx ||
		 (rqa->mq_ctx == rqb->mq_ctx &&
		  blk_rq_pos(rqa) < blk_rq_pos(rqb)));
}

/*
 * Flush the requests plugged by the current task: sort them by software
 * queue (and sector within a queue), then insert each run of requests that
 * shares a ctx in one batch, taking the ctx lock only once per run.
 */
void blk_mq_flush_plug_list(struct blk_plug *plug, bool from_schedule)
{
	struct blk_mq_ctx *this_ctx;
	struct request_queue *this_q;
	struct request *rq;
	LIST_HEAD(list);
	LIST_HEAD(ctx_list);
	unsigned int depth;

	list_splice_init(&plug->mq_list, &list);

	list_sort(NULL, &list, plug_ctx_cmp);

	this_q = NULL;
	this_ctx = NULL;
	depth = 0;

	while (!list_empty(&list)) {
		rq = list_entry_rq(list.next);
		list_del_init(&rq->queuelist);
		BUG_ON(!rq->q);
		if (rq->mq_ctx != this_ctx) {
			if (this_ctx) {
				blk_mq_insert_requests(this_q, this_ctx,
							&ctx_list, depth,
							from_schedule);
			}

			this_ctx = rq->mq_ctx;
			this_q = rq->q;
			depth = 0;
		}

		depth++;
		list_add_tail(&rq->queuelist, &ctx_list);
	}

	/*
	 * If 'this_ctx' is set, we know we have entries to complete
	 * on 'ctx_list'. Do those.
	 */
	if (this_ctx) {
		blk_mq_insert_requests(this_q, this_ctx, &ctx_list, depth,
				       from_schedule);
	}
}

static void blk_mq_bio_to_request(struct request *rq, struct bio *bio)
{
	init_request_from_bio(rq, bio);
	blk_account_io_start(rq, 1);
}

/*
 * Bio submission path. Allocate a request (sleeping for a free tag if
 * necessary), then either stash it on the task plug, merge it with a
 * pending request in the software queue, or insert it and kick the
 * hardware queue.
 */
static void blk_mq_make_request(struct request_queue *q, struct bio *bio)
{
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;
	const int is_sync = rw_is_sync(bio->bi_rw);
	const int is_flush_fua = bio->bi_rw & (REQ_FLUSH | REQ_FUA);
	int rw = bio_data_dir(bio);
	struct request *rq;
	unsigned int use_plug, request_count = 0;

	/*
	 * If we have multiple hardware queues, just go directly to
	 * one of those for sync IO.
	 */
	use_plug = !is_flush_fua && ((q->nr_hw_queues == 1) || !is_sync);

	blk_queue_bounce(q, &bio);

	if (use_plug && blk_attempt_plug_merge(q, bio, &request_count))
		return;

	if (blk_mq_queue_enter(q)) {
		bio_endio(bio, -EIO);
		return;
	}

	ctx = blk_mq_get_ctx(q);
	hctx = q->mq_ops->map_queue(q, ctx->cpu);

	trace_block_getrq(q, bio, rw);
	rq = __blk_mq_alloc_request(hctx, GFP_ATOMIC, false);
	if (likely(rq))
		blk_mq_rq_ctx_init(q, ctx, rq, rw);
	else {
		blk_mq_put_ctx(ctx);
		trace_block_sleeprq(q, bio, rw);
		rq = blk_mq_alloc_request_pinned(q, rw, __GFP_WAIT|GFP_ATOMIC,
						 false);
		ctx = rq->mq_ctx;
		hctx = q->mq_ops->map_queue(q, ctx->cpu);
	}

	hctx->queued++;

	if (unlikely(is_flush_fua)) {
		blk_mq_bio_to_request(rq, bio);
		blk_mq_put_ctx(ctx);
		blk_insert_flush(rq);
		goto run_queue;
	}

	/*
	 * A task plug currently exists. Since this is completely lockless,
	 * utilize that to temporarily store requests until the task is
	 * either done or scheduled away.
	 */
	if (use_plug) {
		struct blk_plug *plug = current->plug;

		if (plug) {
			blk_mq_bio_to_request(rq, bio);
			if (list_empty(&plug->mq_list))
				trace_block_plug(q);
			else if (request_count >= BLK_MAX_REQUEST_COUNT) {
				blk_flush_plug_list(plug, false);
				trace_block_plug(q);
			}
			list_add_tail(&rq->queuelist, &plug->mq_list);
			blk_mq_put_ctx(ctx);
			return;
		}
	}

	spin_lock(&ctx->lock);

	if ((hctx->flags & BLK_MQ_F_SHOULD_MERGE) &&
	    blk_mq_attempt_merge(q, ctx, bio))
		__blk_mq_free_request(hctx, ctx, rq);
	else {
		blk_mq_bio_to_request(rq, bio);
		__blk_mq_insert_request(hctx, rq);
	}

	spin_unlock(&ctx->lock);
	blk_mq_put_ctx(ctx);

	/*
	 * For a SYNC request, send it to the hardware immediately. For an
	 * ASYNC request, just ensure that we run it later on. The latter
	 * allows for merging opportunities and more efficient dispatching.
	 */
run_queue:
	blk_mq_run_hw_queue(hctx, !is_sync || is_flush_fua);
}

/*
 * Default mapping to a software queue, since we use one per CPU.
 */
struct blk_mq_hw_ctx *blk_mq_map_queue(struct request_queue *q, const int cpu)
{
	return q->queue_hw_ctx[q->mq_map[cpu]];
}
EXPORT_SYMBOL(blk_mq_map_queue);

struct blk_mq_hw_ctx *blk_mq_alloc_single_hw_queue(struct blk_mq_reg *reg,
						   unsigned int hctx_index)
{
	return kmalloc_node(sizeof(struct blk_mq_hw_ctx),
				GFP_KERNEL | __GFP_ZERO, reg->numa_node);
}
EXPORT_SYMBOL(blk_mq_alloc_single_hw_queue);

void blk_mq_free_single_hw_queue(struct blk_mq_hw_ctx *hctx,
				 unsigned int hctx_index)
{
	kfree(hctx);
}
EXPORT_SYMBOL(blk_mq_free_single_hw_queue);

static void blk_mq_hctx_notify(void *data, unsigned long action,
			       unsigned int cpu)
{
	struct blk_mq_hw_ctx *hctx = data;
	struct blk_mq_ctx *ctx;
	LIST_HEAD(tmp);

	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN)
		return;

	/*
	 * Move ctx entries to new CPU, if this one is going away.
	 */
	ctx = __blk_mq_get_ctx(hctx->queue, cpu);

	spin_lock(&ctx->lock);
	if (!list_empty(&ctx->rq_list)) {
		list_splice_init(&ctx->rq_list, &tmp);
		clear_bit(ctx->index_hw, hctx->ctx_map);
	}
	spin_unlock(&ctx->lock);

	if (list_empty(&tmp))
		return;

	ctx = blk_mq_get_ctx(hctx->queue);
	spin_lock(&ctx->lock);

	while (!list_empty(&tmp)) {
		struct request *rq;

		rq = list_first_entry(&tmp, struct request, queuelist);
		rq->mq_ctx = ctx;
		list_move_tail(&rq->queuelist, &ctx->rq_list);
	}

	blk_mq_hctx_mark_pending(hctx, ctx);

	spin_unlock(&ctx->lock);
	blk_mq_put_ctx(ctx);
}

static void blk_mq_init_hw_commands(struct blk_mq_hw_ctx *hctx,
				    void (*init)(void *, struct blk_mq_hw_ctx *,
						 struct request *, unsigned int),
				    void *data)
{
	unsigned int i;

	for (i = 0; i < hctx->queue_depth; i++) {
		struct request *rq = hctx->rqs[i];

		init(data, hctx, rq, i);
	}
}

void blk_mq_init_commands(struct request_queue *q,
			  void (*init)(void *, struct blk_mq_hw_ctx *,
				       struct request *, unsigned int),
			  void *data)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_init_hw_commands(hctx, init, data);
}
EXPORT_SYMBOL(blk_mq_init_commands);

static void blk_mq_free_rq_map(struct blk_mq_hw_ctx *hctx)
{
	struct page *page;

	while (!list_empty(&hctx->page_list)) {
		page = list_first_entry(&hctx->page_list, struct page, list);
		list_del_init(&page->list);
		__free_pages(page, page->private);
	}

	kfree(hctx->rqs);

	if (hctx->tags)
		blk_mq_free_tags(hctx->tags);
}

static size_t order_to_size(unsigned int order)
{
	size_t ret = PAGE_SIZE;

	while (order--)
		ret *= 2;

	return ret;
}

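/*
 * Allocate the static request entries for one hardware queue. Requests are
 * carved out of bulk page allocations: start at a high allocation order and
 * fall back to smaller orders if memory is fragmented, shrinking the queue
 * depth if the full complement can't be allocated.
 */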
static int blk_mq_init_rq_map(struct blk_mq_hw_ctx *hctx,
			      unsigned int reserved_tags, int node)
{
	unsigned int i, j, entries_per_page, max_order = 4;
	size_t rq_size, left;

	INIT_LIST_HEAD(&hctx->page_list);

	hctx->rqs = kmalloc_node(hctx->queue_depth * sizeof(struct request *),
					GFP_KERNEL, node);
	if (!hctx->rqs)
		return -ENOMEM;

	/*
	 * rq_size is the size of the request plus driver payload, rounded
	 * to the cacheline size
	 */
	rq_size = round_up(sizeof(struct request) + hctx->cmd_size,
				cache_line_size());
	left = rq_size * hctx->queue_depth;

	for (i = 0; i < hctx->queue_depth;) {
		int this_order = max_order;
		struct page *page;
		int to_do;
		void *p;

		while (left < order_to_size(this_order - 1) && this_order)
			this_order--;

		do {
			page = alloc_pages_node(node, GFP_KERNEL, this_order);
			if (page)
				break;
			if (!this_order--)
				break;
			if (order_to_size(this_order) < rq_size)
				break;
		} while (1);

		if (!page)
			break;

		page->private = this_order;
		list_add_tail(&page->list, &hctx->page_list);

		p = page_address(page);
		entries_per_page = order_to_size(this_order) / rq_size;
		to_do = min(entries_per_page, hctx->queue_depth - i);
		left -= to_do * rq_size;
		for (j = 0; j < to_do; j++) {
			hctx->rqs[i] = p;
			blk_mq_rq_init(hctx, hctx->rqs[i]);
			p += rq_size;
			i++;
		}
	}

	if (i < (reserved_tags + BLK_MQ_TAG_MIN))
		goto err_rq_map;
	else if (i != hctx->queue_depth) {
		hctx->queue_depth = i;
		pr_warn("%s: queue depth set to %u because of low memory\n",
					__func__, i);
	}

	hctx->tags = blk_mq_init_tags(hctx->queue_depth, reserved_tags, node);
	if (!hctx->tags) {
err_rq_map:
		blk_mq_free_rq_map(hctx);
		return -ENOMEM;
	}

	return 0;
}

static int blk_mq_init_hw_queues(struct request_queue *q,
				 struct blk_mq_reg *reg, void *driver_data)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i, j;

	/*
	 * Initialize hardware queues
	 */
	queue_for_each_hw_ctx(q, hctx, i) {
		unsigned int num_maps;
		int node;

		node = hctx->numa_node;
		if (node == NUMA_NO_NODE)
			node = hctx->numa_node = reg->numa_node;

		INIT_DELAYED_WORK(&hctx->delayed_work, blk_mq_work_fn);
		spin_lock_init(&hctx->lock);
		INIT_LIST_HEAD(&hctx->dispatch);
		hctx->queue = q;
		hctx->queue_num = i;
		hctx->flags = reg->flags;
		hctx->queue_depth = reg->queue_depth;
		hctx->cmd_size = reg->cmd_size;

		blk_mq_init_cpu_notifier(&hctx->cpu_notifier,
						blk_mq_hctx_notify, hctx);
		blk_mq_register_cpu_notifier(&hctx->cpu_notifier);

		if (blk_mq_init_rq_map(hctx, reg->reserved_tags, node))
			break;

		/*
		 * Allocate space for all possible cpus to avoid allocation
		 * at runtime
		 */
		hctx->ctxs = kmalloc_node(nr_cpu_ids * sizeof(void *),
						GFP_KERNEL, node);
		if (!hctx->ctxs)
			break;

		num_maps = ALIGN(nr_cpu_ids, BITS_PER_LONG) / BITS_PER_LONG;
		hctx->ctx_map = kzalloc_node(num_maps * sizeof(unsigned long),
						GFP_KERNEL, node);
		if (!hctx->ctx_map)
			break;

		hctx->nr_ctx_map = num_maps;
		hctx->nr_ctx = 0;

		if (reg->ops->init_hctx &&
		    reg->ops->init_hctx(hctx, driver_data, i))
			break;
	}

	if (i == q->nr_hw_queues)
		return 0;

	/*
	 * Init failed
	 */
	queue_for_each_hw_ctx(q, hctx, j) {
		if (i == j)
			break;

		if (reg->ops->exit_hctx)
			reg->ops->exit_hctx(hctx, j);

		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		blk_mq_free_rq_map(hctx);
		kfree(hctx->ctxs);
	}

	return 1;
}

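/*
 * Queue setup. A software queue (ctx) is initialized for every possible CPU;
 * blk_mq_map_swqueue() then assigns each ctx an index into the ctx map of
 * the hardware queue it maps to, which is what the pending-work bitmap and
 * the dispatch loop key off.
 *
 * A rough sketch of driver-side usage (illustrative only, the mydrv_* names
 * are made up): the driver fills in a struct blk_mq_reg and the mandatory
 * ops, then registers the queue:
 *
 *	static struct blk_mq_ops mydrv_mq_ops = {
 *		.queue_rq	= mydrv_queue_rq,
 *		.map_queue	= blk_mq_map_queue,
 *		.alloc_hctx	= blk_mq_alloc_single_hw_queue,
 *		.free_hctx	= blk_mq_free_single_hw_queue,
 *	};
 *
 *	static struct blk_mq_reg mydrv_mq_reg = {
 *		.ops		= &mydrv_mq_ops,
 *		.nr_hw_queues	= 1,
 *		.queue_depth	= 64,
 *		.cmd_size	= sizeof(struct mydrv_cmd),
 *		.flags		= BLK_MQ_F_SHOULD_MERGE,
 *	};
 *
 *	q = blk_mq_init_queue(&mydrv_mq_reg, mydrv_private_data);
 */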
static void blk_mq_init_cpu_queues(struct request_queue *q,
				   unsigned int nr_hw_queues)
{
	unsigned int i;

	for_each_possible_cpu(i) {
		struct blk_mq_ctx *__ctx = per_cpu_ptr(q->queue_ctx, i);
		struct blk_mq_hw_ctx *hctx;

		memset(__ctx, 0, sizeof(*__ctx));
		__ctx->cpu = i;
		spin_lock_init(&__ctx->lock);
		INIT_LIST_HEAD(&__ctx->rq_list);
		__ctx->queue = q;

		/* If the cpu isn't online, the cpu is mapped to first hctx */
		hctx = q->mq_ops->map_queue(q, i);
		hctx->nr_ctx++;

		if (!cpu_online(i))
			continue;

		/*
		 * Set local node, IFF we have more than one hw queue. If
		 * not, we remain on the home node of the device
		 */
		if (nr_hw_queues > 1 && hctx->numa_node == NUMA_NO_NODE)
			hctx->numa_node = cpu_to_node(i);
	}
}

static void blk_mq_map_swqueue(struct request_queue *q)
{
	unsigned int i;
	struct blk_mq_hw_ctx *hctx;
	struct blk_mq_ctx *ctx;

	queue_for_each_hw_ctx(q, hctx, i) {
		hctx->nr_ctx = 0;
	}

	/*
	 * Map software to hardware queues
	 */
	queue_for_each_ctx(q, ctx, i) {
		/* If the cpu isn't online, the cpu is mapped to first hctx */
		hctx = q->mq_ops->map_queue(q, i);
		ctx->index_hw = hctx->nr_ctx;
		hctx->ctxs[hctx->nr_ctx++] = ctx;
	}
}

struct request_queue *blk_mq_init_queue(struct blk_mq_reg *reg,
					void *driver_data)
{
	struct blk_mq_hw_ctx **hctxs;
	struct blk_mq_ctx *ctx;
	struct request_queue *q;
	int i;

	if (!reg->nr_hw_queues ||
	    !reg->ops->queue_rq || !reg->ops->map_queue ||
	    !reg->ops->alloc_hctx || !reg->ops->free_hctx)
		return ERR_PTR(-EINVAL);

	if (!reg->queue_depth)
		reg->queue_depth = BLK_MQ_MAX_DEPTH;
	else if (reg->queue_depth > BLK_MQ_MAX_DEPTH) {
		pr_err("blk-mq: queue depth too large (%u)\n", reg->queue_depth);
		reg->queue_depth = BLK_MQ_MAX_DEPTH;
	}

	/*
	 * Set aside a tag for flush requests. It will only be used while
	 * another flush request is in progress but outside the driver.
	 *
	 * TODO: only allocate if flushes are supported
	 */
	reg->queue_depth++;
	reg->reserved_tags++;

	if (reg->queue_depth < (reg->reserved_tags + BLK_MQ_TAG_MIN))
		return ERR_PTR(-EINVAL);

	ctx = alloc_percpu(struct blk_mq_ctx);
	if (!ctx)
		return ERR_PTR(-ENOMEM);

	hctxs = kmalloc_node(reg->nr_hw_queues * sizeof(*hctxs), GFP_KERNEL,
			reg->numa_node);

	if (!hctxs)
		goto err_percpu;

	for (i = 0; i < reg->nr_hw_queues; i++) {
		hctxs[i] = reg->ops->alloc_hctx(reg, i);
		if (!hctxs[i])
			goto err_hctxs;

		hctxs[i]->numa_node = NUMA_NO_NODE;
		hctxs[i]->queue_num = i;
	}

	q = blk_alloc_queue_node(GFP_KERNEL, reg->numa_node);
	if (!q)
		goto err_hctxs;

	q->mq_map = blk_mq_make_queue_map(reg);
	if (!q->mq_map)
		goto err_map;

	setup_timer(&q->timeout, blk_mq_rq_timer, (unsigned long) q);
	blk_queue_rq_timeout(q, 30000);

	q->nr_queues = nr_cpu_ids;
	q->nr_hw_queues = reg->nr_hw_queues;

	q->queue_ctx = ctx;
	q->queue_hw_ctx = hctxs;

	q->mq_ops = reg->ops;
	q->queue_flags |= QUEUE_FLAG_MQ_DEFAULT;

	blk_queue_make_request(q, blk_mq_make_request);
	blk_queue_rq_timed_out(q, reg->ops->timeout);
	if (reg->timeout)
		blk_queue_rq_timeout(q, reg->timeout);

	blk_mq_init_flush(q);
	blk_mq_init_cpu_queues(q, reg->nr_hw_queues);

	if (blk_mq_init_hw_queues(q, reg, driver_data))
		goto err_hw;

	blk_mq_map_swqueue(q);

	mutex_lock(&all_q_mutex);
	list_add_tail(&q->all_q_node, &all_q_list);
	mutex_unlock(&all_q_mutex);

	return q;
err_hw:
	kfree(q->mq_map);
err_map:
	blk_cleanup_queue(q);
err_hctxs:
	for (i = 0; i < reg->nr_hw_queues; i++) {
		if (!hctxs[i])
			break;
		reg->ops->free_hctx(hctxs[i], i);
	}
	kfree(hctxs);
err_percpu:
	free_percpu(ctx);
	return ERR_PTR(-ENOMEM);
}
EXPORT_SYMBOL(blk_mq_init_queue);

void blk_mq_free_queue(struct request_queue *q)
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		cancel_delayed_work_sync(&hctx->delayed_work);
		kfree(hctx->ctx_map);
		kfree(hctx->ctxs);
		blk_mq_free_rq_map(hctx);
		blk_mq_unregister_cpu_notifier(&hctx->cpu_notifier);
		if (q->mq_ops->exit_hctx)
			q->mq_ops->exit_hctx(hctx, i);
		q->mq_ops->free_hctx(hctx, i);
	}

	free_percpu(q->queue_ctx);
	kfree(q->queue_hw_ctx);
	kfree(q->mq_map);

	q->queue_ctx = NULL;
	q->queue_hw_ctx = NULL;
	q->mq_map = NULL;

	mutex_lock(&all_q_mutex);
	list_del_init(&q->all_q_node);
	mutex_unlock(&all_q_mutex);
}
EXPORT_SYMBOL(blk_mq_free_queue);

/* Basically redo blk_mq_init_queue with queue frozen */
static void blk_mq_queue_reinit(struct request_queue *q)
{
	blk_mq_freeze_queue(q);

	blk_mq_update_queue_map(q->mq_map, q->nr_hw_queues);

	/*
	 * redo blk_mq_init_cpu_queues and blk_mq_init_hw_queues. FIXME: maybe
	 * we should change hctx numa_node according to the new topology (this
	 * involves freeing and re-allocating memory, is it worth doing?)
	 */

	blk_mq_map_swqueue(q);

	blk_mq_unfreeze_queue(q);
}

static int blk_mq_queue_reinit_notify(struct notifier_block *nb,
				      unsigned long action, void *hcpu)
{
	struct request_queue *q;

	/*
	 * Before the new mapping is established, a hotadded cpu might already
	 * have started handling requests. This doesn't break anything, as we
	 * map offline CPUs to the first hardware queue. We will re-init the
	 * queue below to get optimal settings.
	 */
	if (action != CPU_DEAD && action != CPU_DEAD_FROZEN &&
	    action != CPU_ONLINE && action != CPU_ONLINE_FROZEN)
		return NOTIFY_OK;

	mutex_lock(&all_q_mutex);
	list_for_each_entry(q, &all_q_list, all_q_node)
		blk_mq_queue_reinit(q);
	mutex_unlock(&all_q_mutex);
	return NOTIFY_OK;
}

static int __init blk_mq_init(void)
{
	unsigned int i;

	for_each_possible_cpu(i)
		init_llist_head(&per_cpu(ipi_lists, i));

	blk_mq_cpu_init();

	/* Must be called after percpu_counter_hotcpu_callback() */
	hotcpu_notifier(blk_mq_queue_reinit_notify, -10);

	return 0;
}
subsys_initcall(blk_mq_init);