/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

void blk_mq_sched_free_hctx_data(struct request_queue *q,
				 void (*exit)(struct blk_mq_hw_ctx *))
{
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		if (exit && hctx->sched_data)
			exit(hctx);
		kfree(hctx->sched_data);
		hctx->sched_data = NULL;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);

void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct io_context *ioc = rq_ioc(bio);
	struct io_cq *icq;

	spin_lock_irq(q->queue_lock);
	icq = ioc_lookup_icq(ioc, q);
	spin_unlock_irq(q->queue_lock);

	if (!icq) {
		icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
		if (!icq)
			return;
	}
	get_io_context(icq->ioc);
	rq->elv.icq = icq;
}

/*
 * Mark a hardware queue as needing a restart. For shared queues, maintain
 * a count of how many hardware queues are marked for restart.
 */
static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return;

	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
		struct request_queue *q = hctx->queue;

		if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
			atomic_inc(&q->shared_hctx_restart);
	} else
		set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
	if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
		return false;

	if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
		struct request_queue *q = hctx->queue;

		if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
			atomic_dec(&q->shared_hctx_restart);
	} else
		clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

	return blk_mq_run_hw_queue(hctx, true);
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	LIST_HEAD(rq_list);

	do {
		struct request *rq;

		if (e->type->ops.mq.has_work &&
		    !e->type->ops.mq.has_work(hctx))
			break;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;

		rq = e->type->ops.mq.dispatch_request(hctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);
	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));
}
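
/*
 * Return the software queue that follows @ctx on this hardware queue,
 * wrapping around to the first one. Used by blk_mq_do_dispatch_ctx() for
 * round-robin dispatch across the per-CPU software queues.
 */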
static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
					  struct blk_mq_ctx *ctx)
{
	unsigned idx = ctx->index_hw;

	if (++idx == hctx->nr_ctx)
		idx = 0;

	return hctx->ctxs[idx];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	LIST_HEAD(rq_list);
	struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);

	do {
		struct request *rq;

		if (!sbitmap_any_bit_set(&hctx->ctx_map))
			break;

		if (!blk_mq_get_dispatch_budget(hctx))
			break;

		rq = blk_mq_dequeue_from_ctx(hctx, ctx);
		if (!rq) {
			blk_mq_put_dispatch_budget(hctx);
			break;
		}

		/*
		 * Now this rq owns the budget which has to be released
		 * if this rq won't be queued to driver via .queue_rq()
		 * in blk_mq_dispatch_rq_list().
		 */
		list_add(&rq->queuelist, &rq_list);

		/* round robin for fair dispatch */
		ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

	} while (blk_mq_dispatch_rq_list(q, &rq_list, true));

	WRITE_ONCE(hctx->dispatch_from, ctx);
}
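
/*
 * Run the hardware queue: requests left over on the hctx dispatch list
 * from an earlier run are sent to the driver first; the I/O scheduler
 * (or, without one, the software queues) is only asked for more requests
 * while the driver is still accepting them.
 */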
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct elevator_queue *e = q->elevator;
	const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
	LIST_HEAD(rq_list);

	/* RCU or SRCU read lock is needed before checking quiesced flag */
	if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
		return;

	hctx->run++;

	/*
	 * If we have previous entries on our dispatch list, grab them first
	 * for fairer dispatch.
	 */
	if (!list_empty_careful(&hctx->dispatch)) {
		spin_lock(&hctx->lock);
		if (!list_empty(&hctx->dispatch))
			list_splice_init(&hctx->dispatch, &rq_list);
		spin_unlock(&hctx->lock);
	}

	/*
	 * Only ask the scheduler for requests if we didn't have residual
	 * requests from the dispatch list. This is to avoid the case where
	 * we only ever dispatch a fraction of the requests available because
	 * of low device queue depth. Once we pull requests out of the IO
	 * scheduler, we can no longer merge or sort them. So it's best to
	 * leave them there for as long as we can. Mark the hw queue as
	 * needing a restart in that case.
	 *
	 * We want to dispatch from the scheduler if there was nothing
	 * on the dispatch list or we were able to dispatch from the
	 * dispatch list.
	 */
	if (!list_empty(&rq_list)) {
		blk_mq_sched_mark_restart_hctx(hctx);
		if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
			if (has_sched_dispatch)
				blk_mq_do_dispatch_sched(hctx);
			else
				blk_mq_do_dispatch_ctx(hctx);
		}
	} else if (has_sched_dispatch) {
		blk_mq_do_dispatch_sched(hctx);
	} else if (q->mq_ops->get_budget) {
		/*
		 * If we need to get budget before queueing a request,
		 * dequeue requests one by one from the sw queue to avoid
		 * messing up I/O merging when dispatch runs out of
		 * resources.
		 *
		 * TODO: get more budget, and dequeue more requests at a
		 * time.
		 */
		blk_mq_do_dispatch_ctx(hctx);
	} else {
		blk_mq_flush_busy_ctxs(hctx, &rq_list);
		blk_mq_dispatch_rq_list(q, &rq_list, false);
	}
}

bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
			    struct request **merged_request)
{
	struct request *rq;

	switch (elv_merge(q, &rq, bio)) {
	case ELEVATOR_BACK_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_back_merge(q, rq, bio))
			return false;
		*merged_request = attempt_back_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
		return true;
	case ELEVATOR_FRONT_MERGE:
		if (!blk_mq_sched_allow_merge(q, rq, bio))
			return false;
		if (!bio_attempt_front_merge(q, rq, bio))
			return false;
		*merged_request = attempt_front_merge(q, rq);
		if (!*merged_request)
			elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
		return true;
	case ELEVATOR_DISCARD_MERGE:
		return bio_attempt_discard_merge(q, rq, bio);
	default:
		return false;
	}
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

/*
 * Check our software queue, in reverse order, for entries that we could
 * potentially merge with. Currently includes a hand-wavy stop count of 8,
 * to not spend too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
				 struct blk_mq_ctx *ctx, struct bio *bio)
{
	struct request *rq;
	int checked = 8;

	lockdep_assert_held(&ctx->lock);

	list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
		bool merged = false;

		if (!checked--)
			break;

		if (!blk_rq_merge_ok(rq, bio))
			continue;

		switch (blk_try_merge(rq, bio)) {
		case ELEVATOR_BACK_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_back_merge(q, rq, bio);
			break;
		case ELEVATOR_FRONT_MERGE:
			if (blk_mq_sched_allow_merge(q, rq, bio))
				merged = bio_attempt_front_merge(q, rq, bio);
			break;
		case ELEVATOR_DISCARD_MERGE:
			merged = bio_attempt_discard_merge(q, rq, bio);
			break;
		default:
			continue;
		}

		if (merged)
			ctx->rq_merged++;
		return merged;
	}

	return false;
}

bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	bool ret = false;

	if (e && e->type->ops.mq.bio_merge) {
		blk_mq_put_ctx(ctx);
		return e->type->ops.mq.bio_merge(hctx, bio);
	}

	if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
		/* default per sw-queue merge */
		spin_lock(&ctx->lock);
		ret = blk_mq_attempt_merge(q, ctx, bio);
		spin_unlock(&ctx->lock);
	}

	blk_mq_put_ctx(ctx);
	return ret;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
	return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
	trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
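
/*
 * Requests generated by the flush machinery bypass the I/O scheduler:
 * they are added straight to the hctx dispatch list and the caller skips
 * the normal insertion path. Any other request is marked RQF_SORTED when
 * a scheduler is attached, and the caller inserts it as usual.
 */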
static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
				       bool has_sched,
				       struct request *rq)
{
	/* dispatch flush rq directly */
	if (rq->rq_flags & RQF_FLUSH_SEQ) {
		spin_lock(&hctx->lock);
		list_add(&rq->queuelist, &hctx->dispatch);
		spin_unlock(&hctx->lock);
		return true;
	}

	if (has_sched)
		rq->rq_flags |= RQF_SORTED;

	return false;
}

/**
 * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 * @pos:    loop cursor.
 * @skip:   the list element that will not be examined. Iteration starts at
 *          @skip->next.
 * @head:   head of the list to examine. This list must have at least one
 *          element, namely @skip.
 * @member: name of the list_head structure within typeof(*pos).
 */
#define list_for_each_entry_rcu_rr(pos, skip, head, member)		\
	for ((pos) = (skip);						\
	     (pos = (pos)->member.next != (head) ? list_entry_rcu(	\
			(pos)->member.next, typeof(*pos), member) :	\
			list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
	     (pos) != (skip); )

/*
 * Called after a driver tag has been freed to check whether a hctx needs to
 * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
 * queues in a round-robin fashion if the tag set of @hctx is shared with other
 * hardware queues.
 */
void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
{
	struct blk_mq_tags *const tags = hctx->tags;
	struct blk_mq_tag_set *const set = hctx->queue->tag_set;
	struct request_queue *const queue = hctx->queue, *q;
	struct blk_mq_hw_ctx *hctx2;
	unsigned int i, j;

	if (set->flags & BLK_MQ_F_TAG_SHARED) {
		/*
		 * If this is 0, then we know that no hardware queues
		 * have RESTART marked. We're done.
		 */
		if (!atomic_read(&queue->shared_hctx_restart))
			return;

		rcu_read_lock();
		list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
					   tag_set_list) {
			queue_for_each_hw_ctx(q, hctx2, i)
				if (hctx2->tags == tags &&
				    blk_mq_sched_restart_hctx(hctx2))
					goto done;
		}
		j = hctx->queue_num + 1;
		for (i = 0; i < queue->nr_hw_queues; i++, j++) {
			if (j == queue->nr_hw_queues)
				j = 0;
			hctx2 = queue->queue_hw_ctx[j];
			if (hctx2->tags == tags &&
			    blk_mq_sched_restart_hctx(hctx2))
				break;
		}
done:
		rcu_read_unlock();
	} else {
		blk_mq_sched_restart_hctx(hctx);
	}
}

void blk_mq_sched_insert_request(struct request *rq, bool at_head,
				 bool run_queue, bool async)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;
	struct blk_mq_ctx *ctx = rq->mq_ctx;
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

	/* a flush rq in the flush machinery needs to be dispatched directly */
	if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
		blk_insert_flush(rq);
		goto run;
	}

	WARN_ON(e && (rq->tag != -1));

	if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
		goto run;

	if (e && e->type->ops.mq.insert_requests) {
		LIST_HEAD(list);

		list_add(&rq->queuelist, &list);
		e->type->ops.mq.insert_requests(hctx, &list, at_head);
	} else {
		spin_lock(&ctx->lock);
		__blk_mq_insert_request(hctx, rq, at_head);
		spin_unlock(&ctx->lock);
	}

run:
	if (run_queue)
		blk_mq_run_hw_queue(hctx, async);
}
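
/*
 * Insert a list of requests that all belong to software queue @ctx, either
 * through the elevator or directly into the software queue, and then kick
 * the hardware queue.
 */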
void blk_mq_sched_insert_requests(struct request_queue *q,
				  struct blk_mq_ctx *ctx,
				  struct list_head *list, bool run_queue_async)
{
	struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
	struct elevator_queue *e = hctx->queue->elevator;

	if (e && e->type->ops.mq.insert_requests)
		e->type->ops.mq.insert_requests(hctx, list, false);
	else
		blk_mq_insert_requests(hctx, ctx, list);

	blk_mq_run_hw_queue(hctx, run_queue_async);
}

static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	if (hctx->sched_tags) {
		blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
		blk_mq_free_rq_map(hctx->sched_tags);
		hctx->sched_tags = NULL;
	}
}

static int blk_mq_sched_alloc_tags(struct request_queue *q,
				   struct blk_mq_hw_ctx *hctx,
				   unsigned int hctx_idx)
{
	struct blk_mq_tag_set *set = q->tag_set;
	int ret;

	hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
					       set->reserved_tags);
	if (!hctx->sched_tags)
		return -ENOMEM;

	ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
	if (ret)
		blk_mq_sched_free_tags(set, hctx, hctx_idx);

	return ret;
}

static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
	struct blk_mq_tag_set *set = q->tag_set;
	struct blk_mq_hw_ctx *hctx;
	int i;

	queue_for_each_hw_ctx(q, hctx, i)
		blk_mq_sched_free_tags(set, hctx, i);
}

int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			   unsigned int hctx_idx)
{
	struct elevator_queue *e = q->elevator;
	int ret;

	if (!e)
		return 0;

	ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
	if (ret)
		return ret;

	if (e->type->ops.mq.init_hctx) {
		ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
		if (ret) {
			blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
			return ret;
		}
	}

	blk_mq_debugfs_register_sched_hctx(q, hctx);

	return 0;
}

void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
			    unsigned int hctx_idx)
{
	struct elevator_queue *e = q->elevator;

	if (!e)
		return;

	blk_mq_debugfs_unregister_sched_hctx(hctx);

	if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
		e->type->ops.mq.exit_hctx(hctx, hctx_idx);
		hctx->sched_data = NULL;
	}

	blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
}
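
/*
 * Attach elevator type @e to @q: size q->nr_requests, allocate per-hctx
 * scheduler tags, call the elevator's init_sched and init_hctx hooks and
 * register the debugfs attributes. A NULL @e means "none" and simply
 * clears q->elevator.
 */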
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct blk_mq_hw_ctx *hctx;
	struct elevator_queue *eq;
	unsigned int i;
	int ret;

	if (!e) {
		q->elevator = NULL;
		return 0;
	}

	/*
	 * Default to twice the smaller of the hardware queue depth and 128,
	 * since we don't split into sync/async like the old code did.
	 * Additionally, this is a per-hw queue depth.
	 */
	q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
				   BLKDEV_MAX_RQ);

	queue_for_each_hw_ctx(q, hctx, i) {
		ret = blk_mq_sched_alloc_tags(q, hctx, i);
		if (ret)
			goto err;
	}

	ret = e->ops.mq.init_sched(q, e);
	if (ret)
		goto err;

	blk_mq_debugfs_register_sched(q);

	queue_for_each_hw_ctx(q, hctx, i) {
		if (e->ops.mq.init_hctx) {
			ret = e->ops.mq.init_hctx(hctx, i);
			if (ret) {
				eq = q->elevator;
				blk_mq_exit_sched(q, eq);
				kobject_put(&eq->kobj);
				return ret;
			}
		}
		blk_mq_debugfs_register_sched_hctx(q, hctx);
	}

	return 0;

err:
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
	return ret;
}

void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
	struct blk_mq_hw_ctx *hctx;
	unsigned int i;

	queue_for_each_hw_ctx(q, hctx, i) {
		blk_mq_debugfs_unregister_sched_hctx(hctx);
		if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
			e->type->ops.mq.exit_hctx(hctx, i);
			hctx->sched_data = NULL;
		}
	}
	blk_mq_debugfs_unregister_sched(q);
	if (e->type->ops.mq.exit_sched)
		e->type->ops.mq.exit_sched(e);
	blk_mq_sched_tags_teardown(q);
	q->elevator = NULL;
}

int blk_mq_sched_init(struct request_queue *q)
{
	int ret;

	mutex_lock(&q->sysfs_lock);
	ret = elevator_init(q, NULL);
	mutex_unlock(&q->sysfs_lock);

	return ret;
}
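
/*
 * Illustrative sketch only, not compiled here: a blk-mq I/O scheduler
 * plugs into this framework by filling in the ops.mq callbacks invoked
 * above (init_sched, init_hctx, insert_requests, dispatch_request,
 * has_work, bio_merge, exit_hctx, exit_sched) and registering its
 * elevator_type with elv_register(). The "example_*" names below are
 * hypothetical placeholders, not existing kernel symbols.
 *
 *	static struct elevator_type example_mq_sched = {
 *		.ops.mq = {
 *			.init_sched		= example_init_sched,
 *			.exit_sched		= example_exit_sched,
 *			.insert_requests	= example_insert_requests,
 *			.dispatch_request	= example_dispatch_request,
 *			.has_work		= example_has_work,
 *		},
 *		.uses_mq	= true,
 *		.elevator_name	= "example",
 *		.elevator_owner	= THIS_MODULE,
 *	};
 */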