/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

void blk_mq_sched_free_hctx_data(struct request_queue *q,
                                 void (*exit)(struct blk_mq_hw_ctx *))
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i) {
                if (exit && hctx->sched_data)
                        exit(hctx);
                kfree(hctx->sched_data);
                hctx->sched_data = NULL;
        }
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);

void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
{
        struct request_queue *q = rq->q;
        struct io_context *ioc = rq_ioc(bio);
        struct io_cq *icq;

        spin_lock_irq(q->queue_lock);
        icq = ioc_lookup_icq(ioc, q);
        spin_unlock_irq(q->queue_lock);

        if (!icq) {
                icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
                if (!icq)
                        return;
        }
        get_io_context(icq->ioc);
        rq->elv.icq = icq;
}

/*
 * Mark a hardware queue as needing a restart. For shared queues, maintain
 * a count of how many hardware queues are marked for restart.
 */
static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
        if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                return;

        if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
                struct request_queue *q = hctx->queue;

                if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                        atomic_inc(&q->shared_hctx_restart);
        } else
                set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
        if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                return false;

        if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
                struct request_queue *q = hctx->queue;

                if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                        atomic_dec(&q->shared_hctx_restart);
        } else
                clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

        return blk_mq_run_hw_queue(hctx, true);
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart the queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_sched(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        struct elevator_queue *e = q->elevator;
        LIST_HEAD(rq_list);

        do {
                struct request *rq;

                if (e->type->ops.mq.has_work &&
                    !e->type->ops.mq.has_work(hctx))
                        break;

                if (!blk_mq_get_dispatch_budget(hctx))
                        break;

                rq = e->type->ops.mq.dispatch_request(hctx);
                if (!rq) {
                        blk_mq_put_dispatch_budget(hctx);
                        break;
                }

                /*
                 * Now this rq owns the budget, which has to be released
                 * if this rq won't be queued to the driver via .queue_rq()
                 * in blk_mq_dispatch_rq_list().
                 */
                list_add(&rq->queuelist, &rq_list);
        } while (blk_mq_dispatch_rq_list(q, &rq_list, true));
}

static struct blk_mq_ctx *blk_mq_next_ctx(struct blk_mq_hw_ctx *hctx,
                                          struct blk_mq_ctx *ctx)
{
        unsigned idx = ctx->index_hw;

        if (++idx == hctx->nr_ctx)
                idx = 0;

        return hctx->ctxs[idx];
}

/*
 * Only SCSI implements .get_budget and .put_budget, and SCSI restarts
 * its queue by itself in its completion handler, so we don't need to
 * restart the queue if .get_budget() returns BLK_STS_NO_RESOURCE.
 */
static void blk_mq_do_dispatch_ctx(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        LIST_HEAD(rq_list);
        struct blk_mq_ctx *ctx = READ_ONCE(hctx->dispatch_from);

        do {
                struct request *rq;

                if (!sbitmap_any_bit_set(&hctx->ctx_map))
                        break;

                if (!blk_mq_get_dispatch_budget(hctx))
                        break;

                rq = blk_mq_dequeue_from_ctx(hctx, ctx);
                if (!rq) {
                        blk_mq_put_dispatch_budget(hctx);
                        break;
                }

                /*
                 * Now this rq owns the budget, which has to be released
                 * if this rq won't be queued to the driver via .queue_rq()
                 * in blk_mq_dispatch_rq_list().
                 */
                list_add(&rq->queuelist, &rq_list);

                /* round robin for fair dispatch */
                ctx = blk_mq_next_ctx(hctx, rq->mq_ctx);

        } while (blk_mq_dispatch_rq_list(q, &rq_list, true));

        WRITE_ONCE(hctx->dispatch_from, ctx);
}

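/*
 * Main dispatch entry for a hardware queue: drain any requests left over on
 * hctx->dispatch first, then pull new work from the I/O scheduler, or from
 * the per-CPU software queues when no scheduler dispatch hook is set.
 */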
void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        struct elevator_queue *e = q->elevator;
        const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
        LIST_HEAD(rq_list);

        /* RCU or SRCU read lock is needed before checking quiesced flag */
        if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
                return;

        hctx->run++;

        /*
         * If we have previous entries on our dispatch list, grab them first
         * for fairer dispatch.
         */
        if (!list_empty_careful(&hctx->dispatch)) {
                spin_lock(&hctx->lock);
                if (!list_empty(&hctx->dispatch))
                        list_splice_init(&hctx->dispatch, &rq_list);
                spin_unlock(&hctx->lock);
        }

        /*
         * Only ask the scheduler for requests if we didn't have residual
         * requests from the dispatch list. This is to avoid the case where
         * we only ever dispatch a fraction of the requests available because
         * of low device queue depth. Once we pull requests out of the IO
         * scheduler, we can no longer merge or sort them. So it's best to
         * leave them there for as long as we can. Mark the hw queue as
         * needing a restart in that case.
         *
         * We want to dispatch from the scheduler if there was nothing
         * on the dispatch list or we were able to dispatch from the
         * dispatch list.
         */
        if (!list_empty(&rq_list)) {
                blk_mq_sched_mark_restart_hctx(hctx);
                if (blk_mq_dispatch_rq_list(q, &rq_list, false)) {
                        if (has_sched_dispatch)
                                blk_mq_do_dispatch_sched(hctx);
                        else
                                blk_mq_do_dispatch_ctx(hctx);
                }
        } else if (has_sched_dispatch) {
                blk_mq_do_dispatch_sched(hctx);
        } else if (q->mq_ops->get_budget) {
                /*
                 * If we need to get a budget before queueing a request, we
                 * dequeue requests one by one from the sw queue to avoid
                 * messing up I/O merging when dispatch runs out of resources.
                 *
                 * TODO: get more budget, and dequeue more requests at a time.
                 */
                blk_mq_do_dispatch_ctx(hctx);
        } else {
                blk_mq_flush_busy_ctxs(hctx, &rq_list);
                blk_mq_dispatch_rq_list(q, &rq_list, false);
        }
}

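/*
 * Ask the elevator core for a merge candidate for @bio and try to merge.
 * When two requests get collapsed into one, the now-redundant request is
 * returned via @merged_request so the caller can free it. Exported for
 * blk-mq I/O schedulers, e.g. for use from their ->bio_merge hook.
 */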
bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                            struct request **merged_request)
{
        struct request *rq;

        switch (elv_merge(q, &rq, bio)) {
        case ELEVATOR_BACK_MERGE:
                if (!blk_mq_sched_allow_merge(q, rq, bio))
                        return false;
                if (!bio_attempt_back_merge(q, rq, bio))
                        return false;
                *merged_request = attempt_back_merge(q, rq);
                if (!*merged_request)
                        elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
                return true;
        case ELEVATOR_FRONT_MERGE:
                if (!blk_mq_sched_allow_merge(q, rq, bio))
                        return false;
                if (!bio_attempt_front_merge(q, rq, bio))
                        return false;
                *merged_request = attempt_front_merge(q, rq);
                if (!*merged_request)
                        elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
                return true;
        case ELEVATOR_DISCARD_MERGE:
                return bio_attempt_discard_merge(q, rq, bio);
        default:
                return false;
        }
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);

/*
 * Iterate the list of requests and see if we can merge this bio with any
 * of them.
 */
bool blk_mq_bio_list_merge(struct request_queue *q, struct list_head *list,
                           struct bio *bio)
{
        struct request *rq;
        int checked = 8;

        list_for_each_entry_reverse(rq, list, queuelist) {
                bool merged = false;

                if (!checked--)
                        break;

                if (!blk_rq_merge_ok(rq, bio))
                        continue;

                switch (blk_try_merge(rq, bio)) {
                case ELEVATOR_BACK_MERGE:
                        if (blk_mq_sched_allow_merge(q, rq, bio))
                                merged = bio_attempt_back_merge(q, rq, bio);
                        break;
                case ELEVATOR_FRONT_MERGE:
                        if (blk_mq_sched_allow_merge(q, rq, bio))
                                merged = bio_attempt_front_merge(q, rq, bio);
                        break;
                case ELEVATOR_DISCARD_MERGE:
                        merged = bio_attempt_discard_merge(q, rq, bio);
                        break;
                default:
                        continue;
                }

                return merged;
        }

        return false;
}
EXPORT_SYMBOL_GPL(blk_mq_bio_list_merge);

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
                                 struct blk_mq_ctx *ctx, struct bio *bio)
{
        lockdep_assert_held(&ctx->lock);

        if (blk_mq_bio_list_merge(q, &ctx->rq_list, bio)) {
                ctx->rq_merged++;
                return true;
        }

        return false;
}

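/*
 * Try to merge @bio before a new request is allocated: hand it to the
 * scheduler's ->bio_merge hook if one exists, otherwise fall back to the
 * default per-software-queue merge when BLK_MQ_F_SHOULD_MERGE is set.
 */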
bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
        bool ret = false;

        if (e && e->type->ops.mq.bio_merge) {
                blk_mq_put_ctx(ctx);
                return e->type->ops.mq.bio_merge(hctx, bio);
        }

        if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
                /* default per sw-queue merge */
                spin_lock(&ctx->lock);
                ret = blk_mq_attempt_merge(q, ctx, bio);
                spin_unlock(&ctx->lock);
        }

        blk_mq_put_ctx(ctx);
        return ret;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
        return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
        trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);

static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
                                       bool has_sched,
                                       struct request *rq)
{
        /* dispatch flush rq directly */
        if (rq->rq_flags & RQF_FLUSH_SEQ) {
                spin_lock(&hctx->lock);
                list_add(&rq->queuelist, &hctx->dispatch);
                spin_unlock(&hctx->lock);
                return true;
        }

        if (has_sched)
                rq->rq_flags |= RQF_SORTED;

        return false;
}

/**
 * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 * @pos:    loop cursor.
 * @skip:   the list element that will not be examined. Iteration starts at
 *          @skip->next.
 * @head:   head of the list to examine. This list must have at least one
 *          element, namely @skip.
 * @member: name of the list_head structure within typeof(*pos).
 */
#define list_for_each_entry_rcu_rr(pos, skip, head, member)            \
        for ((pos) = (skip);                                            \
             (pos = (pos)->member.next != (head) ? list_entry_rcu(      \
                        (pos)->member.next, typeof(*pos), member) :     \
                  list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
             (pos) != (skip); )

/*
 * Called after a driver tag has been freed to check whether a hctx needs to
 * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
 * queues in a round-robin fashion if the tag set of @hctx is shared with other
 * hardware queues.
 */
void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
{
        struct blk_mq_tags *const tags = hctx->tags;
        struct blk_mq_tag_set *const set = hctx->queue->tag_set;
        struct request_queue *const queue = hctx->queue, *q;
        struct blk_mq_hw_ctx *hctx2;
        unsigned int i, j;

        if (set->flags & BLK_MQ_F_TAG_SHARED) {
                /*
                 * If this is 0, then we know that no hardware queues
                 * have RESTART marked. We're done.
                 */
                if (!atomic_read(&queue->shared_hctx_restart))
                        return;

                rcu_read_lock();
                list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
                                           tag_set_list) {
                        queue_for_each_hw_ctx(q, hctx2, i)
                                if (hctx2->tags == tags &&
                                    blk_mq_sched_restart_hctx(hctx2))
                                        goto done;
                }
                j = hctx->queue_num + 1;
                for (i = 0; i < queue->nr_hw_queues; i++, j++) {
                        if (j == queue->nr_hw_queues)
                                j = 0;
                        hctx2 = queue->queue_hw_ctx[j];
                        if (hctx2->tags == tags &&
                            blk_mq_sched_restart_hctx(hctx2))
                                break;
                }
done:
                rcu_read_unlock();
        } else {
                blk_mq_sched_restart_hctx(hctx);
        }
}

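/*
 * Insert a single request: route it through the flush machinery, the hctx
 * dispatch list, the scheduler's ->insert_requests hook or the software
 * queue as appropriate, and optionally run the hardware queue afterwards.
 */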
void blk_mq_sched_insert_request(struct request *rq, bool at_head,
                                 bool run_queue, bool async)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

        /* a flush rq in the flush machinery needs to be dispatched directly */
        if (!(rq->rq_flags & RQF_FLUSH_SEQ) && op_is_flush(rq->cmd_flags)) {
                blk_insert_flush(rq);
                goto run;
        }

        WARN_ON(e && (rq->tag != -1));

        if (blk_mq_sched_bypass_insert(hctx, !!e, rq))
                goto run;

        if (e && e->type->ops.mq.insert_requests) {
                LIST_HEAD(list);

                list_add(&rq->queuelist, &list);
                e->type->ops.mq.insert_requests(hctx, &list, at_head);
        } else {
                spin_lock(&ctx->lock);
                __blk_mq_insert_request(hctx, rq, at_head);
                spin_unlock(&ctx->lock);
        }

run:
        if (run_queue)
                blk_mq_run_hw_queue(hctx, async);
}

void blk_mq_sched_insert_requests(struct request_queue *q,
                                  struct blk_mq_ctx *ctx,
                                  struct list_head *list, bool run_queue_async)
{
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
        struct elevator_queue *e = hctx->queue->elevator;

        if (e && e->type->ops.mq.insert_requests)
                e->type->ops.mq.insert_requests(hctx, list, false);
        else
                blk_mq_insert_requests(hctx, ctx, list);

        blk_mq_run_hw_queue(hctx, run_queue_async);
}

static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
                                   struct blk_mq_hw_ctx *hctx,
                                   unsigned int hctx_idx)
{
        if (hctx->sched_tags) {
                blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
                blk_mq_free_rq_map(hctx->sched_tags);
                hctx->sched_tags = NULL;
        }
}

static int blk_mq_sched_alloc_tags(struct request_queue *q,
                                   struct blk_mq_hw_ctx *hctx,
                                   unsigned int hctx_idx)
{
        struct blk_mq_tag_set *set = q->tag_set;
        int ret;

        hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
                                               set->reserved_tags);
        if (!hctx->sched_tags)
                return -ENOMEM;

        ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
        if (ret)
                blk_mq_sched_free_tags(set, hctx, hctx_idx);

        return ret;
}

static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_sched_free_tags(set, hctx, i);
}

int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                           unsigned int hctx_idx)
{
        struct elevator_queue *e = q->elevator;
        int ret;

        if (!e)
                return 0;

        ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
        if (ret)
                return ret;

        if (e->type->ops.mq.init_hctx) {
                ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
                if (ret) {
                        blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
                        return ret;
                }
        }

        blk_mq_debugfs_register_sched_hctx(q, hctx);

        return 0;
}

void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                            unsigned int hctx_idx)
{
        struct elevator_queue *e = q->elevator;

        if (!e)
                return;

        blk_mq_debugfs_unregister_sched_hctx(hctx);

        if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
                e->type->ops.mq.exit_hctx(hctx, hctx_idx);
                hctx->sched_data = NULL;
        }

        blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
}

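/*
 * Attach elevator type @e to @q: size q->nr_requests, allocate per-hctx
 * scheduler tags, then invoke the scheduler's ->init_sched and ->init_hctx
 * hooks. A NULL @e switches the queue to "none", i.e. no scheduler.
 */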
int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
        struct blk_mq_hw_ctx *hctx;
        struct elevator_queue *eq;
        unsigned int i;
        int ret;

        if (!e) {
                q->elevator = NULL;
                q->nr_requests = q->tag_set->queue_depth;
                return 0;
        }

        /*
         * Default to twice the smaller of the hardware queue depth and 128,
         * since we don't split into sync/async like the old code did.
         * Additionally, this is a per-hw queue depth.
         */
        q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
                                   BLKDEV_MAX_RQ);

        queue_for_each_hw_ctx(q, hctx, i) {
                ret = blk_mq_sched_alloc_tags(q, hctx, i);
                if (ret)
                        goto err;
        }

        ret = e->ops.mq.init_sched(q, e);
        if (ret)
                goto err;

        blk_mq_debugfs_register_sched(q);

        queue_for_each_hw_ctx(q, hctx, i) {
                if (e->ops.mq.init_hctx) {
                        ret = e->ops.mq.init_hctx(hctx, i);
                        if (ret) {
                                eq = q->elevator;
                                blk_mq_exit_sched(q, eq);
                                kobject_put(&eq->kobj);
                                return ret;
                        }
                }
                blk_mq_debugfs_register_sched_hctx(q, hctx);
        }

        return 0;

err:
        blk_mq_sched_tags_teardown(q);
        q->elevator = NULL;
        return ret;
}

void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned int i;

        queue_for_each_hw_ctx(q, hctx, i) {
                blk_mq_debugfs_unregister_sched_hctx(hctx);
                if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
                        e->type->ops.mq.exit_hctx(hctx, i);
                        hctx->sched_data = NULL;
                }
        }
        blk_mq_debugfs_unregister_sched(q);
        if (e->type->ops.mq.exit_sched)
                e->type->ops.mq.exit_sched(e);
        blk_mq_sched_tags_teardown(q);
        q->elevator = NULL;
}

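/*
 * Illustrative sketch, not part of the original file: roughly what a blk-mq
 * I/O scheduler supplies for the framework above to call. The ops.mq
 * callback names and signatures mirror the calls made in this file; the
 * foo_* names are hypothetical placeholders, and the callback bodies, the
 * remaining elevator_type fields and registration (e.g. via elv_register())
 * are omitted.
 *
 *      static int foo_init_sched(struct request_queue *q,
 *                                struct elevator_type *e);
 *      static bool foo_has_work(struct blk_mq_hw_ctx *hctx);
 *      static struct request *foo_dispatch_request(struct blk_mq_hw_ctx *hctx);
 *      static void foo_insert_requests(struct blk_mq_hw_ctx *hctx,
 *                                      struct list_head *list, bool at_head);
 *      static bool foo_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio);
 *
 *      static struct elevator_type foo_sched = {
 *              .ops.mq = {
 *                      .init_sched       = foo_init_sched,
 *                      .has_work         = foo_has_work,
 *                      .dispatch_request = foo_dispatch_request,
 *                      .insert_requests  = foo_insert_requests,
 *                      .bio_merge        = foo_bio_merge,
 *              },
 *      };
 *
 * blk_mq_init_sched() allocates sched_tags and runs ->init_sched/->init_hctx,
 * blk_mq_sched_dispatch_requests() pulls work via ->has_work and
 * ->dispatch_request, and __blk_mq_sched_bio_merge() consults ->bio_merge.
 */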