/*
 * blk-mq scheduling framework
 *
 * Copyright (C) 2016 Jens Axboe
 */
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/blk-mq.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-wbt.h"

void blk_mq_sched_free_hctx_data(struct request_queue *q,
                                 void (*exit)(struct blk_mq_hw_ctx *))
{
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i) {
                if (exit && hctx->sched_data)
                        exit(hctx);
                kfree(hctx->sched_data);
                hctx->sched_data = NULL;
        }
}
EXPORT_SYMBOL_GPL(blk_mq_sched_free_hctx_data);

void blk_mq_sched_assign_ioc(struct request *rq, struct bio *bio)
{
        struct request_queue *q = rq->q;
        struct io_context *ioc = rq_ioc(bio);
        struct io_cq *icq;

        spin_lock_irq(q->queue_lock);
        icq = ioc_lookup_icq(ioc, q);
        spin_unlock_irq(q->queue_lock);

        if (!icq) {
                icq = ioc_create_icq(ioc, q, GFP_ATOMIC);
                if (!icq)
                        return;
        }
        get_io_context(icq->ioc);
        rq->elv.icq = icq;
}

/*
 * Mark a hardware queue as needing a restart. For shared queues, maintain
 * a count of how many hardware queues are marked for restart.
 */
static void blk_mq_sched_mark_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
        if (test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                return;

        if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
                struct request_queue *q = hctx->queue;

                if (!test_and_set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                        atomic_inc(&q->shared_hctx_restart);
        } else
                set_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);
}

static bool blk_mq_sched_restart_hctx(struct blk_mq_hw_ctx *hctx)
{
        if (!test_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                return false;

        if (hctx->flags & BLK_MQ_F_TAG_SHARED) {
                struct request_queue *q = hctx->queue;

                if (test_and_clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state))
                        atomic_dec(&q->shared_hctx_restart);
        } else
                clear_bit(BLK_MQ_S_SCHED_RESTART, &hctx->state);

        if (blk_mq_hctx_has_pending(hctx)) {
                blk_mq_run_hw_queue(hctx, true);
                return true;
        }

        return false;
}

void blk_mq_sched_dispatch_requests(struct blk_mq_hw_ctx *hctx)
{
        struct request_queue *q = hctx->queue;
        struct elevator_queue *e = q->elevator;
        const bool has_sched_dispatch = e && e->type->ops.mq.dispatch_request;
        bool did_work = false;
        LIST_HEAD(rq_list);

        /* RCU or SRCU read lock is needed before checking quiesced flag */
        if (unlikely(blk_mq_hctx_stopped(hctx) || blk_queue_quiesced(q)))
                return;

        hctx->run++;

        /*
         * If we have previous entries on our dispatch list, grab them first
         * for more fair dispatch.
         */
        if (!list_empty_careful(&hctx->dispatch)) {
                spin_lock(&hctx->lock);
                if (!list_empty(&hctx->dispatch))
                        list_splice_init(&hctx->dispatch, &rq_list);
                spin_unlock(&hctx->lock);
        }

        /*
         * Only ask the scheduler for requests, if we didn't have residual
         * requests from the dispatch list. This is to avoid the case where
         * we only ever dispatch a fraction of the requests available because
         * of low device queue depth. Once we pull requests out of the IO
         * scheduler, we can no longer merge or sort them. So it's best to
         * leave them there for as long as we can. Mark the hw queue as
         * needing a restart in that case.
         */
        if (!list_empty(&rq_list)) {
                blk_mq_sched_mark_restart_hctx(hctx);
                did_work = blk_mq_dispatch_rq_list(q, &rq_list);
        } else if (!has_sched_dispatch) {
                blk_mq_flush_busy_ctxs(hctx, &rq_list);
                blk_mq_dispatch_rq_list(q, &rq_list);
        }

        /*
         * We want to dispatch from the scheduler if we had no work left
         * on the dispatch list, OR if we did have work but weren't able
         * to make progress.
         */
        if (!did_work && has_sched_dispatch) {
                do {
                        struct request *rq;

                        rq = e->type->ops.mq.dispatch_request(hctx);
                        if (!rq)
                                break;
                        list_add(&rq->queuelist, &rq_list);
                } while (blk_mq_dispatch_rq_list(q, &rq_list));
        }
}
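
/*
 * For reference, a minimal sketch of the elevator side of the dispatch loop
 * above. It is not part of this file; "my_sched_data" and its fifo_list are
 * hypothetical names, and real schedulers (mq-deadline, kyber, bfq) keep far
 * more state. The only contract that matters here is: return one request, or
 * NULL when the scheduler has nothing left to dispatch.
 *
 *      static struct request *my_dispatch_request(struct blk_mq_hw_ctx *hctx)
 *      {
 *              struct my_sched_data *sd = hctx->queue->elevator->elevator_data;
 *              struct request *rq;
 *
 *              spin_lock(&sd->lock);
 *              rq = list_first_entry_or_null(&sd->fifo_list,
 *                                            struct request, queuelist);
 *              if (rq)
 *                      list_del_init(&rq->queuelist);
 *              spin_unlock(&sd->lock);
 *
 *              return rq;
 *      }
 */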

bool blk_mq_sched_try_merge(struct request_queue *q, struct bio *bio,
                            struct request **merged_request)
{
        struct request *rq;

        switch (elv_merge(q, &rq, bio)) {
        case ELEVATOR_BACK_MERGE:
                if (!blk_mq_sched_allow_merge(q, rq, bio))
                        return false;
                if (!bio_attempt_back_merge(q, rq, bio))
                        return false;
                *merged_request = attempt_back_merge(q, rq);
                if (!*merged_request)
                        elv_merged_request(q, rq, ELEVATOR_BACK_MERGE);
                return true;
        case ELEVATOR_FRONT_MERGE:
                if (!blk_mq_sched_allow_merge(q, rq, bio))
                        return false;
                if (!bio_attempt_front_merge(q, rq, bio))
                        return false;
                *merged_request = attempt_front_merge(q, rq);
                if (!*merged_request)
                        elv_merged_request(q, rq, ELEVATOR_FRONT_MERGE);
                return true;
        default:
                return false;
        }
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_merge);
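
/*
 * A hedged sketch of how an elevator's ->bio_merge() hook would typically
 * drive the helper above (modelled on what mq-deadline does; "my_sched_data"
 * and its lock are illustrative). If a back/front merge made a further
 * request-request merge possible, the request handed back through
 * @merged_request has already been unhooked from the scheduler and the
 * caller is responsible for freeing it.
 *
 *      static bool my_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
 *      {
 *              struct request_queue *q = hctx->queue;
 *              struct my_sched_data *sd = q->elevator->elevator_data;
 *              struct request *free = NULL;
 *              bool ret;
 *
 *              spin_lock(&sd->lock);
 *              ret = blk_mq_sched_try_merge(q, bio, &free);
 *              spin_unlock(&sd->lock);
 *
 *              if (free)
 *                      blk_mq_free_request(free);
 *
 *              return ret;
 *      }
 */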

/*
 * Reverse check our software queue for entries that we could potentially
 * merge with. Currently includes a hand-wavy stop count of 8, to not spend
 * too much time checking for merges.
 */
static bool blk_mq_attempt_merge(struct request_queue *q,
                                 struct blk_mq_ctx *ctx, struct bio *bio)
{
        struct request *rq;
        int checked = 8;

        lockdep_assert_held(&ctx->lock);

        list_for_each_entry_reverse(rq, &ctx->rq_list, queuelist) {
                bool merged = false;

                if (!checked--)
                        break;

                if (!blk_rq_merge_ok(rq, bio))
                        continue;

                switch (blk_try_merge(rq, bio)) {
                case ELEVATOR_BACK_MERGE:
                        if (blk_mq_sched_allow_merge(q, rq, bio))
                                merged = bio_attempt_back_merge(q, rq, bio);
                        break;
                case ELEVATOR_FRONT_MERGE:
                        if (blk_mq_sched_allow_merge(q, rq, bio))
                                merged = bio_attempt_front_merge(q, rq, bio);
                        break;
                case ELEVATOR_DISCARD_MERGE:
                        merged = bio_attempt_discard_merge(q, rq, bio);
                        break;
                default:
                        continue;
                }

                if (merged)
                        ctx->rq_merged++;
                return merged;
        }

        return false;
}

bool __blk_mq_sched_bio_merge(struct request_queue *q, struct bio *bio)
{
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = blk_mq_get_ctx(q);
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
        bool ret = false;

        if (e && e->type->ops.mq.bio_merge) {
                blk_mq_put_ctx(ctx);
                return e->type->ops.mq.bio_merge(hctx, bio);
        }

        if (hctx->flags & BLK_MQ_F_SHOULD_MERGE) {
                /* default per sw-queue merge */
                spin_lock(&ctx->lock);
                ret = blk_mq_attempt_merge(q, ctx, bio);
                spin_unlock(&ctx->lock);
        }

        blk_mq_put_ctx(ctx);
        return ret;
}

bool blk_mq_sched_try_insert_merge(struct request_queue *q, struct request *rq)
{
        return rq_mergeable(rq) && elv_attempt_insert_merge(q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_try_insert_merge);

void blk_mq_sched_request_inserted(struct request *rq)
{
        trace_block_rq_insert(rq->q, rq);
}
EXPORT_SYMBOL_GPL(blk_mq_sched_request_inserted);
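
/*
 * The two exported helpers above are normally used together from an
 * elevator's ->insert_requests() hook: try an insertion merge first, and only
 * trace and queue the request if it survived. A hedged sketch with a
 * hypothetical "my_sched_data"; real schedulers also honour at_head, flush
 * rules and per-direction lists.
 *
 *      static void my_insert_requests(struct blk_mq_hw_ctx *hctx,
 *                                     struct list_head *list, bool at_head)
 *      {
 *              struct my_sched_data *sd = hctx->queue->elevator->elevator_data;
 *
 *              spin_lock(&sd->lock);
 *              while (!list_empty(list)) {
 *                      struct request *rq;
 *
 *                      rq = list_first_entry(list, struct request, queuelist);
 *                      list_del_init(&rq->queuelist);
 *                      if (blk_mq_sched_try_insert_merge(rq->q, rq))
 *                              continue;
 *                      blk_mq_sched_request_inserted(rq);
 *                      list_add_tail(&rq->queuelist, &sd->fifo_list);
 *              }
 *              spin_unlock(&sd->lock);
 *      }
 */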

static bool blk_mq_sched_bypass_insert(struct blk_mq_hw_ctx *hctx,
                                       struct request *rq)
{
        if (rq->tag == -1) {
                rq->rq_flags |= RQF_SORTED;
                return false;
        }

        /*
         * If we already have a real request tag, send directly to
         * the dispatch list.
         */
        spin_lock(&hctx->lock);
        list_add(&rq->queuelist, &hctx->dispatch);
        spin_unlock(&hctx->lock);
        return true;
}

/**
 * list_for_each_entry_rcu_rr - iterate in a round-robin fashion over rcu list
 * @pos: loop cursor.
 * @skip: the list element that will not be examined. Iteration starts at
 *        @skip->next.
 * @head: head of the list to examine. This list must have at least one
 *        element, namely @skip.
 * @member: name of the list_head structure within typeof(*pos).
 */
#define list_for_each_entry_rcu_rr(pos, skip, head, member)            \
        for ((pos) = (skip);                                            \
             (pos = (pos)->member.next != (head) ? list_entry_rcu(      \
                        (pos)->member.next, typeof(*pos), member) :     \
                list_entry_rcu((pos)->member.next->next, typeof(*pos), member)), \
             (pos) != (skip); )

/*
 * Called after a driver tag has been freed to check whether a hctx needs to
 * be restarted. Restarts @hctx if its tag set is not shared. Restarts hardware
 * queues in a round-robin fashion if the tag set of @hctx is shared with other
 * hardware queues.
 */
void blk_mq_sched_restart(struct blk_mq_hw_ctx *const hctx)
{
        struct blk_mq_tags *const tags = hctx->tags;
        struct blk_mq_tag_set *const set = hctx->queue->tag_set;
        struct request_queue *const queue = hctx->queue, *q;
        struct blk_mq_hw_ctx *hctx2;
        unsigned int i, j;

        if (set->flags & BLK_MQ_F_TAG_SHARED) {
                /*
                 * If this is 0, then we know that no hardware queues
                 * have RESTART marked. We're done.
                 */
                if (!atomic_read(&queue->shared_hctx_restart))
                        return;

                rcu_read_lock();
                list_for_each_entry_rcu_rr(q, queue, &set->tag_list,
                                           tag_set_list) {
                        queue_for_each_hw_ctx(q, hctx2, i)
                                if (hctx2->tags == tags &&
                                    blk_mq_sched_restart_hctx(hctx2))
                                        goto done;
                }
                j = hctx->queue_num + 1;
                for (i = 0; i < queue->nr_hw_queues; i++, j++) {
                        if (j == queue->nr_hw_queues)
                                j = 0;
                        hctx2 = queue->queue_hw_ctx[j];
                        if (hctx2->tags == tags &&
                            blk_mq_sched_restart_hctx(hctx2))
                                break;
                }
done:
                rcu_read_unlock();
        } else {
                blk_mq_sched_restart_hctx(hctx);
        }
}

/*
 * Add flush/fua to the queue. If we fail getting a driver tag, then
 * punt to the requeue list. Requeue will re-invoke us from a context
 * that's safe to block from.
 */
static void blk_mq_sched_insert_flush(struct blk_mq_hw_ctx *hctx,
                                      struct request *rq, bool can_block)
{
        if (blk_mq_get_driver_tag(rq, &hctx, can_block)) {
                blk_insert_flush(rq);
                blk_mq_run_hw_queue(hctx, true);
        } else
                blk_mq_add_to_requeue_list(rq, false, true);
}

void blk_mq_sched_insert_request(struct request *rq, bool at_head,
                                 bool run_queue, bool async, bool can_block)
{
        struct request_queue *q = rq->q;
        struct elevator_queue *e = q->elevator;
        struct blk_mq_ctx *ctx = rq->mq_ctx;
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);

        if (rq->tag == -1 && op_is_flush(rq->cmd_flags)) {
                blk_mq_sched_insert_flush(hctx, rq, can_block);
                return;
        }

        if (e && blk_mq_sched_bypass_insert(hctx, rq))
                goto run;

        if (e && e->type->ops.mq.insert_requests) {
                LIST_HEAD(list);

                list_add(&rq->queuelist, &list);
                e->type->ops.mq.insert_requests(hctx, &list, at_head);
        } else {
                spin_lock(&ctx->lock);
                __blk_mq_insert_request(hctx, rq, at_head);
                spin_unlock(&ctx->lock);
        }

run:
        if (run_queue)
                blk_mq_run_hw_queue(hctx, async);
}
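
/*
 * The four bool arguments above are easy to misread at call sites. Purely as
 * an illustration, inserting a request at the tail and then kicking the
 * hardware queue synchronously would be:
 *
 *      blk_mq_sched_insert_request(rq, false, true, false, false);
 *
 * i.e. at_head = false, run_queue = true, async = false, can_block = false.
 * Which combination is appropriate depends entirely on the caller's context.
 */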

void blk_mq_sched_insert_requests(struct request_queue *q,
                                  struct blk_mq_ctx *ctx,
                                  struct list_head *list, bool run_queue_async)
{
        struct blk_mq_hw_ctx *hctx = blk_mq_map_queue(q, ctx->cpu);
        struct elevator_queue *e = hctx->queue->elevator;

        if (e) {
                struct request *rq, *next;

                /*
                 * We bypass requests that already have a driver tag assigned,
                 * which should only be flushes. Flushes are only ever inserted
                 * as single requests, so we shouldn't ever hit the
                 * WARN_ON_ONCE() below (but let's handle it just in case).
                 */
                list_for_each_entry_safe(rq, next, list, queuelist) {
                        if (WARN_ON_ONCE(rq->tag != -1)) {
                                list_del_init(&rq->queuelist);
                                blk_mq_sched_bypass_insert(hctx, rq);
                        }
                }
        }

        if (e && e->type->ops.mq.insert_requests)
                e->type->ops.mq.insert_requests(hctx, list, false);
        else
                blk_mq_insert_requests(hctx, ctx, list);

        blk_mq_run_hw_queue(hctx, run_queue_async);
}

static void blk_mq_sched_free_tags(struct blk_mq_tag_set *set,
                                   struct blk_mq_hw_ctx *hctx,
                                   unsigned int hctx_idx)
{
        if (hctx->sched_tags) {
                blk_mq_free_rqs(set, hctx->sched_tags, hctx_idx);
                blk_mq_free_rq_map(hctx->sched_tags);
                hctx->sched_tags = NULL;
        }
}

static int blk_mq_sched_alloc_tags(struct request_queue *q,
                                   struct blk_mq_hw_ctx *hctx,
                                   unsigned int hctx_idx)
{
        struct blk_mq_tag_set *set = q->tag_set;
        int ret;

        hctx->sched_tags = blk_mq_alloc_rq_map(set, hctx_idx, q->nr_requests,
                                               set->reserved_tags);
        if (!hctx->sched_tags)
                return -ENOMEM;

        ret = blk_mq_alloc_rqs(set, hctx->sched_tags, hctx_idx, q->nr_requests);
        if (ret)
                blk_mq_sched_free_tags(set, hctx, hctx_idx);

        return ret;
}

static void blk_mq_sched_tags_teardown(struct request_queue *q)
{
        struct blk_mq_tag_set *set = q->tag_set;
        struct blk_mq_hw_ctx *hctx;
        int i;

        queue_for_each_hw_ctx(q, hctx, i)
                blk_mq_sched_free_tags(set, hctx, i);
}

int blk_mq_sched_init_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                           unsigned int hctx_idx)
{
        struct elevator_queue *e = q->elevator;
        int ret;

        if (!e)
                return 0;

        ret = blk_mq_sched_alloc_tags(q, hctx, hctx_idx);
        if (ret)
                return ret;

        if (e->type->ops.mq.init_hctx) {
                ret = e->type->ops.mq.init_hctx(hctx, hctx_idx);
                if (ret) {
                        blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
                        return ret;
                }
        }

        blk_mq_debugfs_register_sched_hctx(q, hctx);

        return 0;
}

void blk_mq_sched_exit_hctx(struct request_queue *q, struct blk_mq_hw_ctx *hctx,
                            unsigned int hctx_idx)
{
        struct elevator_queue *e = q->elevator;

        if (!e)
                return;

        blk_mq_debugfs_unregister_sched_hctx(hctx);

        if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
                e->type->ops.mq.exit_hctx(hctx, hctx_idx);
                hctx->sched_data = NULL;
        }

        blk_mq_sched_free_tags(q->tag_set, hctx, hctx_idx);
}
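
/*
 * The hctx->sched_data pointer managed above is entirely owned by the
 * elevator. As an illustrative sketch (the "my_hctx_data" type and its fields
 * are hypothetical), a scheduler keeping per-hardware-queue state would pair
 * the two hooks like so:
 *
 *      static int my_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int idx)
 *      {
 *              struct my_hctx_data *data;
 *
 *              data = kzalloc_node(sizeof(*data), GFP_KERNEL, hctx->numa_node);
 *              if (!data)
 *                      return -ENOMEM;
 *
 *              spin_lock_init(&data->lock);
 *              INIT_LIST_HEAD(&data->rq_list);
 *              hctx->sched_data = data;
 *              return 0;
 *      }
 *
 *      static void my_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int idx)
 *      {
 *              kfree(hctx->sched_data);
 *              hctx->sched_data = NULL;
 *      }
 */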

int blk_mq_init_sched(struct request_queue *q, struct elevator_type *e)
{
        struct blk_mq_hw_ctx *hctx;
        struct elevator_queue *eq;
        unsigned int i;
        int ret;

        if (!e) {
                q->elevator = NULL;
                return 0;
        }

        /*
         * Default to double of the smaller of the hw queue depth and 128,
         * since we no longer split into sync/async pools like the legacy
         * code did. Note that this is a per-hw-queue depth: e.g. a hw queue
         * depth of 64 yields 128 scheduler requests per hw queue, while any
         * depth of BLKDEV_MAX_RQ (128) or more is capped at 256.
         */
        q->nr_requests = 2 * min_t(unsigned int, q->tag_set->queue_depth,
                                   BLKDEV_MAX_RQ);

        queue_for_each_hw_ctx(q, hctx, i) {
                ret = blk_mq_sched_alloc_tags(q, hctx, i);
                if (ret)
                        goto err;
        }

        ret = e->ops.mq.init_sched(q, e);
        if (ret)
                goto err;

        blk_mq_debugfs_register_sched(q);

        queue_for_each_hw_ctx(q, hctx, i) {
                if (e->ops.mq.init_hctx) {
                        ret = e->ops.mq.init_hctx(hctx, i);
                        if (ret) {
                                eq = q->elevator;
                                blk_mq_exit_sched(q, eq);
                                kobject_put(&eq->kobj);
                                return ret;
                        }
                }
                blk_mq_debugfs_register_sched_hctx(q, hctx);
        }

        return 0;

err:
        blk_mq_sched_tags_teardown(q);
        q->elevator = NULL;
        return ret;
}

void blk_mq_exit_sched(struct request_queue *q, struct elevator_queue *e)
{
        struct blk_mq_hw_ctx *hctx;
        unsigned int i;

        queue_for_each_hw_ctx(q, hctx, i) {
                blk_mq_debugfs_unregister_sched_hctx(hctx);
                if (e->type->ops.mq.exit_hctx && hctx->sched_data) {
                        e->type->ops.mq.exit_hctx(hctx, i);
                        hctx->sched_data = NULL;
                }
        }
        blk_mq_debugfs_unregister_sched(q);
        if (e->type->ops.mq.exit_sched)
                e->type->ops.mq.exit_sched(e);
        blk_mq_sched_tags_teardown(q);
        q->elevator = NULL;
}

int blk_mq_sched_init(struct request_queue *q)
{
        int ret;

        mutex_lock(&q->sysfs_lock);
        ret = elevator_init(q, NULL);
        mutex_unlock(&q->sysfs_lock);

        return ret;
}
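
/*
 * Putting it together: the framework above only ever calls into an elevator
 * through e->type->ops.mq. A scheduler module therefore wires its hooks up
 * roughly as below and registers them with elv_register(). This is a hedged
 * sketch: the hook names match the ones this file invokes, but the my_*
 * helper functions are hypothetical and field layout may differ between
 * kernel versions.
 *
 *      static struct elevator_type my_sched = {
 *              .ops.mq = {
 *                      .init_sched             = my_init_sched,
 *                      .exit_sched             = my_exit_sched,
 *                      .init_hctx              = my_init_hctx,
 *                      .exit_hctx              = my_exit_hctx,
 *                      .bio_merge              = my_bio_merge,
 *                      .insert_requests        = my_insert_requests,
 *                      .dispatch_request       = my_dispatch_request,
 *              },
 *              .uses_mq        = true,
 *              .elevator_name  = "my-sched",
 *              .elevator_owner = THIS_MODULE,
 *      };
 *
 *      static int __init my_sched_module_init(void)
 *      {
 *              return elv_register(&my_sched);
 *      }
 */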