// SPDX-License-Identifier: GPL-2.0
/*
 *  MQ Deadline i/o scheduler - adaptation of the legacy deadline scheduler,
 *  for the blk-mq scheduling framework
 *
 *  Copyright (C) 2016 Jens Axboe <axboe@kernel.dk>
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/rbtree.h>
#include <linux/sbitmap.h>

#include <trace/events/block.h>

#include "elevator.h"
#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-tag.h"
#include "blk-mq-sched.h"

/*
 * See Documentation/block/deadline-iosched.rst
 */
static const int read_expire = HZ / 2;  /* max time before a read is submitted. */
static const int write_expire = 5 * HZ; /* ditto for writes, these limits are SOFT! */
/*
 * Time after which to dispatch lower priority requests even if higher
 * priority requests are pending.
 */
static const int prio_aging_expire = 10 * HZ;
static const int writes_starved = 2;    /* max times reads can starve a write */
static const int fifo_batch = 16;       /* # of sequential requests treated as one
					   by the above parameters. For throughput. */

enum dd_data_dir {
	DD_READ		= READ,
	DD_WRITE	= WRITE,
};

enum { DD_DIR_COUNT = 2 };

enum dd_prio {
	DD_RT_PRIO	= 0,
	DD_BE_PRIO	= 1,
	DD_IDLE_PRIO	= 2,
	DD_PRIO_MAX	= 2,
};

enum { DD_PRIO_COUNT = 3 };

/*
 * I/O statistics per I/O priority. It is fine if these counters overflow.
 * What matters is that these counters are at least as wide as
 * log2(max_outstanding_requests).
 */
struct io_stats_per_prio {
	uint32_t inserted;
	uint32_t merged;
	uint32_t dispatched;
	atomic_t completed;
};

/*
 * Deadline scheduler data per I/O priority (enum dd_prio). Requests are
 * present on both sort_list[] and fifo_list[].
 */
struct dd_per_prio {
	struct list_head dispatch;
	struct rb_root sort_list[DD_DIR_COUNT];
	struct list_head fifo_list[DD_DIR_COUNT];
	/* Next request in FIFO order. Read, write or both are NULL. */
	struct request *next_rq[DD_DIR_COUNT];
	struct io_stats_per_prio stats;
};

struct deadline_data {
	/*
	 * run time data
	 */

	struct dd_per_prio per_prio[DD_PRIO_COUNT];

	/* Data direction of latest dispatched request. */
	enum dd_data_dir last_dir;
	unsigned int batching;		/* number of sequential requests made */
	unsigned int starved;		/* times reads have starved writes */

	/*
	 * settings that change how the i/o scheduler behaves
	 */
	int fifo_expire[DD_DIR_COUNT];
	int fifo_batch;
	int writes_starved;
	int front_merges;
	u32 async_depth;
	int prio_aging_expire;

	spinlock_t lock;
	spinlock_t zone_lock;
};

/* Maps an I/O priority class to a deadline scheduler priority. */
static const enum dd_prio ioprio_class_to_prio[] = {
	[IOPRIO_CLASS_NONE]	= DD_BE_PRIO,
	[IOPRIO_CLASS_RT]	= DD_RT_PRIO,
	[IOPRIO_CLASS_BE]	= DD_BE_PRIO,
	[IOPRIO_CLASS_IDLE]	= DD_IDLE_PRIO,
};

static inline struct rb_root *
deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq)
{
	return &per_prio->sort_list[rq_data_dir(rq)];
}

/*
 * Returns the I/O priority class (IOPRIO_CLASS_*) that has been assigned to a
 * request.
 */
static u8 dd_rq_ioclass(struct request *rq)
{
	return IOPRIO_PRIO_CLASS(req_get_ioprio(rq));
}

/*
 * get the request before `rq' in sector-sorted order
 */
static inline struct request *
deadline_earlier_request(struct request *rq)
{
	struct rb_node *node = rb_prev(&rq->rb_node);

	if (node)
		return rb_entry_rq(node);

	return NULL;
}

/*
 * get the request after `rq' in sector-sorted order
 */
static inline struct request *
deadline_latter_request(struct request *rq)
{
	struct rb_node *node = rb_next(&rq->rb_node);

	if (node)
		return rb_entry_rq(node);

	return NULL;
}

static void
deadline_add_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
	struct rb_root *root = deadline_rb_root(per_prio, rq);

	elv_rb_add(root, rq);
}

static inline void
deadline_del_rq_rb(struct dd_per_prio *per_prio, struct request *rq)
{
	const enum dd_data_dir data_dir = rq_data_dir(rq);

	if (per_prio->next_rq[data_dir] == rq)
		per_prio->next_rq[data_dir] = deadline_latter_request(rq);

	elv_rb_del(deadline_rb_root(per_prio, rq), rq);
}

/*
 * remove rq from rbtree and fifo.
 */
static void deadline_remove_request(struct request_queue *q,
				    struct dd_per_prio *per_prio,
				    struct request *rq)
{
	list_del_init(&rq->queuelist);

	/*
	 * We might not be on the rbtree, if we are doing an insert merge
	 */
	if (!RB_EMPTY_NODE(&rq->rb_node))
		deadline_del_rq_rb(per_prio, rq);

	elv_rqhash_del(q, rq);
	if (q->last_merge == rq)
		q->last_merge = NULL;
}

static void dd_request_merged(struct request_queue *q, struct request *req,
			      enum elv_merge type)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const u8 ioprio_class = dd_rq_ioclass(req);
	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
	struct dd_per_prio *per_prio = &dd->per_prio[prio];

	/*
	 * if the merge was a front merge, we need to reposition request
	 */
	if (type == ELEVATOR_FRONT_MERGE) {
		elv_rb_del(deadline_rb_root(per_prio, req), req);
		deadline_add_rq_rb(per_prio, req);
	}
}

/*
 * Callback function that is invoked after @next has been merged into @req.
 */
static void dd_merged_requests(struct request_queue *q, struct request *req,
			       struct request *next)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const u8 ioprio_class = dd_rq_ioclass(next);
	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];

	lockdep_assert_held(&dd->lock);

	dd->per_prio[prio].stats.merged++;

	/*
	 * if next expires before rq, assign its expire time to rq
	 * and move into next position (next will be deleted) in fifo
	 */
	if (!list_empty(&req->queuelist) && !list_empty(&next->queuelist)) {
		if (time_before((unsigned long)next->fifo_time,
				(unsigned long)req->fifo_time)) {
			list_move(&req->queuelist, &next->queuelist);
			req->fifo_time = next->fifo_time;
		}
	}

	/*
	 * kill knowledge of next, this one is a goner
	 */
	deadline_remove_request(q, &dd->per_prio[prio], next);
}

/*
 * move an entry to dispatch queue
 */
static void
deadline_move_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
		      struct request *rq)
{
	const enum dd_data_dir data_dir = rq_data_dir(rq);

	per_prio->next_rq[data_dir] = deadline_latter_request(rq);

	/*
	 * take it off the sort and fifo list
	 */
	deadline_remove_request(rq->q, per_prio, rq);
}

/* Number of requests queued for a given priority level. */
static u32 dd_queued(struct deadline_data *dd, enum dd_prio prio)
{
	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;

	lockdep_assert_held(&dd->lock);

	return stats->inserted - atomic_read(&stats->completed);
}

/*
 * deadline_check_fifo returns 0 if there are no expired requests on the fifo,
 * 1 otherwise. Requires !list_empty(&dd->fifo_list[data_dir])
 */
static inline int deadline_check_fifo(struct dd_per_prio *per_prio,
				      enum dd_data_dir data_dir)
{
	struct request *rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);

	/*
	 * rq is expired!
	 */
	if (time_after_eq(jiffies, (unsigned long)rq->fifo_time))
		return 1;

	return 0;
}

/*
 * Check if rq has a sequential request preceding it.
 */
static bool deadline_is_seq_write(struct deadline_data *dd, struct request *rq)
{
	struct request *prev = deadline_earlier_request(rq);

	if (!prev)
		return false;

	return blk_rq_pos(prev) + blk_rq_sectors(prev) == blk_rq_pos(rq);
}

/*
 * Skip all write requests that are sequential from @rq, even if we cross
 * a zone boundary.
 */
static struct request *deadline_skip_seq_writes(struct deadline_data *dd,
						struct request *rq)
{
	sector_t pos = blk_rq_pos(rq);
	sector_t skipped_sectors = 0;

	while (rq) {
		if (blk_rq_pos(rq) != pos + skipped_sectors)
			break;
		skipped_sectors += blk_rq_sectors(rq);
		rq = deadline_latter_request(rq);
	}

	return rq;
}

/*
 * For the specified data direction, return the next request to
 * dispatch using arrival ordered lists.
 */
static struct request *
deadline_fifo_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
		      enum dd_data_dir data_dir)
{
	struct request *rq;
	unsigned long flags;

	if (list_empty(&per_prio->fifo_list[data_dir]))
		return NULL;

	rq = rq_entry_fifo(per_prio->fifo_list[data_dir].next);
	if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
		return rq;

	/*
	 * Look for a write request that can be dispatched, that is one with
	 * an unlocked target zone. For some HDDs, breaking a sequential
	 * write stream can lead to lower throughput, so make sure to preserve
	 * sequential write streams, even if that stream crosses into the next
	 * zones and these zones are unlocked.
	 */
	spin_lock_irqsave(&dd->zone_lock, flags);
	list_for_each_entry(rq, &per_prio->fifo_list[DD_WRITE], queuelist) {
		if (blk_req_can_dispatch_to_zone(rq) &&
		    (blk_queue_nonrot(rq->q) ||
		     !deadline_is_seq_write(dd, rq)))
			goto out;
	}
	rq = NULL;
out:
	spin_unlock_irqrestore(&dd->zone_lock, flags);

	return rq;
}

/*
 * For the specified data direction, return the next request to
 * dispatch using sector position sorted lists.
 */
static struct request *
deadline_next_request(struct deadline_data *dd, struct dd_per_prio *per_prio,
		      enum dd_data_dir data_dir)
{
	struct request *rq;
	unsigned long flags;

	rq = per_prio->next_rq[data_dir];
	if (!rq)
		return NULL;

	if (data_dir == DD_READ || !blk_queue_is_zoned(rq->q))
		return rq;

	/*
	 * Look for a write request that can be dispatched, that is one with
	 * an unlocked target zone. For some HDDs, breaking a sequential
	 * write stream can lead to lower throughput, so make sure to preserve
	 * sequential write streams, even if that stream crosses into the next
	 * zones and these zones are unlocked.
	 */
	spin_lock_irqsave(&dd->zone_lock, flags);
	while (rq) {
		if (blk_req_can_dispatch_to_zone(rq))
			break;
		if (blk_queue_nonrot(rq->q))
			rq = deadline_latter_request(rq);
		else
			rq = deadline_skip_seq_writes(dd, rq);
	}
	spin_unlock_irqrestore(&dd->zone_lock, flags);

	return rq;
}

/*
 * Returns true if and only if @rq started after @latest_start where
 * @latest_start is in jiffies.
 */
static bool started_after(struct deadline_data *dd, struct request *rq,
			  unsigned long latest_start)
{
	unsigned long start_time = (unsigned long)rq->fifo_time;

	start_time -= dd->fifo_expire[rq_data_dir(rq)];

	return time_after(start_time, latest_start);
}

/*
 * deadline_dispatch_requests selects the best request according to
 * read/write expire, fifo_batch, etc and with a start time <= @latest_start.
 */
static struct request *__dd_dispatch_request(struct deadline_data *dd,
					     struct dd_per_prio *per_prio,
					     unsigned long latest_start)
{
	struct request *rq, *next_rq;
	enum dd_data_dir data_dir;
	enum dd_prio prio;
	u8 ioprio_class;

	lockdep_assert_held(&dd->lock);

	if (!list_empty(&per_prio->dispatch)) {
		rq = list_first_entry(&per_prio->dispatch, struct request,
				      queuelist);
		if (started_after(dd, rq, latest_start))
			return NULL;
		list_del_init(&rq->queuelist);
		goto done;
	}

	/*
	 * batches are currently reads XOR writes
	 */
	rq = deadline_next_request(dd, per_prio, dd->last_dir);
	if (rq && dd->batching < dd->fifo_batch)
		/* we have a next request and are still entitled to batch */
		goto dispatch_request;

	/*
	 * at this point we are not running a batch. select the appropriate
	 * data direction (read / write)
	 */

	if (!list_empty(&per_prio->fifo_list[DD_READ])) {
		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_READ]));

		if (deadline_fifo_request(dd, per_prio, DD_WRITE) &&
		    (dd->starved++ >= dd->writes_starved))
			goto dispatch_writes;

		data_dir = DD_READ;

		goto dispatch_find_request;
	}

	/*
	 * there are either no reads or writes have been starved
	 */

	if (!list_empty(&per_prio->fifo_list[DD_WRITE])) {
dispatch_writes:
		BUG_ON(RB_EMPTY_ROOT(&per_prio->sort_list[DD_WRITE]));

		dd->starved = 0;

		data_dir = DD_WRITE;

		goto dispatch_find_request;
	}

	return NULL;

dispatch_find_request:
	/*
	 * we are not running a batch, find best request for selected data_dir
	 */
	next_rq = deadline_next_request(dd, per_prio, data_dir);
	if (deadline_check_fifo(per_prio, data_dir) || !next_rq) {
		/*
		 * A deadline has expired, the last request was in the other
		 * direction, or we have run out of higher-sectored requests.
		 * Start again from the request with the earliest expiry time.
		 */
		rq = deadline_fifo_request(dd, per_prio, data_dir);
	} else {
		/*
		 * The last req was the same dir and we have a next request in
		 * sort order. No expired requests so continue on from here.
		 */
		rq = next_rq;
	}

	/*
	 * For a zoned block device, if we only have writes queued and none of
	 * them can be dispatched, rq will be NULL.
	 */
	if (!rq)
		return NULL;

	dd->last_dir = data_dir;
	dd->batching = 0;

dispatch_request:
	if (started_after(dd, rq, latest_start))
		return NULL;

	/*
	 * rq is the selected appropriate request.
	 */
	dd->batching++;
	deadline_move_request(dd, per_prio, rq);
done:
	ioprio_class = dd_rq_ioclass(rq);
	prio = ioprio_class_to_prio[ioprio_class];
	dd->per_prio[prio].stats.dispatched++;
	/*
	 * If the request needs its target zone locked, do it.
	 */
	blk_req_zone_write_lock(rq);
	rq->rq_flags |= RQF_STARTED;
	return rq;
}

/*
 * Check whether there are any requests with priority other than DD_RT_PRIO
 * that were inserted more than prio_aging_expire jiffies ago.
 */
static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd,
						      unsigned long now)
{
	struct request *rq;
	enum dd_prio prio;
	int prio_cnt;

	lockdep_assert_held(&dd->lock);

	prio_cnt = !!dd_queued(dd, DD_RT_PRIO) + !!dd_queued(dd, DD_BE_PRIO) +
		   !!dd_queued(dd, DD_IDLE_PRIO);
	if (prio_cnt < 2)
		return NULL;

	for (prio = DD_BE_PRIO; prio <= DD_PRIO_MAX; prio++) {
		rq = __dd_dispatch_request(dd, &dd->per_prio[prio],
					   now - dd->prio_aging_expire);
		if (rq)
			return rq;
	}

	return NULL;
}

/*
 * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests().
 *
 * One confusing aspect here is that we get called for a specific
 * hardware queue, but we may return a request that is for a
 * different hardware queue. This is because mq-deadline has shared
 * state for all hardware queues, in terms of sorting, FIFOs, etc.
 */
static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
	const unsigned long now = jiffies;
	struct request *rq;
	enum dd_prio prio;

	spin_lock(&dd->lock);
	rq = dd_dispatch_prio_aged_requests(dd, now);
	if (rq)
		goto unlock;

	/*
	 * Next, dispatch requests in priority order. Ignore lower priority
	 * requests if any higher priority requests are pending.
	 */
	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
		rq = __dd_dispatch_request(dd, &dd->per_prio[prio], now);
		if (rq || dd_queued(dd, prio))
			break;
	}

unlock:
	spin_unlock(&dd->lock);

	return rq;
}

/*
 * Called by __blk_mq_alloc_request(). The shallow_depth value set by this
 * function is used by __blk_mq_get_tag().
 */
static void dd_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data)
{
	struct deadline_data *dd = data->q->elevator->elevator_data;

	/* Do not throttle synchronous reads. */
	if (op_is_sync(opf) && !op_is_write(opf))
		return;

	/*
	 * Throttle asynchronous requests and writes such that these requests
	 * do not block the allocation of synchronous requests.
	 */
	data->shallow_depth = dd->async_depth;
}

/* Called by blk_mq_update_nr_requests(). */
static void dd_depth_updated(struct blk_mq_hw_ctx *hctx)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *dd = q->elevator->elevator_data;
	struct blk_mq_tags *tags = hctx->sched_tags;

	dd->async_depth = max(1UL, 3 * q->nr_requests / 4);

	sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, dd->async_depth);
}

/* Called by blk_mq_init_hctx() and blk_mq_init_sched(). */
static int dd_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
	dd_depth_updated(hctx);
	return 0;
}

static void dd_exit_sched(struct elevator_queue *e)
{
	struct deadline_data *dd = e->elevator_data;
	enum dd_prio prio;

	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
		struct dd_per_prio *per_prio = &dd->per_prio[prio];
		const struct io_stats_per_prio *stats = &per_prio->stats;
		uint32_t queued;

		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_READ]));
		WARN_ON_ONCE(!list_empty(&per_prio->fifo_list[DD_WRITE]));

		spin_lock(&dd->lock);
		queued = dd_queued(dd, prio);
		spin_unlock(&dd->lock);

		WARN_ONCE(queued != 0,
			  "statistics for priority %d: i %u m %u d %u c %u\n",
			  prio, stats->inserted, stats->merged,
			  stats->dispatched, atomic_read(&stats->completed));
	}

	kfree(dd);
}

/*
 * initialize elevator private data (deadline_data).
 */
static int dd_init_sched(struct request_queue *q, struct elevator_type *e)
{
	struct deadline_data *dd;
	struct elevator_queue *eq;
	enum dd_prio prio;
	int ret = -ENOMEM;

	eq = elevator_alloc(q, e);
	if (!eq)
		return ret;

	dd = kzalloc_node(sizeof(*dd), GFP_KERNEL, q->node);
	if (!dd)
		goto put_eq;

	eq->elevator_data = dd;

	for (prio = 0; prio <= DD_PRIO_MAX; prio++) {
		struct dd_per_prio *per_prio = &dd->per_prio[prio];

		INIT_LIST_HEAD(&per_prio->dispatch);
		INIT_LIST_HEAD(&per_prio->fifo_list[DD_READ]);
		INIT_LIST_HEAD(&per_prio->fifo_list[DD_WRITE]);
		per_prio->sort_list[DD_READ] = RB_ROOT;
		per_prio->sort_list[DD_WRITE] = RB_ROOT;
	}
	dd->fifo_expire[DD_READ] = read_expire;
	dd->fifo_expire[DD_WRITE] = write_expire;
	dd->writes_starved = writes_starved;
	dd->front_merges = 1;
	dd->last_dir = DD_WRITE;
	dd->fifo_batch = fifo_batch;
	dd->prio_aging_expire = prio_aging_expire;
	spin_lock_init(&dd->lock);
	spin_lock_init(&dd->zone_lock);

	/* We dispatch from request queue wide instead of hw queue */
	blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q);

	q->elevator = eq;
	return 0;

put_eq:
	kobject_put(&eq->kobj);
	return ret;
}

/*
 * Try to merge @bio into an existing request. If @bio has been merged into
 * an existing request, store the pointer to that request into *@rq.
 */
static int dd_request_merge(struct request_queue *q, struct request **rq,
			    struct bio *bio)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	const u8 ioprio_class = IOPRIO_PRIO_CLASS(bio->bi_ioprio);
	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
	struct dd_per_prio *per_prio = &dd->per_prio[prio];
	sector_t sector = bio_end_sector(bio);
	struct request *__rq;

	if (!dd->front_merges)
		return ELEVATOR_NO_MERGE;

	__rq = elv_rb_find(&per_prio->sort_list[bio_data_dir(bio)], sector);
	if (__rq) {
		BUG_ON(sector != blk_rq_pos(__rq));

		if (elv_bio_merge_ok(__rq, bio)) {
			*rq = __rq;
			if (blk_discard_mergable(__rq))
				return ELEVATOR_DISCARD_MERGE;
			return ELEVATOR_FRONT_MERGE;
		}
	}

	return ELEVATOR_NO_MERGE;
}

/*
 * Attempt to merge a bio into an existing request. This function is called
 * before @bio is associated with a request.
 */
static bool dd_bio_merge(struct request_queue *q, struct bio *bio,
		unsigned int nr_segs)
{
	struct deadline_data *dd = q->elevator->elevator_data;
	struct request *free = NULL;
	bool ret;

	spin_lock(&dd->lock);
	ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free);
	spin_unlock(&dd->lock);

	if (free)
		blk_mq_free_request(free);

	return ret;
}

/*
 * add rq to rbtree and fifo
 */
static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq,
			      bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *dd = q->elevator->elevator_data;
	const enum dd_data_dir data_dir = rq_data_dir(rq);
	u16 ioprio = req_get_ioprio(rq);
	u8 ioprio_class = IOPRIO_PRIO_CLASS(ioprio);
	struct dd_per_prio *per_prio;
	enum dd_prio prio;
	LIST_HEAD(free);

	lockdep_assert_held(&dd->lock);

	/*
	 * This may be a requeue of a write request that has locked its
	 * target zone. If it is the case, this releases the zone lock.
	 */
	blk_req_zone_write_unlock(rq);

	prio = ioprio_class_to_prio[ioprio_class];
	per_prio = &dd->per_prio[prio];
	if (!rq->elv.priv[0]) {
		per_prio->stats.inserted++;
		rq->elv.priv[0] = (void *)(uintptr_t)1;
	}

	if (blk_mq_sched_try_insert_merge(q, rq, &free)) {
		blk_mq_free_requests(&free);
		return;
	}

	trace_block_rq_insert(rq);

	if (at_head) {
		list_add(&rq->queuelist, &per_prio->dispatch);
		rq->fifo_time = jiffies;
	} else {
		deadline_add_rq_rb(per_prio, rq);

		if (rq_mergeable(rq)) {
			elv_rqhash_add(q, rq);
			if (!q->last_merge)
				q->last_merge = rq;
		}

		/*
		 * set expire time and add to fifo list
		 */
		rq->fifo_time = jiffies + dd->fifo_expire[data_dir];
		list_add_tail(&rq->queuelist, &per_prio->fifo_list[data_dir]);
	}
}

/*
 * Called from blk_mq_sched_insert_request() or blk_mq_sched_insert_requests().
 */
static void dd_insert_requests(struct blk_mq_hw_ctx *hctx,
			       struct list_head *list, bool at_head)
{
	struct request_queue *q = hctx->queue;
	struct deadline_data *dd = q->elevator->elevator_data;

	spin_lock(&dd->lock);
	while (!list_empty(list)) {
		struct request *rq;

		rq = list_first_entry(list, struct request, queuelist);
		list_del_init(&rq->queuelist);
		dd_insert_request(hctx, rq, at_head);
	}
	spin_unlock(&dd->lock);
}

/* Callback from inside blk_mq_rq_ctx_init(). */
static void dd_prepare_request(struct request *rq)
{
	rq->elv.priv[0] = NULL;
}

static bool dd_has_write_work(struct blk_mq_hw_ctx *hctx)
{
	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
	enum dd_prio p;

	for (p = 0; p <= DD_PRIO_MAX; p++)
		if (!list_empty_careful(&dd->per_prio[p].fifo_list[DD_WRITE]))
			return true;

	return false;
}

/*
 * Callback from inside blk_mq_free_request().
 *
 * For zoned block devices, write unlock the target zone of
 * completed write requests. Do this while holding the zone lock
 * spinlock so that the zone is never unlocked while deadline_fifo_request()
 * or deadline_next_request() are executing. This function is called for
 * all requests, whether or not these requests complete successfully.
 *
 * For a zoned block device, __dd_dispatch_request() may have stopped
 * dispatching requests if all the queued requests are write requests directed
 * at zones that are already locked due to on-going write requests. To ensure
 * write request dispatch progress in this case, mark the queue as needing a
 * restart to ensure that the queue is run again after completion of the
 * request and zones being unlocked.
 */
static void dd_finish_request(struct request *rq)
{
	struct request_queue *q = rq->q;
	struct deadline_data *dd = q->elevator->elevator_data;
	const u8 ioprio_class = dd_rq_ioclass(rq);
	const enum dd_prio prio = ioprio_class_to_prio[ioprio_class];
	struct dd_per_prio *per_prio = &dd->per_prio[prio];

	/*
	 * The block layer core may call dd_finish_request() without having
	 * called dd_insert_requests(). Skip requests that bypassed I/O
	 * scheduling. See also blk_mq_request_bypass_insert().
	 */
	if (!rq->elv.priv[0])
		return;

	atomic_inc(&per_prio->stats.completed);

	if (blk_queue_is_zoned(q)) {
		unsigned long flags;

		spin_lock_irqsave(&dd->zone_lock, flags);
		blk_req_zone_write_unlock(rq);
		spin_unlock_irqrestore(&dd->zone_lock, flags);

		if (dd_has_write_work(rq->mq_hctx))
			blk_mq_sched_mark_restart_hctx(rq->mq_hctx);
	}
}

static bool dd_has_work_for_prio(struct dd_per_prio *per_prio)
{
	return !list_empty_careful(&per_prio->dispatch) ||
		!list_empty_careful(&per_prio->fifo_list[DD_READ]) ||
		!list_empty_careful(&per_prio->fifo_list[DD_WRITE]);
}

static bool dd_has_work(struct blk_mq_hw_ctx *hctx)
{
	struct deadline_data *dd = hctx->queue->elevator->elevator_data;
	enum dd_prio prio;

	for (prio = 0; prio <= DD_PRIO_MAX; prio++)
		if (dd_has_work_for_prio(&dd->per_prio[prio]))
			return true;

	return false;
}

/*
 * sysfs parts below
 */
#define SHOW_INT(__FUNC, __VAR) \
static ssize_t __FUNC(struct elevator_queue *e, char *page) \
{ \
	struct deadline_data *dd = e->elevator_data; \
 \
	return sysfs_emit(page, "%d\n", __VAR); \
}
#define SHOW_JIFFIES(__FUNC, __VAR) SHOW_INT(__FUNC, jiffies_to_msecs(__VAR))
SHOW_JIFFIES(deadline_read_expire_show, dd->fifo_expire[DD_READ]);
SHOW_JIFFIES(deadline_write_expire_show, dd->fifo_expire[DD_WRITE]);
SHOW_JIFFIES(deadline_prio_aging_expire_show, dd->prio_aging_expire);
SHOW_INT(deadline_writes_starved_show, dd->writes_starved);
SHOW_INT(deadline_front_merges_show, dd->front_merges);
SHOW_INT(deadline_async_depth_show, dd->async_depth);
SHOW_INT(deadline_fifo_batch_show, dd->fifo_batch);
#undef SHOW_INT
#undef SHOW_JIFFIES

#define STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, __CONV) \
static ssize_t __FUNC(struct elevator_queue *e, const char *page, size_t count) \
{ \
	struct deadline_data *dd = e->elevator_data; \
	int __data, __ret; \
 \
	__ret = kstrtoint(page, 0, &__data); \
	if (__ret < 0) \
		return __ret; \
	if (__data < (MIN)) \
		__data = (MIN); \
	else if (__data > (MAX)) \
		__data = (MAX); \
	*(__PTR) = __CONV(__data); \
	return count; \
}
#define STORE_INT(__FUNC, __PTR, MIN, MAX) \
	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, )
#define STORE_JIFFIES(__FUNC, __PTR, MIN, MAX) \
	STORE_FUNCTION(__FUNC, __PTR, MIN, MAX, msecs_to_jiffies)
STORE_JIFFIES(deadline_read_expire_store, &dd->fifo_expire[DD_READ], 0, INT_MAX);
STORE_JIFFIES(deadline_write_expire_store, &dd->fifo_expire[DD_WRITE], 0, INT_MAX);
STORE_JIFFIES(deadline_prio_aging_expire_store, &dd->prio_aging_expire, 0, INT_MAX);
STORE_INT(deadline_writes_starved_store, &dd->writes_starved, INT_MIN, INT_MAX);
STORE_INT(deadline_front_merges_store, &dd->front_merges, 0, 1);
STORE_INT(deadline_async_depth_store, &dd->async_depth, 1, INT_MAX);
STORE_INT(deadline_fifo_batch_store, &dd->fifo_batch, 0, INT_MAX);
#undef STORE_FUNCTION
#undef STORE_INT
#undef STORE_JIFFIES

#define DD_ATTR(name) \
	__ATTR(name, 0644, deadline_##name##_show, deadline_##name##_store)

static struct elv_fs_entry deadline_attrs[] = {
	DD_ATTR(read_expire),
	DD_ATTR(write_expire),
	DD_ATTR(writes_starved),
	DD_ATTR(front_merges),
	DD_ATTR(async_depth),
	DD_ATTR(fifo_batch),
	DD_ATTR(prio_aging_expire),
	__ATTR_NULL
};

#ifdef CONFIG_BLK_DEBUG_FS
#define DEADLINE_DEBUGFS_DDIR_ATTRS(prio, data_dir, name) \
static void *deadline_##name##_fifo_start(struct seq_file *m, \
					  loff_t *pos) \
	__acquires(&dd->lock) \
{ \
	struct request_queue *q = m->private; \
	struct deadline_data *dd = q->elevator->elevator_data; \
	struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
 \
	spin_lock(&dd->lock); \
	return seq_list_start(&per_prio->fifo_list[data_dir], *pos); \
} \
 \
static void *deadline_##name##_fifo_next(struct seq_file *m, void *v, \
					 loff_t *pos) \
{ \
	struct request_queue *q = m->private; \
	struct deadline_data *dd = q->elevator->elevator_data; \
	struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
 \
	return seq_list_next(v, &per_prio->fifo_list[data_dir], pos); \
} \
 \
static void deadline_##name##_fifo_stop(struct seq_file *m, void *v) \
	__releases(&dd->lock) \
{ \
	struct request_queue *q = m->private; \
	struct deadline_data *dd = q->elevator->elevator_data; \
 \
	spin_unlock(&dd->lock); \
} \
 \
static const struct seq_operations deadline_##name##_fifo_seq_ops = { \
	.start	= deadline_##name##_fifo_start, \
	.next	= deadline_##name##_fifo_next, \
	.stop	= deadline_##name##_fifo_stop, \
	.show	= blk_mq_debugfs_rq_show, \
}; \
 \
static int deadline_##name##_next_rq_show(void *data, \
					  struct seq_file *m) \
{ \
	struct request_queue *q = data; \
	struct deadline_data *dd = q->elevator->elevator_data; \
	struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
	struct request *rq = per_prio->next_rq[data_dir]; \
 \
	if (rq) \
		__blk_mq_debugfs_rq_show(m, rq); \
	return 0; \
}

DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_READ, read0);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_RT_PRIO, DD_WRITE, write0);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_READ, read1);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_BE_PRIO, DD_WRITE, write1);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_READ, read2);
DEADLINE_DEBUGFS_DDIR_ATTRS(DD_IDLE_PRIO, DD_WRITE, write2);
#undef DEADLINE_DEBUGFS_DDIR_ATTRS

static int deadline_batching_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", dd->batching);
	return 0;
}

static int deadline_starved_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", dd->starved);
	return 0;
}

static int dd_async_depth_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;

	seq_printf(m, "%u\n", dd->async_depth);
	return 0;
}

static int dd_queued_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;
	u32 rt, be, idle;

	spin_lock(&dd->lock);
	rt = dd_queued(dd, DD_RT_PRIO);
	be = dd_queued(dd, DD_BE_PRIO);
	idle = dd_queued(dd, DD_IDLE_PRIO);
	spin_unlock(&dd->lock);

	seq_printf(m, "%u %u %u\n", rt, be, idle);

	return 0;
}

/* Number of requests owned by the block driver for a given priority. */
static u32 dd_owned_by_driver(struct deadline_data *dd, enum dd_prio prio)
{
	const struct io_stats_per_prio *stats = &dd->per_prio[prio].stats;

	lockdep_assert_held(&dd->lock);

	return stats->dispatched + stats->merged -
		atomic_read(&stats->completed);
}

static int dd_owned_by_driver_show(void *data, struct seq_file *m)
{
	struct request_queue *q = data;
	struct deadline_data *dd = q->elevator->elevator_data;
	u32 rt, be, idle;

	spin_lock(&dd->lock);
	rt = dd_owned_by_driver(dd, DD_RT_PRIO);
	be = dd_owned_by_driver(dd, DD_BE_PRIO);
	idle = dd_owned_by_driver(dd, DD_IDLE_PRIO);
	spin_unlock(&dd->lock);

	seq_printf(m, "%u %u %u\n", rt, be, idle);

	return 0;
}

#define DEADLINE_DISPATCH_ATTR(prio) \
static void *deadline_dispatch##prio##_start(struct seq_file *m, \
					     loff_t *pos) \
	__acquires(&dd->lock) \
{ \
	struct request_queue *q = m->private; \
	struct deadline_data *dd = q->elevator->elevator_data; \
	struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
 \
	spin_lock(&dd->lock); \
	return seq_list_start(&per_prio->dispatch, *pos); \
} \
 \
static void *deadline_dispatch##prio##_next(struct seq_file *m, \
					    void *v, loff_t *pos) \
{ \
	struct request_queue *q = m->private; \
	struct deadline_data *dd = q->elevator->elevator_data; \
	struct dd_per_prio *per_prio = &dd->per_prio[prio]; \
 \
	return seq_list_next(v, &per_prio->dispatch, pos); \
} \
 \
static void deadline_dispatch##prio##_stop(struct seq_file *m, void *v) \
	__releases(&dd->lock) \
{ \
	struct request_queue *q = m->private; \
	struct deadline_data *dd = q->elevator->elevator_data; \
 \
	spin_unlock(&dd->lock); \
} \
 \
static const struct seq_operations deadline_dispatch##prio##_seq_ops = { \
	.start	= deadline_dispatch##prio##_start, \
	.next	= deadline_dispatch##prio##_next, \
	.stop	= deadline_dispatch##prio##_stop, \
	.show	= blk_mq_debugfs_rq_show, \
}

DEADLINE_DISPATCH_ATTR(0);
DEADLINE_DISPATCH_ATTR(1);
DEADLINE_DISPATCH_ATTR(2);
#undef DEADLINE_DISPATCH_ATTR

#define DEADLINE_QUEUE_DDIR_ATTRS(name) \
	{#name "_fifo_list", 0400, \
	 .seq_ops = &deadline_##name##_fifo_seq_ops}
#define DEADLINE_NEXT_RQ_ATTR(name) \
	{#name "_next_rq", 0400, deadline_##name##_next_rq_show}
static const struct blk_mq_debugfs_attr deadline_queue_debugfs_attrs[] = {
	DEADLINE_QUEUE_DDIR_ATTRS(read0),
	DEADLINE_QUEUE_DDIR_ATTRS(write0),
	DEADLINE_QUEUE_DDIR_ATTRS(read1),
	DEADLINE_QUEUE_DDIR_ATTRS(write1),
	DEADLINE_QUEUE_DDIR_ATTRS(read2),
	DEADLINE_QUEUE_DDIR_ATTRS(write2),
	DEADLINE_NEXT_RQ_ATTR(read0),
	DEADLINE_NEXT_RQ_ATTR(write0),
	DEADLINE_NEXT_RQ_ATTR(read1),
	DEADLINE_NEXT_RQ_ATTR(write1),
	DEADLINE_NEXT_RQ_ATTR(read2),
	DEADLINE_NEXT_RQ_ATTR(write2),
	{"batching", 0400, deadline_batching_show},
	{"starved", 0400, deadline_starved_show},
	{"async_depth", 0400, dd_async_depth_show},
	{"dispatch0", 0400, .seq_ops = &deadline_dispatch0_seq_ops},
	{"dispatch1", 0400, .seq_ops = &deadline_dispatch1_seq_ops},
	{"dispatch2", 0400, .seq_ops = &deadline_dispatch2_seq_ops},
	{"owned_by_driver", 0400, dd_owned_by_driver_show},
	{"queued", 0400, dd_queued_show},
	{},
};
#undef DEADLINE_QUEUE_DDIR_ATTRS
#endif

static struct elevator_type mq_deadline = {
	.ops = {
		.depth_updated		= dd_depth_updated,
		.limit_depth		= dd_limit_depth,
		.insert_requests	= dd_insert_requests,
		.dispatch_request	= dd_dispatch_request,
		.prepare_request	= dd_prepare_request,
		.finish_request		= dd_finish_request,
		.next_request		= elv_rb_latter_request,
		.former_request		= elv_rb_former_request,
		.bio_merge		= dd_bio_merge,
		.request_merge		= dd_request_merge,
		.requests_merged	= dd_merged_requests,
		.request_merged		= dd_request_merged,
		.has_work		= dd_has_work,
		.init_sched		= dd_init_sched,
		.exit_sched		= dd_exit_sched,
		.init_hctx		= dd_init_hctx,
	},

#ifdef CONFIG_BLK_DEBUG_FS
	.queue_debugfs_attrs = deadline_queue_debugfs_attrs,
#endif
	.elevator_attrs = deadline_attrs,
	.elevator_name = "mq-deadline",
	.elevator_alias = "deadline",
	.elevator_features = ELEVATOR_F_ZBD_SEQ_WRITE,
	.elevator_owner = THIS_MODULE,
};
MODULE_ALIAS("mq-deadline-iosched");

static int __init deadline_init(void)
{
	return elv_register(&mq_deadline);
}

static void __exit deadline_exit(void)
{
	elv_unregister(&mq_deadline);
}

module_init(deadline_init);
module_exit(deadline_exit);

MODULE_AUTHOR("Jens Axboe, Damien Le Moal and Bart Van Assche");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("MQ deadline IO scheduler");