// SPDX-License-Identifier: GPL-2.0
/*
 * Block device elevator/IO-scheduler.
 *
 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 * when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - lose bi_dev comparisons, partition handling is right now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-wbt.h"

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

/*
 * Merge hash stuff.
 */
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))

/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.allow_merge)
		return e->type->ops.allow_merge(q, rq, bio);

	return 1;
}
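
/*
 * Sketch of a scheduler-side allow_merge hook (hypothetical "demo"
 * scheduler; demo_group_of_*() are made-up helpers, but the signature
 * matches the ops.allow_merge call above).  A scheduler can use this
 * hook to veto merges that cross a boundary it tracks; bfq, for
 * example, restricts merging across per-process queues this way:
 *
 *	static bool demo_allow_merge(struct request_queue *q,
 *				     struct request *rq, struct bio *bio)
 *	{
 *		return demo_group_of_rq(rq) == demo_group_of_bio(bio);
 *	}
 */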

/*
 * can we safely merge with this request?
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
	if (!blk_rq_merge_ok(rq, bio))
		return false;

	if (!elv_iosched_allow_bio_merge(rq, bio))
		return false;

	return true;
}
EXPORT_SYMBOL(elv_bio_merge_ok);

static bool elevator_match(const struct elevator_type *e, const char *name)
{
	if (!strcmp(e->elevator_name, name))
		return true;
	if (e->elevator_alias && !strcmp(e->elevator_alias, name))
		return true;

	return false;
}

/*
 * Return scheduler with name 'name'
 */
static struct elevator_type *elevator_find(const char *name)
{
	struct elevator_type *e;

	list_for_each_entry(e, &elv_list, list) {
		if (elevator_match(e, name))
			return e;
	}

	return NULL;
}

static void elevator_put(struct elevator_type *e)
{
	module_put(e->elevator_owner);
}

static struct elevator_type *elevator_get(struct request_queue *q,
					  const char *name, bool try_loading)
{
	struct elevator_type *e;

	spin_lock(&elv_list_lock);

	e = elevator_find(name);
	if (!e && try_loading) {
		spin_unlock(&elv_list_lock);
		request_module("%s-iosched", name);
		spin_lock(&elv_list_lock);
		e = elevator_find(name);
	}

	if (e && !try_module_get(e->elevator_owner))
		e = NULL;

	spin_unlock(&elv_list_lock);
	return e;
}

static char chosen_elevator[ELV_NAME_MAX];

static int __init elevator_setup(char *str)
{
	/*
	 * Be backwards-compatible with previous kernels, so users
	 * won't get the wrong elevator.
	 */
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
	return 1;
}

__setup("elevator=", elevator_setup);

static struct kobj_type elv_ktype;

struct elevator_queue *elevator_alloc(struct request_queue *q,
				      struct elevator_type *e)
{
	struct elevator_queue *eq;

	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
	if (unlikely(!eq))
		return NULL;

	eq->type = e;
	kobject_init(&eq->kobj, &elv_ktype);
	mutex_init(&eq->sysfs_lock);
	hash_init(eq->hash);

	return eq;
}
EXPORT_SYMBOL(elevator_alloc);

static void elevator_release(struct kobject *kobj)
{
	struct elevator_queue *e;

	e = container_of(kobj, struct elevator_queue, kobj);
	elevator_put(e->type);
	kfree(e);
}

void elevator_exit(struct request_queue *q, struct elevator_queue *e)
{
	mutex_lock(&e->sysfs_lock);
	if (e->type->ops.exit_sched)
		blk_mq_exit_sched(q, e);
	mutex_unlock(&e->sysfs_lock);

	kobject_put(&e->kobj);
}

static inline void __elv_rqhash_del(struct request *rq)
{
	hash_del(&rq->hash);
	rq->rq_flags &= ~RQF_HASHED;
}

void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
	if (ELV_ON_HASH(rq))
		__elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);

void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	BUG_ON(ELV_ON_HASH(rq));
	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
	rq->rq_flags |= RQF_HASHED;
}
EXPORT_SYMBOL_GPL(elv_rqhash_add);

void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}
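
/*
 * Worked example for the merge hash (illustrative numbers only): a
 * request starting at sector 100 and spanning 8 sectors is hashed
 * under rq_hash_key() == 108, the first sector past its end.  A bio
 * with bi_sector == 108 is therefore a back merge candidate and is
 * found by elv_rqhash_find(q, 108) below.  Front merge candidates
 * cannot be found through this hash; elv_merge() falls back to the
 * scheduler's request_merge hook for those.
 */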

struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
	struct elevator_queue *e = q->elevator;
	struct hlist_node *next;
	struct request *rq;

	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
		BUG_ON(!ELV_ON_HASH(rq));

		if (unlikely(!rq_mergeable(rq))) {
			__elv_rqhash_del(rq);
			continue;
		}

		if (rq_hash_key(rq) == offset)
			return rq;
	}

	return NULL;
}

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
			p = &(*p)->rb_left;
		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
			p = &(*p)->rb_right;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

		if (sector < blk_rq_pos(rq))
			n = n->rb_left;
		else if (sector > blk_rq_pos(rq))
			n = n->rb_right;
		else
			return rq;
	}

	return NULL;
}
EXPORT_SYMBOL(elv_rb_find);

enum elv_merge elv_merge(struct request_queue *q, struct request **req,
		struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;

	/*
	 * Levels of merges:
	 *	nomerges:  No merges at all attempted
	 *	noxmerges: Only simple one-hit cache try
	 *	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
		return ELEVATOR_NO_MERGE;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
		enum elv_merge ret = blk_try_merge(q->last_merge, bio);

		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_noxmerges(q))
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_BACK_MERGE;
	}

	if (e->type->ops.request_merge)
		return e->type->ops.request_merge(q, req, bio);

	return ELEVATOR_NO_MERGE;
}
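
/*
 * Note: the merge levels tested in elv_merge() map onto the queue's
 * "nomerges" sysfs attribute: writing 2 sets QUEUE_FLAG_NOMERGES (no
 * merging at all), 1 sets QUEUE_FLAG_NOXMERGES (one-hit cache only),
 * and 0 (the default) allows all merge attempts.
 */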

/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
{
	struct request *__rq;
	bool ret;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
		return true;

	if (blk_queue_noxmerges(q))
		return false;

	ret = false;
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	while (1) {
		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
			break;

		/* The merged request could be merged with others, try again */
		ret = true;
		rq = __rq;
	}

	return ret;
}

void elv_merged_request(struct request_queue *q, struct request *rq,
		enum elv_merge type)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.request_merged)
		e->type->ops.request_merged(q, rq, type);

	if (type == ELEVATOR_BACK_MERGE)
		elv_rqhash_reposition(q, rq);

	q->last_merge = rq;
}

void elv_merge_requests(struct request_queue *q, struct request *rq,
			     struct request *next)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.requests_merged)
		e->type->ops.requests_merged(q, rq, next);

	elv_rqhash_reposition(q, rq);
	q->last_merge = rq;
}

struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.next_request)
		return e->type->ops.next_request(q, rq);

	return NULL;
}

struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.former_request)
		return e->type->ops.former_request(q, rq);

	return NULL;
}
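
/*
 * Schedulers that keep requests in a sector-sorted RB tree typically
 * point .next_request and .former_request at the generic
 * elv_rb_latter_request()/elv_rb_former_request() helpers defined at
 * the end of this file; mq-deadline, for example, does exactly that.
 */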

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->show)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->show(e, page) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->store)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->store(e, page, length) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static const struct sysfs_ops elv_sysfs_ops = {
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};

static struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};

int elv_register_queue(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	int error;

	lockdep_assert_held(&q->sysfs_lock);

	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
	if (!error) {
		struct elv_fs_entry *attr = e->type->elevator_attrs;
		if (attr) {
			while (attr->attr.name) {
				if (sysfs_create_file(&e->kobj, &attr->attr))
					break;
				attr++;
			}
		}
		kobject_uevent(&e->kobj, KOBJ_ADD);
		e->registered = 1;
	}
	return error;
}

void elv_unregister_queue(struct request_queue *q)
{
	lockdep_assert_held(&q->sysfs_lock);

	if (q) {
		struct elevator_queue *e = q->elevator;

		kobject_uevent(&e->kobj, KOBJ_REMOVE);
		kobject_del(&e->kobj);
		e->registered = 0;
		/* Re-enable throttling in case elevator disabled it */
		wbt_enable_default(q);
	}
}

int elv_register(struct elevator_type *e)
{
	/* create icq_cache if requested */
	if (e->icq_size) {
		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
			return -EINVAL;

		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
			 "%s_io_cq", e->elevator_name);
		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
						 e->icq_align, 0, NULL);
		if (!e->icq_cache)
			return -ENOMEM;
	}

	/* register, don't allow duplicate names */
	spin_lock(&elv_list_lock);
	if (elevator_find(e->elevator_name)) {
		spin_unlock(&elv_list_lock);
		kmem_cache_destroy(e->icq_cache);
		return -EBUSY;
	}
	list_add_tail(&e->list, &elv_list);
	spin_unlock(&elv_list_lock);

	printk(KERN_INFO "io scheduler %s registered\n", e->elevator_name);

	return 0;
}
EXPORT_SYMBOL_GPL(elv_register);

void elv_unregister(struct elevator_type *e)
{
	/* unregister */
	spin_lock(&elv_list_lock);
	list_del_init(&e->list);
	spin_unlock(&elv_list_lock);

	/*
	 * Destroy icq_cache if it exists.  icq's are RCU managed.  Make
	 * sure all RCU operations are complete before proceeding.
	 */
	if (e->icq_cache) {
		rcu_barrier();
		kmem_cache_destroy(e->icq_cache);
		e->icq_cache = NULL;
	}
}
EXPORT_SYMBOL_GPL(elv_unregister);
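
/*
 * Minimal registration sketch (hypothetical "demo" scheduler, ops
 * elided): an IO scheduler module pairs elv_register() with
 * elv_unregister() in its module init/exit hooks, e.g.:
 *
 *	static struct elevator_type demo_sched = {
 *		.ops		= { ... },
 *		.elevator_name	= "demo",
 *		.elevator_owner	= THIS_MODULE,
 *	};
 *
 *	static int __init demo_init(void)
 *	{
 *		return elv_register(&demo_sched);
 *	}
 *	module_init(demo_init);
 *
 *	static void __exit demo_exit(void)
 *	{
 *		elv_unregister(&demo_sched);
 *	}
 *	module_exit(demo_exit);
 */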

int elevator_switch_mq(struct request_queue *q,
		       struct elevator_type *new_e)
{
	int ret;

	lockdep_assert_held(&q->sysfs_lock);

	if (q->elevator) {
		if (q->elevator->registered)
			elv_unregister_queue(q);
		ioc_clear_queue(q);
		elevator_exit(q, q->elevator);
	}

	ret = blk_mq_init_sched(q, new_e);
	if (ret)
		goto out;

	if (new_e) {
		ret = elv_register_queue(q);
		if (ret) {
			elevator_exit(q, q->elevator);
			goto out;
		}
	}

	if (new_e)
		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
	else
		blk_add_trace_msg(q, "elv switch: none");

out:
	return ret;
}

/*
 * For blk-mq devices, we default to using mq-deadline, if available, for single
 * queue devices.  If deadline isn't available OR we have multiple queues,
 * default to "none".
 */
int elevator_init_mq(struct request_queue *q)
{
	struct elevator_type *e;
	int err = 0;

	if (q->nr_hw_queues != 1)
		return 0;

	/*
	 * q->sysfs_lock must be held to provide mutual exclusion between
	 * elevator_switch() and here.
	 */
	mutex_lock(&q->sysfs_lock);
	if (unlikely(q->elevator))
		goto out_unlock;

	e = elevator_get(q, "mq-deadline", false);
	if (!e)
		goto out_unlock;

	err = blk_mq_init_sched(q, e);
	if (err)
		elevator_put(e);
out_unlock:
	mutex_unlock(&q->sysfs_lock);
	return err;
}

/*
 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
 * need for the new one. this way we have a chance of going back to the old
 * one, if the new one fails init for some reason.
 */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	int err;

	lockdep_assert_held(&q->sysfs_lock);

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	err = elevator_switch_mq(q, new_e);

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	return err;
}

/*
 * Switch this queue to the given IO scheduler.
 */
static int __elevator_change(struct request_queue *q, const char *name)
{
	char elevator_name[ELV_NAME_MAX];
	struct elevator_type *e;

	/* Make sure queue is not in the middle of being removed */
	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
		return -ENOENT;

	/*
	 * Special case for mq, turn off scheduling
	 */
	if (!strncmp(name, "none", 4)) {
		if (!q->elevator)
			return 0;
		return elevator_switch(q, NULL);
	}

	strlcpy(elevator_name, name, sizeof(elevator_name));
	e = elevator_get(q, strstrip(elevator_name), true);
	if (!e)
		return -EINVAL;

	if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
		elevator_put(e);
		return 0;
	}

	return elevator_switch(q, e);
}

static inline bool elv_support_iosched(struct request_queue *q)
{
	if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
		return false;
	return true;
}

ssize_t elv_iosched_store(struct request_queue *q, const char *name,
			  size_t count)
{
	int ret;

	if (!queue_is_mq(q) || !elv_support_iosched(q))
		return count;

	ret = __elevator_change(q, name);
	if (!ret)
		return count;

	return ret;
}

ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
	struct elevator_queue *e = q->elevator;
	struct elevator_type *elv = NULL;
	struct elevator_type *__e;
	int len = 0;

	if (!queue_is_mq(q))
		return sprintf(name, "none\n");

	if (!q->elevator)
		len += sprintf(name+len, "[none] ");
	else
		elv = e->type;

	spin_lock(&elv_list_lock);
	list_for_each_entry(__e, &elv_list, list) {
		if (elv && elevator_match(elv, __e->elevator_name)) {
			len += sprintf(name+len, "[%s] ", elv->elevator_name);
			continue;
		}
		if (elv_support_iosched(q))
			len += sprintf(name+len, "%s ", __e->elevator_name);
	}
	spin_unlock(&elv_list_lock);

	if (q->elevator)
		len += sprintf(name+len, "none");

	len += sprintf(len+name, "\n");
	return len;
}
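
/*
 * The two sysfs hooks above back /sys/block/<dev>/queue/scheduler.
 * Typical interaction (the set of available schedulers depends on the
 * running kernel):
 *
 *	$ cat /sys/block/sda/queue/scheduler
 *	[mq-deadline] kyber bfq none
 *	$ echo kyber > /sys/block/sda/queue/scheduler
 *
 * elv_iosched_show() brackets the active scheduler; elv_iosched_store()
 * performs the switch via __elevator_change().
 */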

struct request *elv_rb_former_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbprev = rb_prev(&rq->rb_node);

	if (rbprev)
		return rb_entry_rq(rbprev);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

struct request *elv_rb_latter_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbnext = rb_next(&rq->rb_node);

	if (rbnext)
		return rb_entry_rq(rbnext);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);