/*
 * Block device elevator/IO-scheduler.
 *
 * Copyright (C) 2000 Andrea Arcangeli <andrea@suse.de> SuSE
 *
 * 30042000 Jens Axboe <axboe@kernel.dk> :
 *
 * Split the elevator a bit so that it is possible to choose a different
 * one or even write a new "plug in". There are three pieces:
 * - elevator_fn, inserts a new request in the queue list
 * - elevator_merge_fn, decides whether a new buffer can be merged with
 *   an existing request
 * - elevator_dequeue_fn, called when a request is taken off the active list
 *
 * 20082000 Dave Jones <davej@suse.de> :
 * Removed tests for max-bomb-segments, which was breaking elvtune
 * when run without -bN
 *
 * Jens:
 * - Rework again to work with bio instead of buffer_heads
 * - loose bi_dev comparisons, partition handling is right now
 * - completely modularize elevator setup and teardown
 *
 */
#include <linux/kernel.h>
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/elevator.h>
#include <linux/bio.h>
#include <linux/module.h>
#include <linux/slab.h>
#include <linux/init.h>
#include <linux/compiler.h>
#include <linux/blktrace_api.h>
#include <linux/hash.h>
#include <linux/uaccess.h>
#include <linux/pm_runtime.h>
#include <linux/blk-cgroup.h>

#include <trace/events/block.h>

#include "blk.h"
#include "blk-mq-sched.h"
#include "blk-pm.h"
#include "blk-wbt.h"

static DEFINE_SPINLOCK(elv_list_lock);
static LIST_HEAD(elv_list);

/*
 * Merge hash stuff.
 */
#define rq_hash_key(rq)		(blk_rq_pos(rq) + blk_rq_sectors(rq))

/*
 * Query io scheduler to see if the current process issuing bio may be
 * merged with rq.
 */
static int elv_iosched_allow_bio_merge(struct request *rq, struct bio *bio)
{
	struct request_queue *q = rq->q;
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.allow_merge)
		return e->type->ops.allow_merge(q, rq, bio);

	return 1;
}

/*
 * can we safely merge with this request?
 */
bool elv_bio_merge_ok(struct request *rq, struct bio *bio)
{
	if (!blk_rq_merge_ok(rq, bio))
		return false;

	if (!elv_iosched_allow_bio_merge(rq, bio))
		return false;

	return true;
}
EXPORT_SYMBOL(elv_bio_merge_ok);

static bool elevator_match(const struct elevator_type *e, const char *name)
{
	if (!strcmp(e->elevator_name, name))
		return true;
	if (e->elevator_alias && !strcmp(e->elevator_alias, name))
		return true;

	return false;
}

/*
 * Return scheduler with name 'name'
 */
static struct elevator_type *elevator_find(const char *name)
{
	struct elevator_type *e;

	list_for_each_entry(e, &elv_list, list) {
		if (elevator_match(e, name))
			return e;
	}

	return NULL;
}

static void elevator_put(struct elevator_type *e)
{
	module_put(e->elevator_owner);
}

static struct elevator_type *elevator_get(struct request_queue *q,
					  const char *name, bool try_loading)
{
	struct elevator_type *e;

	spin_lock(&elv_list_lock);

	e = elevator_find(name);
	if (!e && try_loading) {
		spin_unlock(&elv_list_lock);
		request_module("%s-iosched", name);
		spin_lock(&elv_list_lock);
		e = elevator_find(name);
	}

	if (e && !try_module_get(e->elevator_owner))
		e = NULL;

	spin_unlock(&elv_list_lock);
	return e;
}

static char chosen_elevator[ELV_NAME_MAX];

static int __init elevator_setup(char *str)
{
	/*
	 * Be backwards-compatible with previous kernels, so users
	 * won't get the wrong elevator.
	 */
	strncpy(chosen_elevator, str, sizeof(chosen_elevator) - 1);
	return 1;
}

__setup("elevator=", elevator_setup);

static struct kobj_type elv_ktype;

struct elevator_queue *elevator_alloc(struct request_queue *q,
				      struct elevator_type *e)
{
	struct elevator_queue *eq;

	eq = kzalloc_node(sizeof(*eq), GFP_KERNEL, q->node);
	if (unlikely(!eq))
		return NULL;

	eq->type = e;
	kobject_init(&eq->kobj, &elv_ktype);
	mutex_init(&eq->sysfs_lock);
	hash_init(eq->hash);

	return eq;
}
EXPORT_SYMBOL(elevator_alloc);

static void elevator_release(struct kobject *kobj)
{
	struct elevator_queue *e;

	e = container_of(kobj, struct elevator_queue, kobj);
	elevator_put(e->type);
	kfree(e);
}

void elevator_exit(struct request_queue *q, struct elevator_queue *e)
{
	mutex_lock(&e->sysfs_lock);
	if (e->type->ops.exit_sched)
		blk_mq_exit_sched(q, e);
	mutex_unlock(&e->sysfs_lock);

	kobject_put(&e->kobj);
}

static inline void __elv_rqhash_del(struct request *rq)
{
	hash_del(&rq->hash);
	rq->rq_flags &= ~RQF_HASHED;
}

void elv_rqhash_del(struct request_queue *q, struct request *rq)
{
	if (ELV_ON_HASH(rq))
		__elv_rqhash_del(rq);
}
EXPORT_SYMBOL_GPL(elv_rqhash_del);

void elv_rqhash_add(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	BUG_ON(ELV_ON_HASH(rq));
	hash_add(e->hash, &rq->hash, rq_hash_key(rq));
	rq->rq_flags |= RQF_HASHED;
}
EXPORT_SYMBOL_GPL(elv_rqhash_add);

void elv_rqhash_reposition(struct request_queue *q, struct request *rq)
{
	__elv_rqhash_del(rq);
	elv_rqhash_add(q, rq);
}

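/*
 * Note: requests are hashed by rq_hash_key(), i.e. their end sector
 * (start + size). A back-merge candidate for a bio is therefore found by
 * looking up the bio's start sector: a request covering sectors 100..107
 * hashes under key 108, so a bio beginning at sector 108 can be appended
 * to it (see the ELEVATOR_BACK_MERGE path in elv_merge() below).
 */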
struct request *elv_rqhash_find(struct request_queue *q, sector_t offset)
{
	struct elevator_queue *e = q->elevator;
	struct hlist_node *next;
	struct request *rq;

	hash_for_each_possible_safe(e->hash, rq, next, hash, offset) {
		BUG_ON(!ELV_ON_HASH(rq));

		if (unlikely(!rq_mergeable(rq))) {
			__elv_rqhash_del(rq);
			continue;
		}

		if (rq_hash_key(rq) == offset)
			return rq;
	}

	return NULL;
}

/*
 * RB-tree support functions for inserting/lookup/removal of requests
 * in a sorted RB tree.
 */
void elv_rb_add(struct rb_root *root, struct request *rq)
{
	struct rb_node **p = &root->rb_node;
	struct rb_node *parent = NULL;
	struct request *__rq;

	while (*p) {
		parent = *p;
		__rq = rb_entry(parent, struct request, rb_node);

		if (blk_rq_pos(rq) < blk_rq_pos(__rq))
			p = &(*p)->rb_left;
		else if (blk_rq_pos(rq) >= blk_rq_pos(__rq))
			p = &(*p)->rb_right;
	}

	rb_link_node(&rq->rb_node, parent, p);
	rb_insert_color(&rq->rb_node, root);
}
EXPORT_SYMBOL(elv_rb_add);

void elv_rb_del(struct rb_root *root, struct request *rq)
{
	BUG_ON(RB_EMPTY_NODE(&rq->rb_node));
	rb_erase(&rq->rb_node, root);
	RB_CLEAR_NODE(&rq->rb_node);
}
EXPORT_SYMBOL(elv_rb_del);

struct request *elv_rb_find(struct rb_root *root, sector_t sector)
{
	struct rb_node *n = root->rb_node;
	struct request *rq;

	while (n) {
		rq = rb_entry(n, struct request, rb_node);

		if (sector < blk_rq_pos(rq))
			n = n->rb_left;
		else if (sector > blk_rq_pos(rq))
			n = n->rb_right;
		else
			return rq;
	}

	return NULL;
}
EXPORT_SYMBOL(elv_rb_find);

enum elv_merge elv_merge(struct request_queue *q, struct request **req,
		struct bio *bio)
{
	struct elevator_queue *e = q->elevator;
	struct request *__rq;

	/*
	 * Levels of merges:
	 *	nomerges:  No merges at all attempted
	 *	noxmerges: Only simple one-hit cache try
	 *	merges:	   All merge tries attempted
	 */
	if (blk_queue_nomerges(q) || !bio_mergeable(bio))
		return ELEVATOR_NO_MERGE;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && elv_bio_merge_ok(q->last_merge, bio)) {
		enum elv_merge ret = blk_try_merge(q->last_merge, bio);

		if (ret != ELEVATOR_NO_MERGE) {
			*req = q->last_merge;
			return ret;
		}
	}

	if (blk_queue_noxmerges(q))
		return ELEVATOR_NO_MERGE;

	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	__rq = elv_rqhash_find(q, bio->bi_iter.bi_sector);
	if (__rq && elv_bio_merge_ok(__rq, bio)) {
		*req = __rq;
		return ELEVATOR_BACK_MERGE;
	}

	if (e->type->ops.request_merge)
		return e->type->ops.request_merge(q, req, bio);

	return ELEVATOR_NO_MERGE;
}

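/*
 * Note: the merge levels described in elv_merge() map onto the queue's
 * "nomerges" sysfs attribute: 0 allows all merge attempts, 1 sets
 * QUEUE_FLAG_NOXMERGES (one-hit cache only) and 2 sets QUEUE_FLAG_NOMERGES
 * (no merging at all).
 */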
/*
 * Attempt to do an insertion back merge. Only check for the case where
 * we can append 'rq' to an existing request, so we can throw 'rq' away
 * afterwards.
 *
 * Returns true if we merged, false otherwise
 */
bool elv_attempt_insert_merge(struct request_queue *q, struct request *rq)
{
	struct request *__rq;
	bool ret;

	if (blk_queue_nomerges(q))
		return false;

	/*
	 * First try one-hit cache.
	 */
	if (q->last_merge && blk_attempt_req_merge(q, q->last_merge, rq))
		return true;

	if (blk_queue_noxmerges(q))
		return false;

	ret = false;
	/*
	 * See if our hash lookup can find a potential backmerge.
	 */
	while (1) {
		__rq = elv_rqhash_find(q, blk_rq_pos(rq));
		if (!__rq || !blk_attempt_req_merge(q, __rq, rq))
			break;

		/* The merged request could be merged with others, try again */
		ret = true;
		rq = __rq;
	}

	return ret;
}

void elv_merged_request(struct request_queue *q, struct request *rq,
		enum elv_merge type)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.request_merged)
		e->type->ops.request_merged(q, rq, type);

	if (type == ELEVATOR_BACK_MERGE)
		elv_rqhash_reposition(q, rq);

	q->last_merge = rq;
}

void elv_merge_requests(struct request_queue *q, struct request *rq,
			     struct request *next)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.requests_merged)
		e->type->ops.requests_merged(q, rq, next);

	elv_rqhash_reposition(q, rq);
	q->last_merge = rq;
}

struct request *elv_latter_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.next_request)
		return e->type->ops.next_request(q, rq);

	return NULL;
}

struct request *elv_former_request(struct request_queue *q, struct request *rq)
{
	struct elevator_queue *e = q->elevator;

	if (e->type->ops.former_request)
		return e->type->ops.former_request(q, rq);

	return NULL;
}

#define to_elv(atr) container_of((atr), struct elv_fs_entry, attr)

static ssize_t
elv_attr_show(struct kobject *kobj, struct attribute *attr, char *page)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->show)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->show(e, page) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static ssize_t
elv_attr_store(struct kobject *kobj, struct attribute *attr,
	       const char *page, size_t length)
{
	struct elv_fs_entry *entry = to_elv(attr);
	struct elevator_queue *e;
	ssize_t error;

	if (!entry->store)
		return -EIO;

	e = container_of(kobj, struct elevator_queue, kobj);
	mutex_lock(&e->sysfs_lock);
	error = e->type ? entry->store(e, page, length) : -ENOENT;
	mutex_unlock(&e->sysfs_lock);
	return error;
}

static const struct sysfs_ops elv_sysfs_ops = {
	.show	= elv_attr_show,
	.store	= elv_attr_store,
};

static struct kobj_type elv_ktype = {
	.sysfs_ops	= &elv_sysfs_ops,
	.release	= elevator_release,
};

int elv_register_queue(struct request_queue *q)
{
	struct elevator_queue *e = q->elevator;
	int error;

	lockdep_assert_held(&q->sysfs_lock);

	error = kobject_add(&e->kobj, &q->kobj, "%s", "iosched");
	if (!error) {
		struct elv_fs_entry *attr = e->type->elevator_attrs;
		if (attr) {
			while (attr->attr.name) {
				if (sysfs_create_file(&e->kobj, &attr->attr))
					break;
				attr++;
			}
		}
		kobject_uevent(&e->kobj, KOBJ_ADD);
		e->registered = 1;
	}
	return error;
}

void elv_unregister_queue(struct request_queue *q)
{
	lockdep_assert_held(&q->sysfs_lock);

	if (q) {
		struct elevator_queue *e = q->elevator;

		kobject_uevent(&e->kobj, KOBJ_REMOVE);
		kobject_del(&e->kobj);
		e->registered = 0;
		/* Re-enable throttling in case elevator disabled it */
		wbt_enable_default(q);
	}
}

int elv_register(struct elevator_type *e)
{
	char *def = "";

	/* create icq_cache if requested */
	if (e->icq_size) {
		if (WARN_ON(e->icq_size < sizeof(struct io_cq)) ||
		    WARN_ON(e->icq_align < __alignof__(struct io_cq)))
			return -EINVAL;

		snprintf(e->icq_cache_name, sizeof(e->icq_cache_name),
			 "%s_io_cq", e->elevator_name);
		e->icq_cache = kmem_cache_create(e->icq_cache_name, e->icq_size,
						 e->icq_align, 0, NULL);
		if (!e->icq_cache)
			return -ENOMEM;
	}

	/* register, don't allow duplicate names */
	spin_lock(&elv_list_lock);
	if (elevator_find(e->elevator_name)) {
		spin_unlock(&elv_list_lock);
		kmem_cache_destroy(e->icq_cache);
		return -EBUSY;
	}
	list_add_tail(&e->list, &elv_list);
	spin_unlock(&elv_list_lock);

	printk(KERN_INFO "io scheduler %s registered%s\n", e->elevator_name,
	       def);
	return 0;
}
EXPORT_SYMBOL_GPL(elv_register);

void elv_unregister(struct elevator_type *e)
{
	/* unregister */
	spin_lock(&elv_list_lock);
	list_del_init(&e->list);
	spin_unlock(&elv_list_lock);

	/*
	 * Destroy icq_cache if it exists. icq's are RCU managed. Make
	 * sure all RCU operations are complete before proceeding.
	 */
	if (e->icq_cache) {
		rcu_barrier();
		kmem_cache_destroy(e->icq_cache);
		e->icq_cache = NULL;
	}
}
EXPORT_SYMBOL_GPL(elv_unregister);

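/*
 * Illustrative sketch (callback and symbol names below are hypothetical):
 * an I/O scheduler module defines a struct elevator_type and registers it
 * from its module init/exit hooks, roughly like:
 *
 *	static struct elevator_type my_sched = {
 *		.ops = {
 *			.insert_requests	= my_insert_requests,
 *			.dispatch_request	= my_dispatch_request,
 *		},
 *		.elevator_name	= "my-sched",
 *		.elevator_owner	= THIS_MODULE,
 *	};
 *
 *	static int __init my_sched_init(void)
 *	{
 *		return elv_register(&my_sched);
 *	}
 *
 *	static void __exit my_sched_exit(void)
 *	{
 *		elv_unregister(&my_sched);
 *	}
 */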
int elevator_switch_mq(struct request_queue *q,
			      struct elevator_type *new_e)
{
	int ret;

	lockdep_assert_held(&q->sysfs_lock);

	if (q->elevator) {
		if (q->elevator->registered)
			elv_unregister_queue(q);
		ioc_clear_queue(q);
		elevator_exit(q, q->elevator);
	}

	ret = blk_mq_init_sched(q, new_e);
	if (ret)
		goto out;

	if (new_e) {
		ret = elv_register_queue(q);
		if (ret) {
			elevator_exit(q, q->elevator);
			goto out;
		}
	}

	if (new_e)
		blk_add_trace_msg(q, "elv switch: %s", new_e->elevator_name);
	else
		blk_add_trace_msg(q, "elv switch: none");

out:
	return ret;
}

/*
 * For blk-mq devices, we default to using mq-deadline, if available, for single
 * queue devices. If deadline isn't available OR we have multiple queues,
 * default to "none".
 */
int elevator_init_mq(struct request_queue *q)
{
	struct elevator_type *e;
	int err = 0;

	if (q->nr_hw_queues != 1)
		return 0;

	/*
	 * q->sysfs_lock must be held to provide mutual exclusion between
	 * elevator_switch() and here.
	 */
	mutex_lock(&q->sysfs_lock);
	if (unlikely(q->elevator))
		goto out_unlock;

	e = elevator_get(q, "mq-deadline", false);
	if (!e)
		goto out_unlock;

	err = blk_mq_init_sched(q, e);
	if (err)
		elevator_put(e);
out_unlock:
	mutex_unlock(&q->sysfs_lock);
	return err;
}


/*
 * switch to new_e io scheduler. be careful not to introduce deadlocks -
 * we don't free the old io scheduler, before we have allocated what we
 * need for the new one. this way we have a chance of going back to the old
 * one, if the new one fails init for some reason.
 */
static int elevator_switch(struct request_queue *q, struct elevator_type *new_e)
{
	int err;

	lockdep_assert_held(&q->sysfs_lock);

	blk_mq_freeze_queue(q);
	blk_mq_quiesce_queue(q);

	err = elevator_switch_mq(q, new_e);

	blk_mq_unquiesce_queue(q);
	blk_mq_unfreeze_queue(q);

	return err;
}

/*
 * Switch this queue to the given IO scheduler.
 */
static int __elevator_change(struct request_queue *q, const char *name)
{
	char elevator_name[ELV_NAME_MAX];
	struct elevator_type *e;

	/* Make sure queue is not in the middle of being removed */
	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
		return -ENOENT;

	/*
	 * Special case for mq, turn off scheduling
	 */
	if (!strncmp(name, "none", 4)) {
		if (!q->elevator)
			return 0;
		return elevator_switch(q, NULL);
	}

	strlcpy(elevator_name, name, sizeof(elevator_name));
	e = elevator_get(q, strstrip(elevator_name), true);
	if (!e)
		return -EINVAL;

	if (q->elevator && elevator_match(q->elevator->type, elevator_name)) {
		elevator_put(e);
		return 0;
	}

	return elevator_switch(q, e);
}

static inline bool elv_support_iosched(struct request_queue *q)
{
	if (q->tag_set && (q->tag_set->flags & BLK_MQ_F_NO_SCHED))
		return false;
	return true;
}

ssize_t elv_iosched_store(struct request_queue *q, const char *name,
			  size_t count)
{
	int ret;

	if (!queue_is_mq(q) || !elv_support_iosched(q))
		return count;

	ret = __elevator_change(q, name);
	if (!ret)
		return count;

	return ret;
}

ssize_t elv_iosched_show(struct request_queue *q, char *name)
{
	struct elevator_queue *e = q->elevator;
	struct elevator_type *elv = NULL;
	struct elevator_type *__e;
	int len = 0;

	if (!queue_is_mq(q))
		return sprintf(name, "none\n");

	if (!q->elevator)
		len += sprintf(name+len, "[none] ");
	else
		elv = e->type;

	spin_lock(&elv_list_lock);
	list_for_each_entry(__e, &elv_list, list) {
		if (elv && elevator_match(elv, __e->elevator_name)) {
			len += sprintf(name+len, "[%s] ", elv->elevator_name);
			continue;
		}
		if (elv_support_iosched(q))
			len += sprintf(name+len, "%s ", __e->elevator_name);
	}
	spin_unlock(&elv_list_lock);

	if (q->elevator)
		len += sprintf(name+len, "none");

	len += sprintf(len+name, "\n");
	return len;
}

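/*
 * Note: elv_iosched_show() and elv_iosched_store() back the per-device
 * "scheduler" sysfs attribute, e.g.:
 *
 *	$ cat /sys/block/sda/queue/scheduler
 *	[mq-deadline] kyber bfq none
 *	$ echo kyber > /sys/block/sda/queue/scheduler
 *
 * Writing a name runs __elevator_change(); writing "none" detaches the
 * current scheduler.
 */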
struct request *elv_rb_former_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbprev = rb_prev(&rq->rb_node);

	if (rbprev)
		return rb_entry_rq(rbprev);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_former_request);

struct request *elv_rb_latter_request(struct request_queue *q,
				      struct request *rq)
{
	struct rb_node *rbnext = rb_next(&rq->rb_node);

	if (rbnext)
		return rb_entry_rq(rbnext);

	return NULL;
}
EXPORT_SYMBOL(elv_rb_latter_request);