/*
 * The Kyber I/O scheduler. Controls latency by throttling queue depths using
 * scalable techniques.
 *
 * Copyright (C) 2017 Facebook
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public
 * License v2 as published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
 * General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program. If not, see <https://www.gnu.org/licenses/>.
 */

#include <linux/kernel.h>
#include <linux/blkdev.h>
#include <linux/blk-mq.h>
#include <linux/elevator.h>
#include <linux/module.h>
#include <linux/sbitmap.h>

#include "blk.h"
#include "blk-mq.h"
#include "blk-mq-debugfs.h"
#include "blk-mq-sched.h"
#include "blk-mq-tag.h"
#include "blk-stat.h"

/* Scheduling domains. */
enum {
        KYBER_READ,
        KYBER_SYNC_WRITE,
        KYBER_OTHER, /* Async writes, discard, etc. */
        KYBER_NUM_DOMAINS,
};

enum {
        KYBER_MIN_DEPTH = 256,

        /*
         * In order to prevent starvation of synchronous requests by a flood of
         * asynchronous requests, we reserve 25% of requests for synchronous
         * operations.
         */
        KYBER_ASYNC_PERCENT = 75,
};

/*
 * Initial device-wide depths for each scheduling domain.
 *
 * Even for fast devices with lots of tags like NVMe, you can saturate
 * the device with only a fraction of the maximum possible queue depth.
 * So, we cap these to a reasonable value.
 */
static const unsigned int kyber_depth[] = {
        [KYBER_READ] = 256,
        [KYBER_SYNC_WRITE] = 128,
        [KYBER_OTHER] = 64,
};

/*
 * Scheduling domain batch sizes. We favor reads.
 */
static const unsigned int kyber_batch_size[] = {
        [KYBER_READ] = 16,
        [KYBER_SYNC_WRITE] = 8,
        [KYBER_OTHER] = 8,
};

/*
 * There is a 1:1 mapping between ctx & hctx and between kcq & khd; we use
 * request->mq_ctx->index_hw to index the kcq in khd.
 */
struct kyber_ctx_queue {
        /*
         * Used to ensure that operations on rq_list and kcq_map are atomic.
         * Also protects the requests on rq_list during merges.
         */
        spinlock_t lock;
        struct list_head rq_list[KYBER_NUM_DOMAINS];
} ____cacheline_aligned_in_smp;

struct kyber_queue_data {
        struct request_queue *q;

        struct blk_stat_callback *cb;

        /*
         * The device is divided into multiple scheduling domains based on the
         * request type. Each domain has a fixed number of in-flight requests
         * of that type device-wide, limited by these tokens.
         */
        struct sbitmap_queue domain_tokens[KYBER_NUM_DOMAINS];

        /*
         * Async request percentage, converted to per-word depth for
         * sbitmap_get_shallow().
         */
        unsigned int async_depth;

        /* Target latencies in nanoseconds. */
        u64 read_lat_nsec, write_lat_nsec;
};
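
/*
 * kyber_hctx_data holds the per-hardware-queue scheduling state: requests
 * flushed out of the per-ctx kcqs are collected on rqs[] and handed out as
 * domain tokens and the per-domain batch budget allow; cur_domain and
 * batching implement batched round-robin dispatch across the domains, and
 * domain_wait/wait_index are used to rerun the queue when a token frees up.
 */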
struct kyber_hctx_data {
        spinlock_t lock;
        struct list_head rqs[KYBER_NUM_DOMAINS];
        unsigned int cur_domain;
        unsigned int batching;
        struct kyber_ctx_queue *kcqs;
        struct sbitmap kcq_map[KYBER_NUM_DOMAINS];
        wait_queue_entry_t domain_wait[KYBER_NUM_DOMAINS];
        struct sbq_wait_state *domain_ws[KYBER_NUM_DOMAINS];
        atomic_t wait_index[KYBER_NUM_DOMAINS];
};

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
                             void *key);

static unsigned int kyber_sched_domain(unsigned int op)
{
        if ((op & REQ_OP_MASK) == REQ_OP_READ)
                return KYBER_READ;
        else if ((op & REQ_OP_MASK) == REQ_OP_WRITE && op_is_sync(op))
                return KYBER_SYNC_WRITE;
        else
                return KYBER_OTHER;
}

enum {
        NONE = 0,
        GOOD = 1,
        GREAT = 2,
        BAD = -1,
        AWFUL = -2,
};

#define IS_GOOD(status) ((status) > 0)
#define IS_BAD(status) ((status) < 0)

static int kyber_lat_status(struct blk_stat_callback *cb,
                            unsigned int sched_domain, u64 target)
{
        u64 latency;

        if (!cb->stat[sched_domain].nr_samples)
                return NONE;

        latency = cb->stat[sched_domain].mean;
        if (latency >= 2 * target)
                return AWFUL;
        else if (latency > target)
                return BAD;
        else if (latency <= target / 2)
                return GREAT;
        else /* (latency <= target) */
                return GOOD;
}

/*
 * Adjust the read or synchronous write depth given the status of reads and
 * writes. The goal is that the latencies of the two domains are fair (i.e., if
 * one is good, then the other is good).
 */
static void kyber_adjust_rw_depth(struct kyber_queue_data *kqd,
                                  unsigned int sched_domain, int this_status,
                                  int other_status)
{
        unsigned int orig_depth, depth;

        /*
         * If this domain had no samples, or reads and writes are both good or
         * both bad, don't adjust the depth.
         */
        if (this_status == NONE ||
            (IS_GOOD(this_status) && IS_GOOD(other_status)) ||
            (IS_BAD(this_status) && IS_BAD(other_status)))
                return;

        orig_depth = depth = kqd->domain_tokens[sched_domain].sb.depth;

        if (other_status == NONE) {
                depth++;
        } else {
                switch (this_status) {
                case GOOD:
                        if (other_status == AWFUL)
                                depth -= max(depth / 4, 1U);
                        else
                                depth -= max(depth / 8, 1U);
                        break;
                case GREAT:
                        if (other_status == AWFUL)
                                depth /= 2;
                        else
                                depth -= max(depth / 4, 1U);
                        break;
                case BAD:
                        depth++;
                        break;
                case AWFUL:
                        if (other_status == GREAT)
                                depth += 2;
                        else
                                depth++;
                        break;
                }
        }

        depth = clamp(depth, 1U, kyber_depth[sched_domain]);
        if (depth != orig_depth)
                sbitmap_queue_resize(&kqd->domain_tokens[sched_domain], depth);
}
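
/*
 * Illustrative example of the adjustment above: if this domain is GREAT while
 * the other is AWFUL, this domain's token depth is halved to relieve pressure
 * on the other; in the opposite case (this AWFUL, other GREAT), this domain's
 * depth is bumped by 2. The exact steps are those in the switch statement.
 */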

/*
 * Adjust the depth of other requests given the status of reads and synchronous
 * writes. As long as either domain is doing fine, we don't throttle, but if
 * both domains are doing badly, we throttle heavily.
 */
static void kyber_adjust_other_depth(struct kyber_queue_data *kqd,
                                     int read_status, int write_status,
                                     bool have_samples)
{
        unsigned int orig_depth, depth;
        int status;

        orig_depth = depth = kqd->domain_tokens[KYBER_OTHER].sb.depth;

        if (read_status == NONE && write_status == NONE) {
                depth += 2;
        } else if (have_samples) {
                if (read_status == NONE)
                        status = write_status;
                else if (write_status == NONE)
                        status = read_status;
                else
                        status = max(read_status, write_status);
                switch (status) {
                case GREAT:
                        depth += 2;
                        break;
                case GOOD:
                        depth++;
                        break;
                case BAD:
                        depth -= max(depth / 4, 1U);
                        break;
                case AWFUL:
                        depth /= 2;
                        break;
                }
        }

        depth = clamp(depth, 1U, kyber_depth[KYBER_OTHER]);
        if (depth != orig_depth)
                sbitmap_queue_resize(&kqd->domain_tokens[KYBER_OTHER], depth);
}

/*
 * Apply heuristics for limiting queue depths based on gathered latency
 * statistics.
 */
static void kyber_stat_timer_fn(struct blk_stat_callback *cb)
{
        struct kyber_queue_data *kqd = cb->data;
        int read_status, write_status;

        read_status = kyber_lat_status(cb, KYBER_READ, kqd->read_lat_nsec);
        write_status = kyber_lat_status(cb, KYBER_SYNC_WRITE, kqd->write_lat_nsec);

        kyber_adjust_rw_depth(kqd, KYBER_READ, read_status, write_status);
        kyber_adjust_rw_depth(kqd, KYBER_SYNC_WRITE, write_status, read_status);
        kyber_adjust_other_depth(kqd, read_status, write_status,
                                 cb->stat[KYBER_OTHER].nr_samples != 0);

        /*
         * Continue monitoring latencies if we aren't hitting the targets or
         * we're still throttling other requests.
         */
        if (!blk_stat_is_active(kqd->cb) &&
            ((IS_BAD(read_status) || IS_BAD(write_status) ||
              kqd->domain_tokens[KYBER_OTHER].sb.depth < kyber_depth[KYBER_OTHER])))
                blk_stat_activate_msecs(kqd->cb, 100);
}

static unsigned int kyber_sched_tags_shift(struct kyber_queue_data *kqd)
{
        /*
         * All of the hardware queues have the same depth, so we can just grab
         * the shift of the first one.
         */
        return kqd->q->queue_hw_ctx[0]->sched_tags->bitmap_tags.sb.shift;
}

static int kyber_bucket_fn(const struct request *rq)
{
        return kyber_sched_domain(rq->cmd_flags);
}
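
/*
 * kyber_queue_data_alloc() below sets up the device-wide scheduler state: the
 * latency stats callback, one token sbitmap per scheduling domain (allocated
 * at the hardware queue depth, then resized down to kyber_depth[]), the async
 * depth derived from KYBER_ASYNC_PERCENT, and the default latency targets of
 * 2 ms for reads and 10 ms for synchronous writes.
 */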

static struct kyber_queue_data *kyber_queue_data_alloc(struct request_queue *q)
{
        struct kyber_queue_data *kqd;
        unsigned int max_tokens;
        unsigned int shift;
        int ret = -ENOMEM;
        int i;

        kqd = kmalloc_node(sizeof(*kqd), GFP_KERNEL, q->node);
        if (!kqd)
                goto err;
        kqd->q = q;

        kqd->cb = blk_stat_alloc_callback(kyber_stat_timer_fn, kyber_bucket_fn,
                                          KYBER_NUM_DOMAINS, kqd);
        if (!kqd->cb)
                goto err_kqd;

        /*
         * The maximum number of tokens for any scheduling domain is at least
         * the queue depth of a single hardware queue. If the hardware doesn't
         * have many tags, still provide a reasonable number.
         */
        max_tokens = max_t(unsigned int, q->tag_set->queue_depth,
                           KYBER_MIN_DEPTH);
        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                WARN_ON(!kyber_depth[i]);
                WARN_ON(!kyber_batch_size[i]);
                ret = sbitmap_queue_init_node(&kqd->domain_tokens[i],
                                              max_tokens, -1, false, GFP_KERNEL,
                                              q->node);
                if (ret) {
                        while (--i >= 0)
                                sbitmap_queue_free(&kqd->domain_tokens[i]);
                        goto err_cb;
                }
                sbitmap_queue_resize(&kqd->domain_tokens[i], kyber_depth[i]);
        }

        shift = kyber_sched_tags_shift(kqd);
        kqd->async_depth = (1U << shift) * KYBER_ASYNC_PERCENT / 100U;

        kqd->read_lat_nsec = 2000000ULL;
        kqd->write_lat_nsec = 10000000ULL;

        return kqd;

err_cb:
        blk_stat_free_callback(kqd->cb);
err_kqd:
        kfree(kqd);
err:
        return ERR_PTR(ret);
}

static int kyber_init_sched(struct request_queue *q, struct elevator_type *e)
{
        struct kyber_queue_data *kqd;
        struct elevator_queue *eq;

        eq = elevator_alloc(q, e);
        if (!eq)
                return -ENOMEM;

        kqd = kyber_queue_data_alloc(q);
        if (IS_ERR(kqd)) {
                kobject_put(&eq->kobj);
                return PTR_ERR(kqd);
        }

        eq->elevator_data = kqd;
        q->elevator = eq;

        blk_stat_add_callback(q, kqd->cb);

        return 0;
}

static void kyber_exit_sched(struct elevator_queue *e)
{
        struct kyber_queue_data *kqd = e->elevator_data;
        struct request_queue *q = kqd->q;
        int i;

        blk_stat_remove_callback(q, kqd->cb);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                sbitmap_queue_free(&kqd->domain_tokens[i]);
        blk_stat_free_callback(kqd->cb);
        kfree(kqd);
}

static void kyber_ctx_queue_init(struct kyber_ctx_queue *kcq)
{
        unsigned int i;

        spin_lock_init(&kcq->lock);
        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                INIT_LIST_HEAD(&kcq->rq_list[i]);
}

static int kyber_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
        struct kyber_hctx_data *khd;
        int i;

        khd = kmalloc_node(sizeof(*khd), GFP_KERNEL, hctx->numa_node);
        if (!khd)
                return -ENOMEM;

        khd->kcqs = kmalloc_array_node(hctx->nr_ctx,
                                       sizeof(struct kyber_ctx_queue),
                                       GFP_KERNEL, hctx->numa_node);
        if (!khd->kcqs)
                goto err_khd;

        for (i = 0; i < hctx->nr_ctx; i++)
                kyber_ctx_queue_init(&khd->kcqs[i]);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                if (sbitmap_init_node(&khd->kcq_map[i], hctx->nr_ctx,
                                      ilog2(8), GFP_KERNEL, hctx->numa_node)) {
                        while (--i >= 0)
                                sbitmap_free(&khd->kcq_map[i]);
                        goto err_kcqs;
                }
        }

        spin_lock_init(&khd->lock);

        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                INIT_LIST_HEAD(&khd->rqs[i]);
                init_waitqueue_func_entry(&khd->domain_wait[i],
                                          kyber_domain_wake);
                khd->domain_wait[i].private = hctx;
                INIT_LIST_HEAD(&khd->domain_wait[i].entry);
                atomic_set(&khd->wait_index[i], 0);
        }

        khd->cur_domain = 0;
        khd->batching = 0;

        hctx->sched_data = khd;
        sbitmap_queue_min_shallow_depth(&hctx->sched_tags->bitmap_tags,
                                        kqd->async_depth);

        return 0;

err_kcqs:
        kfree(khd->kcqs);
err_khd:
        kfree(khd);
        return -ENOMEM;
}

static void kyber_exit_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        int i;

        for (i = 0; i < KYBER_NUM_DOMAINS; i++)
                sbitmap_free(&khd->kcq_map[i]);
        kfree(khd->kcqs);
        kfree(hctx->sched_data);
}
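
/*
 * The helpers below stash a request's domain token number in rq->elv.priv[0];
 * a value of -1 means the request does not hold a token (that is what
 * kyber_prepare_request() initializes it to).
 */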

static int rq_get_domain_token(struct request *rq)
{
        return (long)rq->elv.priv[0];
}

static void rq_set_domain_token(struct request *rq, int token)
{
        rq->elv.priv[0] = (void *)(long)token;
}

static void rq_clear_domain_token(struct kyber_queue_data *kqd,
                                  struct request *rq)
{
        unsigned int sched_domain;
        int nr;

        nr = rq_get_domain_token(rq);
        if (nr != -1) {
                sched_domain = kyber_sched_domain(rq->cmd_flags);
                sbitmap_queue_clear(&kqd->domain_tokens[sched_domain], nr,
                                    rq->mq_ctx->cpu);
        }
}
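
/*
 * kyber_limit_depth() below caps scheduler tag allocation for async requests
 * at kqd->async_depth. As an illustrative example (the actual shift comes from
 * the device's sched_tags): with 64 tags per sbitmap word (shift == 6),
 * async_depth is 64 * 75 / 100 = 48, so async requests may take at most 48 of
 * every 64 scheduler tags, leaving the rest for synchronous I/O.
 */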

static void kyber_limit_depth(unsigned int op, struct blk_mq_alloc_data *data)
{
        /*
         * We use the scheduler tags as per-hardware queue queueing tokens.
         * Async requests can be limited at this stage.
         */
        if (!op_is_sync(op)) {
                struct kyber_queue_data *kqd = data->q->elevator->elevator_data;

                data->shallow_depth = kqd->async_depth;
        }
}

static bool kyber_bio_merge(struct blk_mq_hw_ctx *hctx, struct bio *bio)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        struct blk_mq_ctx *ctx = blk_mq_get_ctx(hctx->queue);
        struct kyber_ctx_queue *kcq = &khd->kcqs[ctx->index_hw];
        unsigned int sched_domain = kyber_sched_domain(bio->bi_opf);
        struct list_head *rq_list = &kcq->rq_list[sched_domain];
        bool merged;

        spin_lock(&kcq->lock);
        merged = blk_mq_bio_list_merge(hctx->queue, rq_list, bio);
        spin_unlock(&kcq->lock);
        blk_mq_put_ctx(ctx);

        return merged;
}

static void kyber_prepare_request(struct request *rq, struct bio *bio)
{
        rq_set_domain_token(rq, -1);
}

static void kyber_insert_requests(struct blk_mq_hw_ctx *hctx,
                                  struct list_head *rq_list, bool at_head)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        struct request *rq, *next;

        list_for_each_entry_safe(rq, next, rq_list, queuelist) {
                unsigned int sched_domain = kyber_sched_domain(rq->cmd_flags);
                struct kyber_ctx_queue *kcq = &khd->kcqs[rq->mq_ctx->index_hw];
                struct list_head *head = &kcq->rq_list[sched_domain];

                spin_lock(&kcq->lock);
                if (at_head)
                        list_move(&rq->queuelist, head);
                else
                        list_move_tail(&rq->queuelist, head);
                sbitmap_set_bit(&khd->kcq_map[sched_domain],
                                rq->mq_ctx->index_hw);
                blk_mq_sched_request_inserted(rq);
                spin_unlock(&kcq->lock);
        }
}

static void kyber_finish_request(struct request *rq)
{
        struct kyber_queue_data *kqd = rq->q->elevator->elevator_data;

        rq_clear_domain_token(kqd, rq);
}

static void kyber_completed_request(struct request *rq)
{
        struct request_queue *q = rq->q;
        struct kyber_queue_data *kqd = q->elevator->elevator_data;
        unsigned int sched_domain;
        u64 now, latency, target;

        /*
         * Check if this request met our latency goal. If not, quickly gather
         * some statistics and start throttling.
         */
        sched_domain = kyber_sched_domain(rq->cmd_flags);
        switch (sched_domain) {
        case KYBER_READ:
                target = kqd->read_lat_nsec;
                break;
        case KYBER_SYNC_WRITE:
                target = kqd->write_lat_nsec;
                break;
        default:
                return;
        }

        /* If we are already monitoring latencies, don't check again. */
        if (blk_stat_is_active(kqd->cb))
                return;

        now = ktime_get_ns();
        if (now < rq->io_start_time_ns)
                return;

        latency = now - rq->io_start_time_ns;

        if (latency > target)
                blk_stat_activate_msecs(kqd->cb, 10);
}

struct flush_kcq_data {
        struct kyber_hctx_data *khd;
        unsigned int sched_domain;
        struct list_head *list;
};

static bool flush_busy_kcq(struct sbitmap *sb, unsigned int bitnr, void *data)
{
        struct flush_kcq_data *flush_data = data;
        struct kyber_ctx_queue *kcq = &flush_data->khd->kcqs[bitnr];

        spin_lock(&kcq->lock);
        list_splice_tail_init(&kcq->rq_list[flush_data->sched_domain],
                              flush_data->list);
        sbitmap_clear_bit(sb, bitnr);
        spin_unlock(&kcq->lock);

        return true;
}

static void kyber_flush_busy_kcqs(struct kyber_hctx_data *khd,
                                  unsigned int sched_domain,
                                  struct list_head *list)
{
        struct flush_kcq_data data = {
                .khd = khd,
                .sched_domain = sched_domain,
                .list = list,
        };

        sbitmap_for_each_set(&khd->kcq_map[sched_domain],
                             flush_busy_kcq, &data);
}

static int kyber_domain_wake(wait_queue_entry_t *wait, unsigned mode, int flags,
                             void *key)
{
        struct blk_mq_hw_ctx *hctx = READ_ONCE(wait->private);

        list_del_init(&wait->entry);
        blk_mq_run_hw_queue(hctx, true);
        return 1;
}

static int kyber_get_domain_token(struct kyber_queue_data *kqd,
                                  struct kyber_hctx_data *khd,
                                  struct blk_mq_hw_ctx *hctx)
{
        unsigned int sched_domain = khd->cur_domain;
        struct sbitmap_queue *domain_tokens = &kqd->domain_tokens[sched_domain];
        wait_queue_entry_t *wait = &khd->domain_wait[sched_domain];
        struct sbq_wait_state *ws;
        int nr;

        nr = __sbitmap_queue_get(domain_tokens);

        /*
         * If we failed to get a domain token, make sure the hardware queue is
         * run when one becomes available. Note that this is serialized on
         * khd->lock, but we still need to be careful about the waker.
         */
        if (nr < 0 && list_empty_careful(&wait->entry)) {
                ws = sbq_wait_ptr(domain_tokens,
                                  &khd->wait_index[sched_domain]);
                khd->domain_ws[sched_domain] = ws;
                add_wait_queue(&ws->wait, wait);

                /*
                 * Try again in case a token was freed before we got on the
                 * wait queue.
                 */
                nr = __sbitmap_queue_get(domain_tokens);
        }

        /*
         * If we got a token while we were on the wait queue, remove ourselves
         * from the wait queue to ensure that all wake ups make forward
         * progress. It's possible that the waker already deleted the entry
         * between the !list_empty_careful() check and us grabbing the lock, but
         * list_del_init() is okay with that.
         */
        if (nr >= 0 && !list_empty_careful(&wait->entry)) {
                ws = khd->domain_ws[sched_domain];
                spin_lock_irq(&ws->wait.lock);
                list_del_init(&wait->entry);
                spin_unlock_irq(&ws->wait.lock);
        }

        return nr;
}

static struct request *
kyber_dispatch_cur_domain(struct kyber_queue_data *kqd,
                          struct kyber_hctx_data *khd,
                          struct blk_mq_hw_ctx *hctx)
{
        struct list_head *rqs;
        struct request *rq;
        int nr;

        rqs = &khd->rqs[khd->cur_domain];

        /*
         * If we already have a flushed request, then we just need to get a
         * token for it. Otherwise, if there are pending requests in the kcqs,
         * flush the kcqs, but only if we can get a token. If not, we should
         * leave the requests in the kcqs so that they can be merged. Note that
         * khd->lock serializes the flushes, so if we observed any bit set in
         * the kcq_map, we will always get a request.
         */
        rq = list_first_entry_or_null(rqs, struct request, queuelist);
        if (rq) {
                nr = kyber_get_domain_token(kqd, khd, hctx);
                if (nr >= 0) {
                        khd->batching++;
                        rq_set_domain_token(rq, nr);
                        list_del_init(&rq->queuelist);
                        return rq;
                }
        } else if (sbitmap_any_bit_set(&khd->kcq_map[khd->cur_domain])) {
                nr = kyber_get_domain_token(kqd, khd, hctx);
                if (nr >= 0) {
                        kyber_flush_busy_kcqs(khd, khd->cur_domain, rqs);
                        rq = list_first_entry(rqs, struct request, queuelist);
                        khd->batching++;
                        rq_set_domain_token(rq, nr);
                        list_del_init(&rq->queuelist);
                        return rq;
                }
        }

        /* There were either no pending requests or no tokens. */
        return NULL;
}
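
/*
 * Dispatch is batched and round-robin: kyber_dispatch_request() pulls up to
 * kyber_batch_size[] requests from the current domain, then moves on to the
 * next domain that has both work and tokens. For example, after a batch of
 * sync writes the scheduler tries KYBER_OTHER, then KYBER_READ, before
 * wrapping back around to KYBER_SYNC_WRITE.
 */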

static struct request *kyber_dispatch_request(struct blk_mq_hw_ctx *hctx)
{
        struct kyber_queue_data *kqd = hctx->queue->elevator->elevator_data;
        struct kyber_hctx_data *khd = hctx->sched_data;
        struct request *rq;
        int i;

        spin_lock(&khd->lock);

        /*
         * First, if we are still entitled to batch, try to dispatch a request
         * from the batch.
         */
        if (khd->batching < kyber_batch_size[khd->cur_domain]) {
                rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
                if (rq)
                        goto out;
        }

        /*
         * Either,
         * 1. We were no longer entitled to a batch.
         * 2. The domain we were batching didn't have any requests.
         * 3. The domain we were batching was out of tokens.
         *
         * Start another batch. Note that this wraps back around to the original
         * domain if no other domains have requests or tokens.
         */
        khd->batching = 0;
        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                if (khd->cur_domain == KYBER_NUM_DOMAINS - 1)
                        khd->cur_domain = 0;
                else
                        khd->cur_domain++;

                rq = kyber_dispatch_cur_domain(kqd, khd, hctx);
                if (rq)
                        goto out;
        }

        rq = NULL;
out:
        spin_unlock(&khd->lock);
        return rq;
}

static bool kyber_has_work(struct blk_mq_hw_ctx *hctx)
{
        struct kyber_hctx_data *khd = hctx->sched_data;
        int i;

        for (i = 0; i < KYBER_NUM_DOMAINS; i++) {
                if (!list_empty_careful(&khd->rqs[i]) ||
                    sbitmap_any_bit_set(&khd->kcq_map[i]))
                        return true;
        }

        return false;
}

#define KYBER_LAT_SHOW_STORE(op) \
static ssize_t kyber_##op##_lat_show(struct elevator_queue *e, \
                                     char *page) \
{ \
        struct kyber_queue_data *kqd = e->elevator_data; \
\
        return sprintf(page, "%llu\n", kqd->op##_lat_nsec); \
} \
\
static ssize_t kyber_##op##_lat_store(struct elevator_queue *e, \
                                      const char *page, size_t count) \
{ \
        struct kyber_queue_data *kqd = e->elevator_data; \
        unsigned long long nsec; \
        int ret; \
\
        ret = kstrtoull(page, 10, &nsec); \
        if (ret) \
                return ret; \
\
        kqd->op##_lat_nsec = nsec; \
\
        return count; \
}
KYBER_LAT_SHOW_STORE(read);
KYBER_LAT_SHOW_STORE(write);
#undef KYBER_LAT_SHOW_STORE

#define KYBER_LAT_ATTR(op) __ATTR(op##_lat_nsec, 0644, kyber_##op##_lat_show, kyber_##op##_lat_store)
static struct elv_fs_entry kyber_sched_attrs[] = {
        KYBER_LAT_ATTR(read),
        KYBER_LAT_ATTR(write),
        __ATTR_NULL
};
#undef KYBER_LAT_ATTR
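
/*
 * The latency targets can be tuned at runtime through the elevator sysfs
 * attributes defined above, e.g. (illustrative path, following the usual
 * /sys/block/<dev>/queue/iosched/ layout):
 *
 *     echo 2000000 > /sys/block/<dev>/queue/iosched/read_lat_nsec
 */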

#ifdef CONFIG_BLK_DEBUG_FS
#define KYBER_DEBUGFS_DOMAIN_ATTRS(domain, name) \
static int kyber_##name##_tokens_show(void *data, struct seq_file *m) \
{ \
        struct request_queue *q = data; \
        struct kyber_queue_data *kqd = q->elevator->elevator_data; \
\
        sbitmap_queue_show(&kqd->domain_tokens[domain], m); \
        return 0; \
} \
\
static void *kyber_##name##_rqs_start(struct seq_file *m, loff_t *pos) \
        __acquires(&khd->lock) \
{ \
        struct blk_mq_hw_ctx *hctx = m->private; \
        struct kyber_hctx_data *khd = hctx->sched_data; \
\
        spin_lock(&khd->lock); \
        return seq_list_start(&khd->rqs[domain], *pos); \
} \
\
static void *kyber_##name##_rqs_next(struct seq_file *m, void *v, \
                                     loff_t *pos) \
{ \
        struct blk_mq_hw_ctx *hctx = m->private; \
        struct kyber_hctx_data *khd = hctx->sched_data; \
\
        return seq_list_next(v, &khd->rqs[domain], pos); \
} \
\
static void kyber_##name##_rqs_stop(struct seq_file *m, void *v) \
        __releases(&khd->lock) \
{ \
        struct blk_mq_hw_ctx *hctx = m->private; \
        struct kyber_hctx_data *khd = hctx->sched_data; \
\
        spin_unlock(&khd->lock); \
} \
\
static const struct seq_operations kyber_##name##_rqs_seq_ops = { \
        .start = kyber_##name##_rqs_start, \
        .next = kyber_##name##_rqs_next, \
        .stop = kyber_##name##_rqs_stop, \
        .show = blk_mq_debugfs_rq_show, \
}; \
\
static int kyber_##name##_waiting_show(void *data, struct seq_file *m) \
{ \
        struct blk_mq_hw_ctx *hctx = data; \
        struct kyber_hctx_data *khd = hctx->sched_data; \
        wait_queue_entry_t *wait = &khd->domain_wait[domain]; \
\
        seq_printf(m, "%d\n", !list_empty_careful(&wait->entry)); \
        return 0; \
}
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_READ, read)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_SYNC_WRITE, sync_write)
KYBER_DEBUGFS_DOMAIN_ATTRS(KYBER_OTHER, other)
#undef KYBER_DEBUGFS_DOMAIN_ATTRS

static int kyber_async_depth_show(void *data, struct seq_file *m)
{
        struct request_queue *q = data;
        struct kyber_queue_data *kqd = q->elevator->elevator_data;

        seq_printf(m, "%u\n", kqd->async_depth);
        return 0;
}

static int kyber_cur_domain_show(void *data, struct seq_file *m)
{
        struct blk_mq_hw_ctx *hctx = data;
        struct kyber_hctx_data *khd = hctx->sched_data;

        switch (khd->cur_domain) {
        case KYBER_READ:
                seq_puts(m, "READ\n");
                break;
        case KYBER_SYNC_WRITE:
                seq_puts(m, "SYNC_WRITE\n");
                break;
        case KYBER_OTHER:
                seq_puts(m, "OTHER\n");
                break;
        default:
                seq_printf(m, "%u\n", khd->cur_domain);
                break;
        }
        return 0;
}

static int kyber_batching_show(void *data, struct seq_file *m)
{
        struct blk_mq_hw_ctx *hctx = data;
        struct kyber_hctx_data *khd = hctx->sched_data;

        seq_printf(m, "%u\n", khd->batching);
        return 0;
}

#define KYBER_QUEUE_DOMAIN_ATTRS(name) \
        {#name "_tokens", 0400, kyber_##name##_tokens_show}
static const struct blk_mq_debugfs_attr kyber_queue_debugfs_attrs[] = {
        KYBER_QUEUE_DOMAIN_ATTRS(read),
        KYBER_QUEUE_DOMAIN_ATTRS(sync_write),
        KYBER_QUEUE_DOMAIN_ATTRS(other),
        {"async_depth", 0400, kyber_async_depth_show},
        {},
};
#undef KYBER_QUEUE_DOMAIN_ATTRS

#define KYBER_HCTX_DOMAIN_ATTRS(name) \
        {#name "_rqs", 0400, .seq_ops = &kyber_##name##_rqs_seq_ops}, \
        {#name "_waiting", 0400, kyber_##name##_waiting_show}
static const struct blk_mq_debugfs_attr kyber_hctx_debugfs_attrs[] = {
        KYBER_HCTX_DOMAIN_ATTRS(read),
        KYBER_HCTX_DOMAIN_ATTRS(sync_write),
        KYBER_HCTX_DOMAIN_ATTRS(other),
        {"cur_domain", 0400, kyber_cur_domain_show},
        {"batching", 0400, kyber_batching_show},
        {},
};
#undef KYBER_HCTX_DOMAIN_ATTRS
#endif
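
/*
 * Note: when CONFIG_BLK_DEBUG_FS is enabled, the attributes above are exposed
 * through the blk-mq debugfs hierarchy (typically under
 * /sys/kernel/debug/block/<dev>/). Kyber itself is selected per request queue
 * at runtime, e.g.:
 *
 *     echo kyber > /sys/block/<dev>/queue/scheduler
 */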

static struct elevator_type kyber_sched = {
        .ops.mq = {
                .init_sched = kyber_init_sched,
                .exit_sched = kyber_exit_sched,
                .init_hctx = kyber_init_hctx,
                .exit_hctx = kyber_exit_hctx,
                .limit_depth = kyber_limit_depth,
                .bio_merge = kyber_bio_merge,
                .prepare_request = kyber_prepare_request,
                .insert_requests = kyber_insert_requests,
                .finish_request = kyber_finish_request,
                .requeue_request = kyber_finish_request,
                .completed_request = kyber_completed_request,
                .dispatch_request = kyber_dispatch_request,
                .has_work = kyber_has_work,
        },
        .uses_mq = true,
#ifdef CONFIG_BLK_DEBUG_FS
        .queue_debugfs_attrs = kyber_queue_debugfs_attrs,
        .hctx_debugfs_attrs = kyber_hctx_debugfs_attrs,
#endif
        .elevator_attrs = kyber_sched_attrs,
        .elevator_name = "kyber",
        .elevator_owner = THIS_MODULE,
};

static int __init kyber_init(void)
{
        return elv_register(&kyber_sched);
}

static void __exit kyber_exit(void)
{
        elv_unregister(&kyber_sched);
}

module_init(kyber_init);
module_exit(kyber_exit);

MODULE_AUTHOR("Omar Sandoval");
MODULE_LICENSE("GPL");
MODULE_DESCRIPTION("Kyber I/O scheduler");