1 /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 /* 3 * Header file for the BFQ I/O scheduler: data structures and 4 * prototypes of interface functions among BFQ components. 5 */ 6 #ifndef _BFQ_H 7 #define _BFQ_H 8 9 #include <linux/blktrace_api.h> 10 #include <linux/hrtimer.h> 11 12 #include "blk-cgroup-rwstat.h" 13 14 #define BFQ_IOPRIO_CLASSES 3 15 #define BFQ_CL_IDLE_TIMEOUT (HZ/5) 16 17 #define BFQ_MIN_WEIGHT 1 18 #define BFQ_MAX_WEIGHT 1000 19 #define BFQ_WEIGHT_CONVERSION_COEFF 10 20 21 #define BFQ_DEFAULT_QUEUE_IOPRIO 4 22 23 #define BFQ_WEIGHT_LEGACY_DFL 100 24 #define BFQ_DEFAULT_GRP_IOPRIO 0 25 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE 26 27 #define MAX_BFQQ_NAME_LENGTH 16 28 29 /* 30 * Soft real-time applications are extremely more latency sensitive 31 * than interactive ones. Over-raise the weight of the former to 32 * privilege them against the latter. 33 */ 34 #define BFQ_SOFTRT_WEIGHT_FACTOR 100 35 36 /* 37 * Maximum number of actuators supported. This constant is used simply 38 * to define the size of the static array that will contain 39 * per-actuator data. The current value is hopefully a good upper 40 * bound to the possible number of actuators of any actual drive. 41 */ 42 #define BFQ_MAX_ACTUATORS 8 43 44 struct bfq_entity; 45 46 /** 47 * struct bfq_service_tree - per ioprio_class service tree. 48 * 49 * Each service tree represents a B-WF2Q+ scheduler on its own. Each 50 * ioprio_class has its own independent scheduler, and so its own 51 * bfq_service_tree. All the fields are protected by the queue lock 52 * of the containing bfqd. 53 */ 54 struct bfq_service_tree { 55 /* tree for active entities (i.e., those backlogged) */ 56 struct rb_root active; 57 /* tree for idle entities (i.e., not backlogged, with V < F_i)*/ 58 struct rb_root idle; 59 60 /* idle entity with minimum F_i */ 61 struct bfq_entity *first_idle; 62 /* idle entity with maximum F_i */ 63 struct bfq_entity *last_idle; 64 65 /* scheduler virtual time */ 66 u64 vtime; 67 /* scheduler weight sum; active and idle entities contribute to it */ 68 unsigned long wsum; 69 }; 70 71 /** 72 * struct bfq_sched_data - multi-class scheduler. 73 * 74 * bfq_sched_data is the basic scheduler queue. It supports three 75 * ioprio_classes, and can be used either as a toplevel queue or as an 76 * intermediate queue in a hierarchical setup. 77 * 78 * The supported ioprio_classes are the same as in CFQ, in descending 79 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. 80 * Requests from higher priority queues are served before all the 81 * requests from lower priority queues; among requests of the same 82 * queue requests are served according to B-WF2Q+. 83 * 84 * The schedule is implemented by the service trees, plus the field 85 * @next_in_service, which points to the entity on the active trees 86 * that will be served next, if 1) no changes in the schedule occurs 87 * before the current in-service entity is expired, 2) the in-service 88 * queue becomes idle when it expires, and 3) if the entity pointed by 89 * in_service_entity is not a queue, then the in-service child entity 90 * of the entity pointed by in_service_entity becomes idle on 91 * expiration. This peculiar definition allows for the following 92 * optimization, not yet exploited: while a given entity is still in 93 * service, we already know which is the best candidate for next 94 * service among the other active entities in the same parent 95 * entity. We can then quickly compare the timestamps of the 96 * in-service entity with those of such best candidate. 97 * 98 * All fields are protected by the lock of the containing bfqd. 99 */ 100 struct bfq_sched_data { 101 /* entity in service */ 102 struct bfq_entity *in_service_entity; 103 /* head-of-line entity (see comments above) */ 104 struct bfq_entity *next_in_service; 105 /* array of service trees, one per ioprio_class */ 106 struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; 107 /* last time CLASS_IDLE was served */ 108 unsigned long bfq_class_idle_last_service; 109 110 }; 111 112 /** 113 * struct bfq_weight_counter - counter of the number of all active queues 114 * with a given weight. 115 */ 116 struct bfq_weight_counter { 117 unsigned int weight; /* weight of the queues this counter refers to */ 118 unsigned int num_active; /* nr of active queues with this weight */ 119 /* 120 * Weights tree member (see bfq_data's @queue_weights_tree) 121 */ 122 struct rb_node weights_node; 123 }; 124 125 /** 126 * struct bfq_entity - schedulable entity. 127 * 128 * A bfq_entity is used to represent either a bfq_queue (leaf node in the 129 * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each 130 * entity belongs to the sched_data of the parent group in the cgroup 131 * hierarchy. Non-leaf entities have also their own sched_data, stored 132 * in @my_sched_data. 133 * 134 * Each entity stores independently its priority values; this would 135 * allow different weights on different devices, but this 136 * functionality is not exported to userspace by now. Priorities and 137 * weights are updated lazily, first storing the new values into the 138 * new_* fields, then setting the @prio_changed flag. As soon as 139 * there is a transition in the entity state that allows the priority 140 * update to take place the effective and the requested priority 141 * values are synchronized. 142 * 143 * Unless cgroups are used, the weight value is calculated from the 144 * ioprio to export the same interface as CFQ. When dealing with 145 * "well-behaved" queues (i.e., queues that do not spend too much 146 * time to consume their budget and have true sequential behavior, and 147 * when there are no external factors breaking anticipation) the 148 * relative weights at each level of the cgroups hierarchy should be 149 * guaranteed. All the fields are protected by the queue lock of the 150 * containing bfqd. 151 */ 152 struct bfq_entity { 153 /* service_tree member */ 154 struct rb_node rb_node; 155 156 /* 157 * Flag, true if the entity is on a tree (either the active or 158 * the idle one of its service_tree) or is in service. 159 */ 160 bool on_st_or_in_serv; 161 162 /* B-WF2Q+ start and finish timestamps [sectors/weight] */ 163 u64 start, finish; 164 165 /* tree the entity is enqueued into; %NULL if not on a tree */ 166 struct rb_root *tree; 167 168 /* 169 * minimum start time of the (active) subtree rooted at this 170 * entity; used for O(log N) lookups into active trees 171 */ 172 u64 min_start; 173 174 /* amount of service received during the last service slot */ 175 int service; 176 177 /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ 178 int budget; 179 180 /* Number of requests allocated in the subtree of this entity */ 181 int allocated; 182 183 /* device weight, if non-zero, it overrides the default weight of 184 * bfq_group_data */ 185 int dev_weight; 186 /* weight of the queue */ 187 int weight; 188 /* next weight if a change is in progress */ 189 int new_weight; 190 191 /* original weight, used to implement weight boosting */ 192 int orig_weight; 193 194 /* parent entity, for hierarchical scheduling */ 195 struct bfq_entity *parent; 196 197 /* 198 * For non-leaf nodes in the hierarchy, the associated 199 * scheduler queue, %NULL on leaf nodes. 200 */ 201 struct bfq_sched_data *my_sched_data; 202 /* the scheduler queue this entity belongs to */ 203 struct bfq_sched_data *sched_data; 204 205 /* flag, set to request a weight, ioprio or ioprio_class change */ 206 int prio_changed; 207 208 #ifdef CONFIG_BFQ_GROUP_IOSCHED 209 /* flag, set if the entity is counted in groups_with_pending_reqs */ 210 bool in_groups_with_pending_reqs; 211 #endif 212 213 /* last child queue of entity created (for non-leaf entities) */ 214 struct bfq_queue *last_bfqq_created; 215 }; 216 217 struct bfq_group; 218 219 /** 220 * struct bfq_ttime - per process thinktime stats. 221 */ 222 struct bfq_ttime { 223 /* completion time of the last request */ 224 u64 last_end_request; 225 226 /* total process thinktime */ 227 u64 ttime_total; 228 /* number of thinktime samples */ 229 unsigned long ttime_samples; 230 /* average process thinktime */ 231 u64 ttime_mean; 232 }; 233 234 /** 235 * struct bfq_queue - leaf schedulable entity. 236 * 237 * A bfq_queue is a leaf request queue; it can be associated with an 238 * io_context or more, if it is async or shared between cooperating 239 * processes. Besides, it contains I/O requests for only one actuator 240 * (an io_context is associated with a different bfq_queue for each 241 * actuator it generates I/O for). @cgroup holds a reference to the 242 * cgroup, to be sure that it does not disappear while a bfqq still 243 * references it (mostly to avoid races between request issuing and 244 * task migration followed by cgroup destruction). All the fields are 245 * protected by the queue lock of the containing bfqd. 246 */ 247 struct bfq_queue { 248 /* reference counter */ 249 int ref; 250 /* counter of references from other queues for delayed stable merge */ 251 int stable_ref; 252 /* parent bfq_data */ 253 struct bfq_data *bfqd; 254 255 /* current ioprio and ioprio class */ 256 unsigned short ioprio, ioprio_class; 257 /* next ioprio and ioprio class if a change is in progress */ 258 unsigned short new_ioprio, new_ioprio_class; 259 260 /* last total-service-time sample, see bfq_update_inject_limit() */ 261 u64 last_serv_time_ns; 262 /* limit for request injection */ 263 unsigned int inject_limit; 264 /* last time the inject limit has been decreased, in jiffies */ 265 unsigned long decrease_time_jif; 266 267 /* 268 * Shared bfq_queue if queue is cooperating with one or more 269 * other queues. 270 */ 271 struct bfq_queue *new_bfqq; 272 /* request-position tree member (see bfq_group's @rq_pos_tree) */ 273 struct rb_node pos_node; 274 /* request-position tree root (see bfq_group's @rq_pos_tree) */ 275 struct rb_root *pos_root; 276 277 /* sorted list of pending requests */ 278 struct rb_root sort_list; 279 /* if fifo isn't expired, next request to serve */ 280 struct request *next_rq; 281 /* number of sync and async requests queued */ 282 int queued[2]; 283 /* number of pending metadata requests */ 284 int meta_pending; 285 /* fifo list of requests in sort_list */ 286 struct list_head fifo; 287 288 /* entity representing this queue in the scheduler */ 289 struct bfq_entity entity; 290 291 /* pointer to the weight counter associated with this entity */ 292 struct bfq_weight_counter *weight_counter; 293 294 /* maximum budget allowed from the feedback mechanism */ 295 int max_budget; 296 /* budget expiration (in jiffies) */ 297 unsigned long budget_timeout; 298 299 /* number of requests on the dispatch list or inside driver */ 300 int dispatched; 301 302 /* status flags */ 303 unsigned long flags; 304 305 /* node for active/idle bfqq list inside parent bfqd */ 306 struct list_head bfqq_list; 307 308 /* associated @bfq_ttime struct */ 309 struct bfq_ttime ttime; 310 311 /* when bfqq started to do I/O within the last observation window */ 312 u64 io_start_time; 313 /* how long bfqq has remained empty during the last observ. window */ 314 u64 tot_idle_time; 315 316 /* bit vector: a 1 for each seeky requests in history */ 317 u32 seek_history; 318 319 /* node for the device's burst list */ 320 struct hlist_node burst_list_node; 321 322 /* position of the last request enqueued */ 323 sector_t last_request_pos; 324 325 /* Number of consecutive pairs of request completion and 326 * arrival, such that the queue becomes idle after the 327 * completion, but the next request arrives within an idle 328 * time slice; used only if the queue's IO_bound flag has been 329 * cleared. 330 */ 331 unsigned int requests_within_timer; 332 333 /* pid of the process owning the queue, used for logging purposes */ 334 pid_t pid; 335 336 /* 337 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL 338 * if the queue is shared. 339 */ 340 struct bfq_io_cq *bic; 341 342 /* current maximum weight-raising time for this queue */ 343 unsigned long wr_cur_max_time; 344 /* 345 * Minimum time instant such that, only if a new request is 346 * enqueued after this time instant in an idle @bfq_queue with 347 * no outstanding requests, then the task associated with the 348 * queue it is deemed as soft real-time (see the comments on 349 * the function bfq_bfqq_softrt_next_start()) 350 */ 351 unsigned long soft_rt_next_start; 352 /* 353 * Start time of the current weight-raising period if 354 * the @bfq-queue is being weight-raised, otherwise 355 * finish time of the last weight-raising period. 356 */ 357 unsigned long last_wr_start_finish; 358 /* factor by which the weight of this queue is multiplied */ 359 unsigned int wr_coeff; 360 /* 361 * Time of the last transition of the @bfq_queue from idle to 362 * backlogged. 363 */ 364 unsigned long last_idle_bklogged; 365 /* 366 * Cumulative service received from the @bfq_queue since the 367 * last transition from idle to backlogged. 368 */ 369 unsigned long service_from_backlogged; 370 /* 371 * Cumulative service received from the @bfq_queue since its 372 * last transition to weight-raised state. 373 */ 374 unsigned long service_from_wr; 375 376 /* 377 * Value of wr start time when switching to soft rt 378 */ 379 unsigned long wr_start_at_switch_to_srt; 380 381 unsigned long split_time; /* time of last split */ 382 383 unsigned long first_IO_time; /* time of first I/O for this queue */ 384 unsigned long creation_time; /* when this queue is created */ 385 386 /* 387 * Pointer to the waker queue for this queue, i.e., to the 388 * queue Q such that this queue happens to get new I/O right 389 * after some I/O request of Q is completed. For details, see 390 * the comments on the choice of the queue for injection in 391 * bfq_select_queue(). 392 */ 393 struct bfq_queue *waker_bfqq; 394 /* pointer to the curr. tentative waker queue, see bfq_check_waker() */ 395 struct bfq_queue *tentative_waker_bfqq; 396 /* number of times the same tentative waker has been detected */ 397 unsigned int num_waker_detections; 398 /* time when we started considering this waker */ 399 u64 waker_detection_started; 400 401 /* node for woken_list, see below */ 402 struct hlist_node woken_list_node; 403 /* 404 * Head of the list of the woken queues for this queue, i.e., 405 * of the list of the queues for which this queue is a waker 406 * queue. This list is used to reset the waker_bfqq pointer in 407 * the woken queues when this queue exits. 408 */ 409 struct hlist_head woken_list; 410 411 /* index of the actuator this queue is associated with */ 412 unsigned int actuator_idx; 413 }; 414 415 /** 416 * struct bfq_data - bfqq data unique and persistent for associated bfq_io_cq 417 */ 418 struct bfq_iocq_bfqq_data { 419 /* 420 * Snapshot of the has_short_time flag before merging; taken 421 * to remember its values while the queue is merged, so as to 422 * be able to restore it in case of split. 423 */ 424 bool saved_has_short_ttime; 425 /* 426 * Same purpose as the previous two fields for the I/O bound 427 * classification of a queue. 428 */ 429 bool saved_IO_bound; 430 431 u64 saved_io_start_time; 432 u64 saved_tot_idle_time; 433 434 /* 435 * Same purpose as the previous fields for the values of the 436 * field keeping the queue's belonging to a large burst 437 */ 438 bool saved_in_large_burst; 439 /* 440 * True if the queue belonged to a burst list before its merge 441 * with another cooperating queue. 442 */ 443 bool was_in_burst_list; 444 445 /* 446 * Save the weight when a merge occurs, to be able 447 * to restore it in case of split. If the weight is not 448 * correctly resumed when the queue is recycled, 449 * then the weight of the recycled queue could differ 450 * from the weight of the original queue. 451 */ 452 unsigned int saved_weight; 453 454 /* 455 * Similar to previous fields: save wr information. 456 */ 457 unsigned long saved_wr_coeff; 458 unsigned long saved_last_wr_start_finish; 459 unsigned long saved_service_from_wr; 460 unsigned long saved_wr_start_at_switch_to_srt; 461 unsigned int saved_wr_cur_max_time; 462 struct bfq_ttime saved_ttime; 463 464 /* Save also injection state */ 465 u64 saved_last_serv_time_ns; 466 unsigned int saved_inject_limit; 467 unsigned long saved_decrease_time_jif; 468 469 /* candidate queue for a stable merge (due to close creation time) */ 470 struct bfq_queue *stable_merge_bfqq; 471 472 bool stably_merged; /* non splittable if true */ 473 }; 474 475 /** 476 * struct bfq_io_cq - per (request_queue, io_context) structure. 477 */ 478 struct bfq_io_cq { 479 /* associated io_cq structure */ 480 struct io_cq icq; /* must be the first member */ 481 /* 482 * Matrix of associated process queues: first row for async 483 * queues, second row sync queues. Each row contains one 484 * column for each actuator. An I/O request generated by the 485 * process is inserted into the queue pointed by bfqq[i][j] if 486 * the request is to be served by the j-th actuator of the 487 * drive, where i==0 or i==1, depending on whether the request 488 * is async or sync. So there is a distinct queue for each 489 * actuator. 490 */ 491 struct bfq_queue *bfqq[2][BFQ_MAX_ACTUATORS]; 492 /* per (request_queue, blkcg) ioprio */ 493 int ioprio; 494 #ifdef CONFIG_BFQ_GROUP_IOSCHED 495 uint64_t blkcg_serial_nr; /* the current blkcg serial */ 496 #endif 497 498 /* 499 * Persistent data for associated synchronous process queues 500 * (one queue per actuator, see field bfqq above). In 501 * particular, each of these queues may undergo a merge. 502 */ 503 struct bfq_iocq_bfqq_data bfqq_data[BFQ_MAX_ACTUATORS]; 504 505 unsigned int requests; /* Number of requests this process has in flight */ 506 }; 507 508 /** 509 * struct bfq_data - per-device data structure. 510 * 511 * All the fields are protected by @lock. 512 */ 513 struct bfq_data { 514 /* device request queue */ 515 struct request_queue *queue; 516 /* dispatch queue */ 517 struct list_head dispatch; 518 519 /* root bfq_group for the device */ 520 struct bfq_group *root_group; 521 522 /* 523 * rbtree of weight counters of @bfq_queues, sorted by 524 * weight. Used to keep track of whether all @bfq_queues have 525 * the same weight. The tree contains one counter for each 526 * distinct weight associated to some active and not 527 * weight-raised @bfq_queue (see the comments to the functions 528 * bfq_weights_tree_[add|remove] for further details). 529 */ 530 struct rb_root_cached queue_weights_tree; 531 532 #ifdef CONFIG_BFQ_GROUP_IOSCHED 533 /* 534 * Number of groups with at least one process that 535 * has at least one request waiting for completion. Note that 536 * this accounts for also requests already dispatched, but not 537 * yet completed. Therefore this number of groups may differ 538 * (be larger) than the number of active groups, as a group is 539 * considered active only if its corresponding entity has 540 * queues with at least one request queued. This 541 * number is used to decide whether a scenario is symmetric. 542 * For a detailed explanation see comments on the computation 543 * of the variable asymmetric_scenario in the function 544 * bfq_better_to_idle(). 545 * 546 * However, it is hard to compute this number exactly, for 547 * groups with multiple processes. Consider a group 548 * that is inactive, i.e., that has no process with 549 * pending I/O inside BFQ queues. Then suppose that 550 * num_groups_with_pending_reqs is still accounting for this 551 * group, because the group has processes with some 552 * I/O request still in flight. num_groups_with_pending_reqs 553 * should be decremented when the in-flight request of the 554 * last process is finally completed (assuming that 555 * nothing else has changed for the group in the meantime, in 556 * terms of composition of the group and active/inactive state of child 557 * groups and processes). To accomplish this, an additional 558 * pending-request counter must be added to entities, and must 559 * be updated correctly. To avoid this additional field and operations, 560 * we resort to the following tradeoff between simplicity and 561 * accuracy: for an inactive group that is still counted in 562 * num_groups_with_pending_reqs, we decrement 563 * num_groups_with_pending_reqs when the first 564 * process of the group remains with no request waiting for 565 * completion. 566 * 567 * Even this simpler decrement strategy requires a little 568 * carefulness: to avoid multiple decrements, we flag a group, 569 * more precisely an entity representing a group, as still 570 * counted in num_groups_with_pending_reqs when it becomes 571 * inactive. Then, when the first queue of the 572 * entity remains with no request waiting for completion, 573 * num_groups_with_pending_reqs is decremented, and this flag 574 * is reset. After this flag is reset for the entity, 575 * num_groups_with_pending_reqs won't be decremented any 576 * longer in case a new queue of the entity remains 577 * with no request waiting for completion. 578 */ 579 unsigned int num_groups_with_pending_reqs; 580 #endif 581 582 /* 583 * Per-class (RT, BE, IDLE) number of bfq_queues containing 584 * requests (including the queue in service, even if it is 585 * idling). 586 */ 587 unsigned int busy_queues[3]; 588 /* number of weight-raised busy @bfq_queues */ 589 int wr_busy_queues; 590 /* number of queued requests */ 591 int queued; 592 /* number of requests dispatched and waiting for completion */ 593 int tot_rq_in_driver; 594 /* 595 * number of requests dispatched and waiting for completion 596 * for each actuator 597 */ 598 int rq_in_driver[BFQ_MAX_ACTUATORS]; 599 600 /* true if the device is non rotational and performs queueing */ 601 bool nonrot_with_queueing; 602 603 /* 604 * Maximum number of requests in driver in the last 605 * @hw_tag_samples completed requests. 606 */ 607 int max_rq_in_driver; 608 /* number of samples used to calculate hw_tag */ 609 int hw_tag_samples; 610 /* flag set to one if the driver is showing a queueing behavior */ 611 int hw_tag; 612 613 /* number of budgets assigned */ 614 int budgets_assigned; 615 616 /* 617 * Timer set when idling (waiting) for the next request from 618 * the queue in service. 619 */ 620 struct hrtimer idle_slice_timer; 621 622 /* bfq_queue in service */ 623 struct bfq_queue *in_service_queue; 624 625 /* on-disk position of the last served request */ 626 sector_t last_position; 627 628 /* position of the last served request for the in-service queue */ 629 sector_t in_serv_last_pos; 630 631 /* time of last request completion (ns) */ 632 u64 last_completion; 633 634 /* bfqq owning the last completed rq */ 635 struct bfq_queue *last_completed_rq_bfqq; 636 637 /* last bfqq created, among those in the root group */ 638 struct bfq_queue *last_bfqq_created; 639 640 /* time of last transition from empty to non-empty (ns) */ 641 u64 last_empty_occupied_ns; 642 643 /* 644 * Flag set to activate the sampling of the total service time 645 * of a just-arrived first I/O request (see 646 * bfq_update_inject_limit()). This will cause the setting of 647 * waited_rq when the request is finally dispatched. 648 */ 649 bool wait_dispatch; 650 /* 651 * If set, then bfq_update_inject_limit() is invoked when 652 * waited_rq is eventually completed. 653 */ 654 struct request *waited_rq; 655 /* 656 * True if some request has been injected during the last service hole. 657 */ 658 bool rqs_injected; 659 660 /* time of first rq dispatch in current observation interval (ns) */ 661 u64 first_dispatch; 662 /* time of last rq dispatch in current observation interval (ns) */ 663 u64 last_dispatch; 664 665 /* beginning of the last budget */ 666 ktime_t last_budget_start; 667 /* beginning of the last idle slice */ 668 ktime_t last_idling_start; 669 unsigned long last_idling_start_jiffies; 670 671 /* number of samples in current observation interval */ 672 int peak_rate_samples; 673 /* num of samples of seq dispatches in current observation interval */ 674 u32 sequential_samples; 675 /* total num of sectors transferred in current observation interval */ 676 u64 tot_sectors_dispatched; 677 /* max rq size seen during current observation interval (sectors) */ 678 u32 last_rq_max_size; 679 /* time elapsed from first dispatch in current observ. interval (us) */ 680 u64 delta_from_first; 681 /* 682 * Current estimate of the device peak rate, measured in 683 * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by 684 * BFQ_RATE_SHIFT is performed to increase precision in 685 * fixed-point calculations. 686 */ 687 u32 peak_rate; 688 689 /* maximum budget allotted to a bfq_queue before rescheduling */ 690 int bfq_max_budget; 691 692 /* 693 * List of all the bfq_queues active for a specific actuator 694 * on the device. Keeping active queues separate on a 695 * per-actuator basis helps implementing per-actuator 696 * injection more efficiently. 697 */ 698 struct list_head active_list[BFQ_MAX_ACTUATORS]; 699 /* list of all the bfq_queues idle on the device */ 700 struct list_head idle_list; 701 702 /* 703 * Timeout for async/sync requests; when it fires, requests 704 * are served in fifo order. 705 */ 706 u64 bfq_fifo_expire[2]; 707 /* weight of backward seeks wrt forward ones */ 708 unsigned int bfq_back_penalty; 709 /* maximum allowed backward seek */ 710 unsigned int bfq_back_max; 711 /* maximum idling time */ 712 u32 bfq_slice_idle; 713 714 /* user-configured max budget value (0 for auto-tuning) */ 715 int bfq_user_max_budget; 716 /* 717 * Timeout for bfq_queues to consume their budget; used to 718 * prevent seeky queues from imposing long latencies to 719 * sequential or quasi-sequential ones (this also implies that 720 * seeky queues cannot receive guarantees in the service 721 * domain; after a timeout they are charged for the time they 722 * have been in service, to preserve fairness among them, but 723 * without service-domain guarantees). 724 */ 725 unsigned int bfq_timeout; 726 727 /* 728 * Force device idling whenever needed to provide accurate 729 * service guarantees, without caring about throughput 730 * issues. CAVEAT: this may even increase latencies, in case 731 * of useless idling for processes that did stop doing I/O. 732 */ 733 bool strict_guarantees; 734 735 /* 736 * Last time at which a queue entered the current burst of 737 * queues being activated shortly after each other; for more 738 * details about this and the following parameters related to 739 * a burst of activations, see the comments on the function 740 * bfq_handle_burst. 741 */ 742 unsigned long last_ins_in_burst; 743 /* 744 * Reference time interval used to decide whether a queue has 745 * been activated shortly after @last_ins_in_burst. 746 */ 747 unsigned long bfq_burst_interval; 748 /* number of queues in the current burst of queue activations */ 749 int burst_size; 750 751 /* common parent entity for the queues in the burst */ 752 struct bfq_entity *burst_parent_entity; 753 /* Maximum burst size above which the current queue-activation 754 * burst is deemed as 'large'. 755 */ 756 unsigned long bfq_large_burst_thresh; 757 /* true if a large queue-activation burst is in progress */ 758 bool large_burst; 759 /* 760 * Head of the burst list (as for the above fields, more 761 * details in the comments on the function bfq_handle_burst). 762 */ 763 struct hlist_head burst_list; 764 765 /* if set to true, low-latency heuristics are enabled */ 766 bool low_latency; 767 /* 768 * Maximum factor by which the weight of a weight-raised queue 769 * is multiplied. 770 */ 771 unsigned int bfq_wr_coeff; 772 773 /* Maximum weight-raising duration for soft real-time processes */ 774 unsigned int bfq_wr_rt_max_time; 775 /* 776 * Minimum idle period after which weight-raising may be 777 * reactivated for a queue (in jiffies). 778 */ 779 unsigned int bfq_wr_min_idle_time; 780 /* 781 * Minimum period between request arrivals after which 782 * weight-raising may be reactivated for an already busy async 783 * queue (in jiffies). 784 */ 785 unsigned long bfq_wr_min_inter_arr_async; 786 787 /* Max service-rate for a soft real-time queue, in sectors/sec */ 788 unsigned int bfq_wr_max_softrt_rate; 789 /* 790 * Cached value of the product ref_rate*ref_wr_duration, used 791 * for computing the maximum duration of weight raising 792 * automatically. 793 */ 794 u64 rate_dur_prod; 795 796 /* fallback dummy bfqq for extreme OOM conditions */ 797 struct bfq_queue oom_bfqq; 798 799 spinlock_t lock; 800 801 /* 802 * bic associated with the task issuing current bio for 803 * merging. This and the next field are used as a support to 804 * be able to perform the bic lookup, needed by bio-merge 805 * functions, before the scheduler lock is taken, and thus 806 * avoid taking the request-queue lock while the scheduler 807 * lock is being held. 808 */ 809 struct bfq_io_cq *bio_bic; 810 /* bfqq associated with the task issuing current bio for merging */ 811 struct bfq_queue *bio_bfqq; 812 813 /* 814 * Depth limits used in bfq_limit_depth (see comments on the 815 * function) 816 */ 817 unsigned int word_depths[2][2]; 818 unsigned int full_depth_shift; 819 820 /* 821 * Number of independent actuators. This is equal to 1 in 822 * case of single-actuator drives. 823 */ 824 unsigned int num_actuators; 825 /* 826 * Disk independent access ranges for each actuator 827 * in this device. 828 */ 829 sector_t sector[BFQ_MAX_ACTUATORS]; 830 sector_t nr_sectors[BFQ_MAX_ACTUATORS]; 831 struct blk_independent_access_range ia_ranges[BFQ_MAX_ACTUATORS]; 832 833 /* 834 * If the number of I/O requests queued in the device for a 835 * given actuator is below next threshold, then the actuator 836 * is deemed as underutilized. If this condition is found to 837 * hold for some actuator upon a dispatch, but (i) the 838 * in-service queue does not contain I/O for that actuator, 839 * while (ii) some other queue does contain I/O for that 840 * actuator, then the head I/O request of the latter queue is 841 * returned (injected), instead of the head request of the 842 * currently in-service queue. 843 * 844 * We set the threshold, empirically, to the minimum possible 845 * value for which an actuator is fully utilized, or close to 846 * be fully utilized. By doing so, injected I/O 'steals' as 847 * few drive-queue slots as possibile to the in-service 848 * queue. This reduces as much as possible the probability 849 * that the service of I/O from the in-service bfq_queue gets 850 * delayed because of slot exhaustion, i.e., because all the 851 * slots of the drive queue are filled with I/O injected from 852 * other queues (NCQ provides for 32 slots). 853 */ 854 unsigned int actuator_load_threshold; 855 }; 856 857 enum bfqq_state_flags { 858 BFQQF_just_created = 0, /* queue just allocated */ 859 BFQQF_busy, /* has requests or is in service */ 860 BFQQF_wait_request, /* waiting for a request */ 861 BFQQF_non_blocking_wait_rq, /* 862 * waiting for a request 863 * without idling the device 864 */ 865 BFQQF_fifo_expire, /* FIFO checked in this slice */ 866 BFQQF_has_short_ttime, /* queue has a short think time */ 867 BFQQF_sync, /* synchronous queue */ 868 BFQQF_IO_bound, /* 869 * bfqq has timed-out at least once 870 * having consumed at most 2/10 of 871 * its budget 872 */ 873 BFQQF_in_large_burst, /* 874 * bfqq activated in a large burst, 875 * see comments to bfq_handle_burst. 876 */ 877 BFQQF_softrt_update, /* 878 * may need softrt-next-start 879 * update 880 */ 881 BFQQF_coop, /* bfqq is shared */ 882 BFQQF_split_coop, /* shared bfqq will be split */ 883 }; 884 885 #define BFQ_BFQQ_FNS(name) \ 886 void bfq_mark_bfqq_##name(struct bfq_queue *bfqq); \ 887 void bfq_clear_bfqq_##name(struct bfq_queue *bfqq); \ 888 int bfq_bfqq_##name(const struct bfq_queue *bfqq); 889 890 BFQ_BFQQ_FNS(just_created); 891 BFQ_BFQQ_FNS(busy); 892 BFQ_BFQQ_FNS(wait_request); 893 BFQ_BFQQ_FNS(non_blocking_wait_rq); 894 BFQ_BFQQ_FNS(fifo_expire); 895 BFQ_BFQQ_FNS(has_short_ttime); 896 BFQ_BFQQ_FNS(sync); 897 BFQ_BFQQ_FNS(IO_bound); 898 BFQ_BFQQ_FNS(in_large_burst); 899 BFQ_BFQQ_FNS(coop); 900 BFQ_BFQQ_FNS(split_coop); 901 BFQ_BFQQ_FNS(softrt_update); 902 #undef BFQ_BFQQ_FNS 903 904 /* Expiration reasons. */ 905 enum bfqq_expiration { 906 BFQQE_TOO_IDLE = 0, /* 907 * queue has been idling for 908 * too long 909 */ 910 BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */ 911 BFQQE_BUDGET_EXHAUSTED, /* budget consumed */ 912 BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */ 913 BFQQE_PREEMPTED /* preemption in progress */ 914 }; 915 916 struct bfq_stat { 917 struct percpu_counter cpu_cnt; 918 atomic64_t aux_cnt; 919 }; 920 921 struct bfqg_stats { 922 /* basic stats */ 923 struct blkg_rwstat bytes; 924 struct blkg_rwstat ios; 925 #ifdef CONFIG_BFQ_CGROUP_DEBUG 926 /* number of ios merged */ 927 struct blkg_rwstat merged; 928 /* total time spent on device in ns, may not be accurate w/ queueing */ 929 struct blkg_rwstat service_time; 930 /* total time spent waiting in scheduler queue in ns */ 931 struct blkg_rwstat wait_time; 932 /* number of IOs queued up */ 933 struct blkg_rwstat queued; 934 /* total disk time and nr sectors dispatched by this group */ 935 struct bfq_stat time; 936 /* sum of number of ios queued across all samples */ 937 struct bfq_stat avg_queue_size_sum; 938 /* count of samples taken for average */ 939 struct bfq_stat avg_queue_size_samples; 940 /* how many times this group has been removed from service tree */ 941 struct bfq_stat dequeue; 942 /* total time spent waiting for it to be assigned a timeslice. */ 943 struct bfq_stat group_wait_time; 944 /* time spent idling for this blkcg_gq */ 945 struct bfq_stat idle_time; 946 /* total time with empty current active q with other requests queued */ 947 struct bfq_stat empty_time; 948 /* fields after this shouldn't be cleared on stat reset */ 949 u64 start_group_wait_time; 950 u64 start_idle_time; 951 u64 start_empty_time; 952 uint16_t flags; 953 #endif /* CONFIG_BFQ_CGROUP_DEBUG */ 954 }; 955 956 #ifdef CONFIG_BFQ_GROUP_IOSCHED 957 958 /* 959 * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 960 * 961 * @ps: @blkcg_policy_storage that this structure inherits 962 * @weight: weight of the bfq_group 963 */ 964 struct bfq_group_data { 965 /* must be the first member */ 966 struct blkcg_policy_data pd; 967 968 unsigned int weight; 969 }; 970 971 /** 972 * struct bfq_group - per (device, cgroup) data structure. 973 * @entity: schedulable entity to insert into the parent group sched_data. 974 * @sched_data: own sched_data, to contain child entities (they may be 975 * both bfq_queues and bfq_groups). 976 * @bfqd: the bfq_data for the device this group acts upon. 977 * @async_bfqq: array of async queues for all the tasks belonging to 978 * the group, one queue per ioprio value per ioprio_class, 979 * except for the idle class that has only one queue. 980 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). 981 * @my_entity: pointer to @entity, %NULL for the toplevel group; used 982 * to avoid too many special cases during group creation/ 983 * migration. 984 * @stats: stats for this bfqg. 985 * @active_entities: number of active entities belonging to the group; 986 * unused for the root group. Used to know whether there 987 * are groups with more than one active @bfq_entity 988 * (see the comments to the function 989 * bfq_bfqq_may_idle()). 990 * @rq_pos_tree: rbtree sorted by next_request position, used when 991 * determining if two or more queues have interleaving 992 * requests (see bfq_find_close_cooperator()). 993 * 994 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup 995 * there is a set of bfq_groups, each one collecting the lower-level 996 * entities belonging to the group that are acting on the same device. 997 * 998 * Locking works as follows: 999 * o @bfqd is protected by the queue lock, RCU is used to access it 1000 * from the readers. 1001 * o All the other fields are protected by the @bfqd queue lock. 1002 */ 1003 struct bfq_group { 1004 /* must be the first member */ 1005 struct blkg_policy_data pd; 1006 1007 /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ 1008 char blkg_path[128]; 1009 1010 /* reference counter (see comments in bfq_bic_update_cgroup) */ 1011 refcount_t ref; 1012 1013 struct bfq_entity entity; 1014 struct bfq_sched_data sched_data; 1015 1016 struct bfq_data *bfqd; 1017 1018 struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; 1019 struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; 1020 1021 struct bfq_entity *my_entity; 1022 1023 int active_entities; 1024 int num_queues_with_pending_reqs; 1025 1026 struct rb_root rq_pos_tree; 1027 1028 struct bfqg_stats stats; 1029 }; 1030 1031 #else 1032 struct bfq_group { 1033 struct bfq_entity entity; 1034 struct bfq_sched_data sched_data; 1035 1036 struct bfq_queue *async_bfqq[2][IOPRIO_NR_LEVELS][BFQ_MAX_ACTUATORS]; 1037 struct bfq_queue *async_idle_bfqq[BFQ_MAX_ACTUATORS]; 1038 1039 struct rb_root rq_pos_tree; 1040 }; 1041 #endif 1042 1043 /* --------------- main algorithm interface ----------------- */ 1044 1045 #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ 1046 { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) 1047 1048 extern const int bfq_timeout; 1049 1050 struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync, 1051 unsigned int actuator_idx); 1052 void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync, 1053 unsigned int actuator_idx); 1054 struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); 1055 void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); 1056 void bfq_weights_tree_add(struct bfq_queue *bfqq); 1057 void bfq_weights_tree_remove(struct bfq_queue *bfqq); 1058 void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1059 bool compensate, enum bfqq_expiration reason); 1060 void bfq_put_queue(struct bfq_queue *bfqq); 1061 void bfq_put_cooperator(struct bfq_queue *bfqq); 1062 void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); 1063 void bfq_release_process_ref(struct bfq_data *bfqd, struct bfq_queue *bfqq); 1064 void bfq_schedule_dispatch(struct bfq_data *bfqd); 1065 void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); 1066 1067 /* ------------ end of main algorithm interface -------------- */ 1068 1069 /* ---------------- cgroups-support interface ---------------- */ 1070 1071 void bfqg_stats_update_legacy_io(struct request_queue *q, struct request *rq); 1072 void bfqg_stats_update_io_remove(struct bfq_group *bfqg, blk_opf_t opf); 1073 void bfqg_stats_update_io_merged(struct bfq_group *bfqg, blk_opf_t opf); 1074 void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, 1075 u64 io_start_time_ns, blk_opf_t opf); 1076 void bfqg_stats_update_dequeue(struct bfq_group *bfqg); 1077 void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg); 1078 void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1079 struct bfq_group *bfqg); 1080 1081 #ifdef CONFIG_BFQ_CGROUP_DEBUG 1082 void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, 1083 blk_opf_t opf); 1084 void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); 1085 void bfqg_stats_update_idle_time(struct bfq_group *bfqg); 1086 void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); 1087 #endif 1088 1089 void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); 1090 void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); 1091 void bfq_end_wr_async(struct bfq_data *bfqd); 1092 struct bfq_group *bfq_bio_bfqg(struct bfq_data *bfqd, struct bio *bio); 1093 struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); 1094 struct bfq_group *bfqq_group(struct bfq_queue *bfqq); 1095 struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); 1096 void bfqg_and_blkg_put(struct bfq_group *bfqg); 1097 1098 #ifdef CONFIG_BFQ_GROUP_IOSCHED 1099 extern struct cftype bfq_blkcg_legacy_files[]; 1100 extern struct cftype bfq_blkg_files[]; 1101 extern struct blkcg_policy blkcg_policy_bfq; 1102 #endif 1103 1104 /* ------------- end of cgroups-support interface ------------- */ 1105 1106 /* - interface of the internal hierarchical B-WF2Q+ scheduler - */ 1107 1108 #ifdef CONFIG_BFQ_GROUP_IOSCHED 1109 /* both next loops stop at one of the child entities of the root group */ 1110 #define for_each_entity(entity) \ 1111 for (; entity ; entity = entity->parent) 1112 1113 /* 1114 * For each iteration, compute parent in advance, so as to be safe if 1115 * entity is deallocated during the iteration. Such a deallocation may 1116 * happen as a consequence of a bfq_put_queue that frees the bfq_queue 1117 * containing entity. 1118 */ 1119 #define for_each_entity_safe(entity, parent) \ 1120 for (; entity && ({ parent = entity->parent; 1; }); entity = parent) 1121 1122 #else /* CONFIG_BFQ_GROUP_IOSCHED */ 1123 /* 1124 * Next two macros are fake loops when cgroups support is not 1125 * enabled. I fact, in such a case, there is only one level to go up 1126 * (to reach the root group). 1127 */ 1128 #define for_each_entity(entity) \ 1129 for (; entity ; entity = NULL) 1130 1131 #define for_each_entity_safe(entity, parent) \ 1132 for (parent = NULL; entity ; entity = parent) 1133 #endif /* CONFIG_BFQ_GROUP_IOSCHED */ 1134 1135 struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); 1136 unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd); 1137 struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity); 1138 struct bfq_entity *bfq_entity_of(struct rb_node *node); 1139 unsigned short bfq_ioprio_to_weight(int ioprio); 1140 void bfq_put_idle_entity(struct bfq_service_tree *st, 1141 struct bfq_entity *entity); 1142 struct bfq_service_tree * 1143 __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, 1144 struct bfq_entity *entity, 1145 bool update_class_too); 1146 void bfq_bfqq_served(struct bfq_queue *bfqq, int served); 1147 void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1148 unsigned long time_ms); 1149 bool __bfq_deactivate_entity(struct bfq_entity *entity, 1150 bool ins_into_idle_tree); 1151 bool next_queue_may_preempt(struct bfq_data *bfqd); 1152 struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd); 1153 bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd); 1154 void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1155 bool ins_into_idle_tree, bool expiration); 1156 void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); 1157 void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1158 bool expiration); 1159 void bfq_del_bfqq_busy(struct bfq_queue *bfqq, bool expiration); 1160 void bfq_add_bfqq_busy(struct bfq_queue *bfqq); 1161 void bfq_add_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); 1162 void bfq_del_bfqq_in_groups_with_pending_reqs(struct bfq_queue *bfqq); 1163 1164 /* --------------- end of interface of B-WF2Q+ ---------------- */ 1165 1166 /* Logging facilities. */ 1167 static inline void bfq_bfqq_name(struct bfq_queue *bfqq, char *str, int len) 1168 { 1169 char type = bfq_bfqq_sync(bfqq) ? 'S' : 'A'; 1170 1171 if (bfqq->pid != -1) 1172 snprintf(str, len, "bfq%d%c", bfqq->pid, type); 1173 else 1174 snprintf(str, len, "bfqSHARED-%c", type); 1175 } 1176 1177 #ifdef CONFIG_BFQ_GROUP_IOSCHED 1178 struct bfq_group *bfqq_group(struct bfq_queue *bfqq); 1179 1180 #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ 1181 char pid_str[MAX_BFQQ_NAME_LENGTH]; \ 1182 if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ 1183 break; \ 1184 bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ 1185 blk_add_cgroup_trace_msg((bfqd)->queue, \ 1186 &bfqg_to_blkg(bfqq_group(bfqq))->blkcg->css, \ 1187 "%s " fmt, pid_str, ##args); \ 1188 } while (0) 1189 1190 #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ 1191 blk_add_cgroup_trace_msg((bfqd)->queue, \ 1192 &bfqg_to_blkg(bfqg)->blkcg->css, fmt, ##args); \ 1193 } while (0) 1194 1195 #else /* CONFIG_BFQ_GROUP_IOSCHED */ 1196 1197 #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ 1198 char pid_str[MAX_BFQQ_NAME_LENGTH]; \ 1199 if (likely(!blk_trace_note_message_enabled((bfqd)->queue))) \ 1200 break; \ 1201 bfq_bfqq_name((bfqq), pid_str, MAX_BFQQ_NAME_LENGTH); \ 1202 blk_add_trace_msg((bfqd)->queue, "%s " fmt, pid_str, ##args); \ 1203 } while (0) 1204 #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) 1205 1206 #endif /* CONFIG_BFQ_GROUP_IOSCHED */ 1207 1208 #define bfq_log(bfqd, fmt, args...) \ 1209 blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) 1210 1211 #endif /* _BFQ_H */ 1212