1 /* SPDX-License-Identifier: GPL-2.0-or-later */ 2 /* 3 * Header file for the BFQ I/O scheduler: data structures and 4 * prototypes of interface functions among BFQ components. 5 */ 6 #ifndef _BFQ_H 7 #define _BFQ_H 8 9 #include <linux/blktrace_api.h> 10 #include <linux/hrtimer.h> 11 #include <linux/blk-cgroup.h> 12 13 #define BFQ_IOPRIO_CLASSES 3 14 #define BFQ_CL_IDLE_TIMEOUT (HZ/5) 15 16 #define BFQ_MIN_WEIGHT 1 17 #define BFQ_MAX_WEIGHT 1000 18 #define BFQ_WEIGHT_CONVERSION_COEFF 10 19 20 #define BFQ_DEFAULT_QUEUE_IOPRIO 4 21 22 #define BFQ_WEIGHT_LEGACY_DFL 100 23 #define BFQ_DEFAULT_GRP_IOPRIO 0 24 #define BFQ_DEFAULT_GRP_CLASS IOPRIO_CLASS_BE 25 26 #define MAX_PID_STR_LENGTH 12 27 28 /* 29 * Soft real-time applications are extremely more latency sensitive 30 * than interactive ones. Over-raise the weight of the former to 31 * privilege them against the latter. 32 */ 33 #define BFQ_SOFTRT_WEIGHT_FACTOR 100 34 35 struct bfq_entity; 36 37 /** 38 * struct bfq_service_tree - per ioprio_class service tree. 39 * 40 * Each service tree represents a B-WF2Q+ scheduler on its own. Each 41 * ioprio_class has its own independent scheduler, and so its own 42 * bfq_service_tree. All the fields are protected by the queue lock 43 * of the containing bfqd. 44 */ 45 struct bfq_service_tree { 46 /* tree for active entities (i.e., those backlogged) */ 47 struct rb_root active; 48 /* tree for idle entities (i.e., not backlogged, with V < F_i)*/ 49 struct rb_root idle; 50 51 /* idle entity with minimum F_i */ 52 struct bfq_entity *first_idle; 53 /* idle entity with maximum F_i */ 54 struct bfq_entity *last_idle; 55 56 /* scheduler virtual time */ 57 u64 vtime; 58 /* scheduler weight sum; active and idle entities contribute to it */ 59 unsigned long wsum; 60 }; 61 62 /** 63 * struct bfq_sched_data - multi-class scheduler. 64 * 65 * bfq_sched_data is the basic scheduler queue. It supports three 66 * ioprio_classes, and can be used either as a toplevel queue or as an 67 * intermediate queue in a hierarchical setup. 68 * 69 * The supported ioprio_classes are the same as in CFQ, in descending 70 * priority order, IOPRIO_CLASS_RT, IOPRIO_CLASS_BE, IOPRIO_CLASS_IDLE. 71 * Requests from higher priority queues are served before all the 72 * requests from lower priority queues; among requests of the same 73 * queue requests are served according to B-WF2Q+. 74 * 75 * The schedule is implemented by the service trees, plus the field 76 * @next_in_service, which points to the entity on the active trees 77 * that will be served next, if 1) no changes in the schedule occurs 78 * before the current in-service entity is expired, 2) the in-service 79 * queue becomes idle when it expires, and 3) if the entity pointed by 80 * in_service_entity is not a queue, then the in-service child entity 81 * of the entity pointed by in_service_entity becomes idle on 82 * expiration. This peculiar definition allows for the following 83 * optimization, not yet exploited: while a given entity is still in 84 * service, we already know which is the best candidate for next 85 * service among the other active entities in the same parent 86 * entity. We can then quickly compare the timestamps of the 87 * in-service entity with those of such best candidate. 88 * 89 * All fields are protected by the lock of the containing bfqd. 90 */ 91 struct bfq_sched_data { 92 /* entity in service */ 93 struct bfq_entity *in_service_entity; 94 /* head-of-line entity (see comments above) */ 95 struct bfq_entity *next_in_service; 96 /* array of service trees, one per ioprio_class */ 97 struct bfq_service_tree service_tree[BFQ_IOPRIO_CLASSES]; 98 /* last time CLASS_IDLE was served */ 99 unsigned long bfq_class_idle_last_service; 100 101 }; 102 103 /** 104 * struct bfq_weight_counter - counter of the number of all active queues 105 * with a given weight. 106 */ 107 struct bfq_weight_counter { 108 unsigned int weight; /* weight of the queues this counter refers to */ 109 unsigned int num_active; /* nr of active queues with this weight */ 110 /* 111 * Weights tree member (see bfq_data's @queue_weights_tree) 112 */ 113 struct rb_node weights_node; 114 }; 115 116 /** 117 * struct bfq_entity - schedulable entity. 118 * 119 * A bfq_entity is used to represent either a bfq_queue (leaf node in the 120 * cgroup hierarchy) or a bfq_group into the upper level scheduler. Each 121 * entity belongs to the sched_data of the parent group in the cgroup 122 * hierarchy. Non-leaf entities have also their own sched_data, stored 123 * in @my_sched_data. 124 * 125 * Each entity stores independently its priority values; this would 126 * allow different weights on different devices, but this 127 * functionality is not exported to userspace by now. Priorities and 128 * weights are updated lazily, first storing the new values into the 129 * new_* fields, then setting the @prio_changed flag. As soon as 130 * there is a transition in the entity state that allows the priority 131 * update to take place the effective and the requested priority 132 * values are synchronized. 133 * 134 * Unless cgroups are used, the weight value is calculated from the 135 * ioprio to export the same interface as CFQ. When dealing with 136 * "well-behaved" queues (i.e., queues that do not spend too much 137 * time to consume their budget and have true sequential behavior, and 138 * when there are no external factors breaking anticipation) the 139 * relative weights at each level of the cgroups hierarchy should be 140 * guaranteed. All the fields are protected by the queue lock of the 141 * containing bfqd. 142 */ 143 struct bfq_entity { 144 /* service_tree member */ 145 struct rb_node rb_node; 146 147 /* 148 * Flag, true if the entity is on a tree (either the active or 149 * the idle one of its service_tree) or is in service. 150 */ 151 bool on_st; 152 153 /* B-WF2Q+ start and finish timestamps [sectors/weight] */ 154 u64 start, finish; 155 156 /* tree the entity is enqueued into; %NULL if not on a tree */ 157 struct rb_root *tree; 158 159 /* 160 * minimum start time of the (active) subtree rooted at this 161 * entity; used for O(log N) lookups into active trees 162 */ 163 u64 min_start; 164 165 /* amount of service received during the last service slot */ 166 int service; 167 168 /* budget, used also to calculate F_i: F_i = S_i + @budget / @weight */ 169 int budget; 170 171 /* device weight, if non-zero, it overrides the default weight of 172 * bfq_group_data */ 173 int dev_weight; 174 /* weight of the queue */ 175 int weight; 176 /* next weight if a change is in progress */ 177 int new_weight; 178 179 /* original weight, used to implement weight boosting */ 180 int orig_weight; 181 182 /* parent entity, for hierarchical scheduling */ 183 struct bfq_entity *parent; 184 185 /* 186 * For non-leaf nodes in the hierarchy, the associated 187 * scheduler queue, %NULL on leaf nodes. 188 */ 189 struct bfq_sched_data *my_sched_data; 190 /* the scheduler queue this entity belongs to */ 191 struct bfq_sched_data *sched_data; 192 193 /* flag, set to request a weight, ioprio or ioprio_class change */ 194 int prio_changed; 195 196 /* flag, set if the entity is counted in groups_with_pending_reqs */ 197 bool in_groups_with_pending_reqs; 198 }; 199 200 struct bfq_group; 201 202 /** 203 * struct bfq_ttime - per process thinktime stats. 204 */ 205 struct bfq_ttime { 206 /* completion time of the last request */ 207 u64 last_end_request; 208 209 /* total process thinktime */ 210 u64 ttime_total; 211 /* number of thinktime samples */ 212 unsigned long ttime_samples; 213 /* average process thinktime */ 214 u64 ttime_mean; 215 }; 216 217 /** 218 * struct bfq_queue - leaf schedulable entity. 219 * 220 * A bfq_queue is a leaf request queue; it can be associated with an 221 * io_context or more, if it is async or shared between cooperating 222 * processes. @cgroup holds a reference to the cgroup, to be sure that it 223 * does not disappear while a bfqq still references it (mostly to avoid 224 * races between request issuing and task migration followed by cgroup 225 * destruction). 226 * All the fields are protected by the queue lock of the containing bfqd. 227 */ 228 struct bfq_queue { 229 /* reference counter */ 230 int ref; 231 /* parent bfq_data */ 232 struct bfq_data *bfqd; 233 234 /* current ioprio and ioprio class */ 235 unsigned short ioprio, ioprio_class; 236 /* next ioprio and ioprio class if a change is in progress */ 237 unsigned short new_ioprio, new_ioprio_class; 238 239 /* last total-service-time sample, see bfq_update_inject_limit() */ 240 u64 last_serv_time_ns; 241 /* limit for request injection */ 242 unsigned int inject_limit; 243 /* last time the inject limit has been decreased, in jiffies */ 244 unsigned long decrease_time_jif; 245 246 /* 247 * Shared bfq_queue if queue is cooperating with one or more 248 * other queues. 249 */ 250 struct bfq_queue *new_bfqq; 251 /* request-position tree member (see bfq_group's @rq_pos_tree) */ 252 struct rb_node pos_node; 253 /* request-position tree root (see bfq_group's @rq_pos_tree) */ 254 struct rb_root *pos_root; 255 256 /* sorted list of pending requests */ 257 struct rb_root sort_list; 258 /* if fifo isn't expired, next request to serve */ 259 struct request *next_rq; 260 /* number of sync and async requests queued */ 261 int queued[2]; 262 /* number of requests currently allocated */ 263 int allocated; 264 /* number of pending metadata requests */ 265 int meta_pending; 266 /* fifo list of requests in sort_list */ 267 struct list_head fifo; 268 269 /* entity representing this queue in the scheduler */ 270 struct bfq_entity entity; 271 272 /* pointer to the weight counter associated with this entity */ 273 struct bfq_weight_counter *weight_counter; 274 275 /* maximum budget allowed from the feedback mechanism */ 276 int max_budget; 277 /* budget expiration (in jiffies) */ 278 unsigned long budget_timeout; 279 280 /* number of requests on the dispatch list or inside driver */ 281 int dispatched; 282 283 /* status flags */ 284 unsigned long flags; 285 286 /* node for active/idle bfqq list inside parent bfqd */ 287 struct list_head bfqq_list; 288 289 /* associated @bfq_ttime struct */ 290 struct bfq_ttime ttime; 291 292 /* bit vector: a 1 for each seeky requests in history */ 293 u32 seek_history; 294 295 /* node for the device's burst list */ 296 struct hlist_node burst_list_node; 297 298 /* position of the last request enqueued */ 299 sector_t last_request_pos; 300 301 /* Number of consecutive pairs of request completion and 302 * arrival, such that the queue becomes idle after the 303 * completion, but the next request arrives within an idle 304 * time slice; used only if the queue's IO_bound flag has been 305 * cleared. 306 */ 307 unsigned int requests_within_timer; 308 309 /* pid of the process owning the queue, used for logging purposes */ 310 pid_t pid; 311 312 /* 313 * Pointer to the bfq_io_cq owning the bfq_queue, set to %NULL 314 * if the queue is shared. 315 */ 316 struct bfq_io_cq *bic; 317 318 /* current maximum weight-raising time for this queue */ 319 unsigned long wr_cur_max_time; 320 /* 321 * Minimum time instant such that, only if a new request is 322 * enqueued after this time instant in an idle @bfq_queue with 323 * no outstanding requests, then the task associated with the 324 * queue it is deemed as soft real-time (see the comments on 325 * the function bfq_bfqq_softrt_next_start()) 326 */ 327 unsigned long soft_rt_next_start; 328 /* 329 * Start time of the current weight-raising period if 330 * the @bfq-queue is being weight-raised, otherwise 331 * finish time of the last weight-raising period. 332 */ 333 unsigned long last_wr_start_finish; 334 /* factor by which the weight of this queue is multiplied */ 335 unsigned int wr_coeff; 336 /* 337 * Time of the last transition of the @bfq_queue from idle to 338 * backlogged. 339 */ 340 unsigned long last_idle_bklogged; 341 /* 342 * Cumulative service received from the @bfq_queue since the 343 * last transition from idle to backlogged. 344 */ 345 unsigned long service_from_backlogged; 346 /* 347 * Cumulative service received from the @bfq_queue since its 348 * last transition to weight-raised state. 349 */ 350 unsigned long service_from_wr; 351 352 /* 353 * Value of wr start time when switching to soft rt 354 */ 355 unsigned long wr_start_at_switch_to_srt; 356 357 unsigned long split_time; /* time of last split */ 358 359 unsigned long first_IO_time; /* time of first I/O for this queue */ 360 361 /* max service rate measured so far */ 362 u32 max_service_rate; 363 364 /* 365 * Pointer to the waker queue for this queue, i.e., to the 366 * queue Q such that this queue happens to get new I/O right 367 * after some I/O request of Q is completed. For details, see 368 * the comments on the choice of the queue for injection in 369 * bfq_select_queue(). 370 */ 371 struct bfq_queue *waker_bfqq; 372 /* node for woken_list, see below */ 373 struct hlist_node woken_list_node; 374 /* 375 * Head of the list of the woken queues for this queue, i.e., 376 * of the list of the queues for which this queue is a waker 377 * queue. This list is used to reset the waker_bfqq pointer in 378 * the woken queues when this queue exits. 379 */ 380 struct hlist_head woken_list; 381 }; 382 383 /** 384 * struct bfq_io_cq - per (request_queue, io_context) structure. 385 */ 386 struct bfq_io_cq { 387 /* associated io_cq structure */ 388 struct io_cq icq; /* must be the first member */ 389 /* array of two process queues, the sync and the async */ 390 struct bfq_queue *bfqq[2]; 391 /* per (request_queue, blkcg) ioprio */ 392 int ioprio; 393 #ifdef CONFIG_BFQ_GROUP_IOSCHED 394 uint64_t blkcg_serial_nr; /* the current blkcg serial */ 395 #endif 396 /* 397 * Snapshot of the has_short_time flag before merging; taken 398 * to remember its value while the queue is merged, so as to 399 * be able to restore it in case of split. 400 */ 401 bool saved_has_short_ttime; 402 /* 403 * Same purpose as the previous two fields for the I/O bound 404 * classification of a queue. 405 */ 406 bool saved_IO_bound; 407 408 /* 409 * Same purpose as the previous fields for the value of the 410 * field keeping the queue's belonging to a large burst 411 */ 412 bool saved_in_large_burst; 413 /* 414 * True if the queue belonged to a burst list before its merge 415 * with another cooperating queue. 416 */ 417 bool was_in_burst_list; 418 419 /* 420 * Save the weight when a merge occurs, to be able 421 * to restore it in case of split. If the weight is not 422 * correctly resumed when the queue is recycled, 423 * then the weight of the recycled queue could differ 424 * from the weight of the original queue. 425 */ 426 unsigned int saved_weight; 427 428 /* 429 * Similar to previous fields: save wr information. 430 */ 431 unsigned long saved_wr_coeff; 432 unsigned long saved_last_wr_start_finish; 433 unsigned long saved_wr_start_at_switch_to_srt; 434 unsigned int saved_wr_cur_max_time; 435 struct bfq_ttime saved_ttime; 436 }; 437 438 /** 439 * struct bfq_data - per-device data structure. 440 * 441 * All the fields are protected by @lock. 442 */ 443 struct bfq_data { 444 /* device request queue */ 445 struct request_queue *queue; 446 /* dispatch queue */ 447 struct list_head dispatch; 448 449 /* root bfq_group for the device */ 450 struct bfq_group *root_group; 451 452 /* 453 * rbtree of weight counters of @bfq_queues, sorted by 454 * weight. Used to keep track of whether all @bfq_queues have 455 * the same weight. The tree contains one counter for each 456 * distinct weight associated to some active and not 457 * weight-raised @bfq_queue (see the comments to the functions 458 * bfq_weights_tree_[add|remove] for further details). 459 */ 460 struct rb_root_cached queue_weights_tree; 461 462 /* 463 * Number of groups with at least one descendant process that 464 * has at least one request waiting for completion. Note that 465 * this accounts for also requests already dispatched, but not 466 * yet completed. Therefore this number of groups may differ 467 * (be larger) than the number of active groups, as a group is 468 * considered active only if its corresponding entity has 469 * descendant queues with at least one request queued. This 470 * number is used to decide whether a scenario is symmetric. 471 * For a detailed explanation see comments on the computation 472 * of the variable asymmetric_scenario in the function 473 * bfq_better_to_idle(). 474 * 475 * However, it is hard to compute this number exactly, for 476 * groups with multiple descendant processes. Consider a group 477 * that is inactive, i.e., that has no descendant process with 478 * pending I/O inside BFQ queues. Then suppose that 479 * num_groups_with_pending_reqs is still accounting for this 480 * group, because the group has descendant processes with some 481 * I/O request still in flight. num_groups_with_pending_reqs 482 * should be decremented when the in-flight request of the 483 * last descendant process is finally completed (assuming that 484 * nothing else has changed for the group in the meantime, in 485 * terms of composition of the group and active/inactive state of child 486 * groups and processes). To accomplish this, an additional 487 * pending-request counter must be added to entities, and must 488 * be updated correctly. To avoid this additional field and operations, 489 * we resort to the following tradeoff between simplicity and 490 * accuracy: for an inactive group that is still counted in 491 * num_groups_with_pending_reqs, we decrement 492 * num_groups_with_pending_reqs when the first descendant 493 * process of the group remains with no request waiting for 494 * completion. 495 * 496 * Even this simpler decrement strategy requires a little 497 * carefulness: to avoid multiple decrements, we flag a group, 498 * more precisely an entity representing a group, as still 499 * counted in num_groups_with_pending_reqs when it becomes 500 * inactive. Then, when the first descendant queue of the 501 * entity remains with no request waiting for completion, 502 * num_groups_with_pending_reqs is decremented, and this flag 503 * is reset. After this flag is reset for the entity, 504 * num_groups_with_pending_reqs won't be decremented any 505 * longer in case a new descendant queue of the entity remains 506 * with no request waiting for completion. 507 */ 508 unsigned int num_groups_with_pending_reqs; 509 510 /* 511 * Per-class (RT, BE, IDLE) number of bfq_queues containing 512 * requests (including the queue in service, even if it is 513 * idling). 514 */ 515 unsigned int busy_queues[3]; 516 /* number of weight-raised busy @bfq_queues */ 517 int wr_busy_queues; 518 /* number of queued requests */ 519 int queued; 520 /* number of requests dispatched and waiting for completion */ 521 int rq_in_driver; 522 523 /* true if the device is non rotational and performs queueing */ 524 bool nonrot_with_queueing; 525 526 /* 527 * Maximum number of requests in driver in the last 528 * @hw_tag_samples completed requests. 529 */ 530 int max_rq_in_driver; 531 /* number of samples used to calculate hw_tag */ 532 int hw_tag_samples; 533 /* flag set to one if the driver is showing a queueing behavior */ 534 int hw_tag; 535 536 /* number of budgets assigned */ 537 int budgets_assigned; 538 539 /* 540 * Timer set when idling (waiting) for the next request from 541 * the queue in service. 542 */ 543 struct hrtimer idle_slice_timer; 544 545 /* bfq_queue in service */ 546 struct bfq_queue *in_service_queue; 547 548 /* on-disk position of the last served request */ 549 sector_t last_position; 550 551 /* position of the last served request for the in-service queue */ 552 sector_t in_serv_last_pos; 553 554 /* time of last request completion (ns) */ 555 u64 last_completion; 556 557 /* bfqq owning the last completed rq */ 558 struct bfq_queue *last_completed_rq_bfqq; 559 560 /* time of last transition from empty to non-empty (ns) */ 561 u64 last_empty_occupied_ns; 562 563 /* 564 * Flag set to activate the sampling of the total service time 565 * of a just-arrived first I/O request (see 566 * bfq_update_inject_limit()). This will cause the setting of 567 * waited_rq when the request is finally dispatched. 568 */ 569 bool wait_dispatch; 570 /* 571 * If set, then bfq_update_inject_limit() is invoked when 572 * waited_rq is eventually completed. 573 */ 574 struct request *waited_rq; 575 /* 576 * True if some request has been injected during the last service hole. 577 */ 578 bool rqs_injected; 579 580 /* time of first rq dispatch in current observation interval (ns) */ 581 u64 first_dispatch; 582 /* time of last rq dispatch in current observation interval (ns) */ 583 u64 last_dispatch; 584 585 /* beginning of the last budget */ 586 ktime_t last_budget_start; 587 /* beginning of the last idle slice */ 588 ktime_t last_idling_start; 589 unsigned long last_idling_start_jiffies; 590 591 /* number of samples in current observation interval */ 592 int peak_rate_samples; 593 /* num of samples of seq dispatches in current observation interval */ 594 u32 sequential_samples; 595 /* total num of sectors transferred in current observation interval */ 596 u64 tot_sectors_dispatched; 597 /* max rq size seen during current observation interval (sectors) */ 598 u32 last_rq_max_size; 599 /* time elapsed from first dispatch in current observ. interval (us) */ 600 u64 delta_from_first; 601 /* 602 * Current estimate of the device peak rate, measured in 603 * [(sectors/usec) / 2^BFQ_RATE_SHIFT]. The left-shift by 604 * BFQ_RATE_SHIFT is performed to increase precision in 605 * fixed-point calculations. 606 */ 607 u32 peak_rate; 608 609 /* maximum budget allotted to a bfq_queue before rescheduling */ 610 int bfq_max_budget; 611 612 /* list of all the bfq_queues active on the device */ 613 struct list_head active_list; 614 /* list of all the bfq_queues idle on the device */ 615 struct list_head idle_list; 616 617 /* 618 * Timeout for async/sync requests; when it fires, requests 619 * are served in fifo order. 620 */ 621 u64 bfq_fifo_expire[2]; 622 /* weight of backward seeks wrt forward ones */ 623 unsigned int bfq_back_penalty; 624 /* maximum allowed backward seek */ 625 unsigned int bfq_back_max; 626 /* maximum idling time */ 627 u32 bfq_slice_idle; 628 629 /* user-configured max budget value (0 for auto-tuning) */ 630 int bfq_user_max_budget; 631 /* 632 * Timeout for bfq_queues to consume their budget; used to 633 * prevent seeky queues from imposing long latencies to 634 * sequential or quasi-sequential ones (this also implies that 635 * seeky queues cannot receive guarantees in the service 636 * domain; after a timeout they are charged for the time they 637 * have been in service, to preserve fairness among them, but 638 * without service-domain guarantees). 639 */ 640 unsigned int bfq_timeout; 641 642 /* 643 * Number of consecutive requests that must be issued within 644 * the idle time slice to set again idling to a queue which 645 * was marked as non-I/O-bound (see the definition of the 646 * IO_bound flag for further details). 647 */ 648 unsigned int bfq_requests_within_timer; 649 650 /* 651 * Force device idling whenever needed to provide accurate 652 * service guarantees, without caring about throughput 653 * issues. CAVEAT: this may even increase latencies, in case 654 * of useless idling for processes that did stop doing I/O. 655 */ 656 bool strict_guarantees; 657 658 /* 659 * Last time at which a queue entered the current burst of 660 * queues being activated shortly after each other; for more 661 * details about this and the following parameters related to 662 * a burst of activations, see the comments on the function 663 * bfq_handle_burst. 664 */ 665 unsigned long last_ins_in_burst; 666 /* 667 * Reference time interval used to decide whether a queue has 668 * been activated shortly after @last_ins_in_burst. 669 */ 670 unsigned long bfq_burst_interval; 671 /* number of queues in the current burst of queue activations */ 672 int burst_size; 673 674 /* common parent entity for the queues in the burst */ 675 struct bfq_entity *burst_parent_entity; 676 /* Maximum burst size above which the current queue-activation 677 * burst is deemed as 'large'. 678 */ 679 unsigned long bfq_large_burst_thresh; 680 /* true if a large queue-activation burst is in progress */ 681 bool large_burst; 682 /* 683 * Head of the burst list (as for the above fields, more 684 * details in the comments on the function bfq_handle_burst). 685 */ 686 struct hlist_head burst_list; 687 688 /* if set to true, low-latency heuristics are enabled */ 689 bool low_latency; 690 /* 691 * Maximum factor by which the weight of a weight-raised queue 692 * is multiplied. 693 */ 694 unsigned int bfq_wr_coeff; 695 /* maximum duration of a weight-raising period (jiffies) */ 696 unsigned int bfq_wr_max_time; 697 698 /* Maximum weight-raising duration for soft real-time processes */ 699 unsigned int bfq_wr_rt_max_time; 700 /* 701 * Minimum idle period after which weight-raising may be 702 * reactivated for a queue (in jiffies). 703 */ 704 unsigned int bfq_wr_min_idle_time; 705 /* 706 * Minimum period between request arrivals after which 707 * weight-raising may be reactivated for an already busy async 708 * queue (in jiffies). 709 */ 710 unsigned long bfq_wr_min_inter_arr_async; 711 712 /* Max service-rate for a soft real-time queue, in sectors/sec */ 713 unsigned int bfq_wr_max_softrt_rate; 714 /* 715 * Cached value of the product ref_rate*ref_wr_duration, used 716 * for computing the maximum duration of weight raising 717 * automatically. 718 */ 719 u64 rate_dur_prod; 720 721 /* fallback dummy bfqq for extreme OOM conditions */ 722 struct bfq_queue oom_bfqq; 723 724 spinlock_t lock; 725 726 /* 727 * bic associated with the task issuing current bio for 728 * merging. This and the next field are used as a support to 729 * be able to perform the bic lookup, needed by bio-merge 730 * functions, before the scheduler lock is taken, and thus 731 * avoid taking the request-queue lock while the scheduler 732 * lock is being held. 733 */ 734 struct bfq_io_cq *bio_bic; 735 /* bfqq associated with the task issuing current bio for merging */ 736 struct bfq_queue *bio_bfqq; 737 738 /* 739 * Depth limits used in bfq_limit_depth (see comments on the 740 * function) 741 */ 742 unsigned int word_depths[2][2]; 743 }; 744 745 enum bfqq_state_flags { 746 BFQQF_just_created = 0, /* queue just allocated */ 747 BFQQF_busy, /* has requests or is in service */ 748 BFQQF_wait_request, /* waiting for a request */ 749 BFQQF_non_blocking_wait_rq, /* 750 * waiting for a request 751 * without idling the device 752 */ 753 BFQQF_fifo_expire, /* FIFO checked in this slice */ 754 BFQQF_has_short_ttime, /* queue has a short think time */ 755 BFQQF_sync, /* synchronous queue */ 756 BFQQF_IO_bound, /* 757 * bfqq has timed-out at least once 758 * having consumed at most 2/10 of 759 * its budget 760 */ 761 BFQQF_in_large_burst, /* 762 * bfqq activated in a large burst, 763 * see comments to bfq_handle_burst. 764 */ 765 BFQQF_softrt_update, /* 766 * may need softrt-next-start 767 * update 768 */ 769 BFQQF_coop, /* bfqq is shared */ 770 BFQQF_split_coop, /* shared bfqq will be split */ 771 BFQQF_has_waker /* bfqq has a waker queue */ 772 }; 773 774 #define BFQ_BFQQ_FNS(name) \ 775 void bfq_mark_bfqq_##name(struct bfq_queue *bfqq); \ 776 void bfq_clear_bfqq_##name(struct bfq_queue *bfqq); \ 777 int bfq_bfqq_##name(const struct bfq_queue *bfqq); 778 779 BFQ_BFQQ_FNS(just_created); 780 BFQ_BFQQ_FNS(busy); 781 BFQ_BFQQ_FNS(wait_request); 782 BFQ_BFQQ_FNS(non_blocking_wait_rq); 783 BFQ_BFQQ_FNS(fifo_expire); 784 BFQ_BFQQ_FNS(has_short_ttime); 785 BFQ_BFQQ_FNS(sync); 786 BFQ_BFQQ_FNS(IO_bound); 787 BFQ_BFQQ_FNS(in_large_burst); 788 BFQ_BFQQ_FNS(coop); 789 BFQ_BFQQ_FNS(split_coop); 790 BFQ_BFQQ_FNS(softrt_update); 791 BFQ_BFQQ_FNS(has_waker); 792 #undef BFQ_BFQQ_FNS 793 794 /* Expiration reasons. */ 795 enum bfqq_expiration { 796 BFQQE_TOO_IDLE = 0, /* 797 * queue has been idling for 798 * too long 799 */ 800 BFQQE_BUDGET_TIMEOUT, /* budget took too long to be used */ 801 BFQQE_BUDGET_EXHAUSTED, /* budget consumed */ 802 BFQQE_NO_MORE_REQUESTS, /* the queue has no more requests */ 803 BFQQE_PREEMPTED /* preemption in progress */ 804 }; 805 806 struct bfq_stat { 807 struct percpu_counter cpu_cnt; 808 atomic64_t aux_cnt; 809 }; 810 811 struct bfqg_stats { 812 #ifdef CONFIG_BFQ_CGROUP_DEBUG 813 /* number of ios merged */ 814 struct blkg_rwstat merged; 815 /* total time spent on device in ns, may not be accurate w/ queueing */ 816 struct blkg_rwstat service_time; 817 /* total time spent waiting in scheduler queue in ns */ 818 struct blkg_rwstat wait_time; 819 /* number of IOs queued up */ 820 struct blkg_rwstat queued; 821 /* total disk time and nr sectors dispatched by this group */ 822 struct bfq_stat time; 823 /* sum of number of ios queued across all samples */ 824 struct bfq_stat avg_queue_size_sum; 825 /* count of samples taken for average */ 826 struct bfq_stat avg_queue_size_samples; 827 /* how many times this group has been removed from service tree */ 828 struct bfq_stat dequeue; 829 /* total time spent waiting for it to be assigned a timeslice. */ 830 struct bfq_stat group_wait_time; 831 /* time spent idling for this blkcg_gq */ 832 struct bfq_stat idle_time; 833 /* total time with empty current active q with other requests queued */ 834 struct bfq_stat empty_time; 835 /* fields after this shouldn't be cleared on stat reset */ 836 u64 start_group_wait_time; 837 u64 start_idle_time; 838 u64 start_empty_time; 839 uint16_t flags; 840 #endif /* CONFIG_BFQ_CGROUP_DEBUG */ 841 }; 842 843 #ifdef CONFIG_BFQ_GROUP_IOSCHED 844 845 /* 846 * struct bfq_group_data - per-blkcg storage for the blkio subsystem. 847 * 848 * @ps: @blkcg_policy_storage that this structure inherits 849 * @weight: weight of the bfq_group 850 */ 851 struct bfq_group_data { 852 /* must be the first member */ 853 struct blkcg_policy_data pd; 854 855 unsigned int weight; 856 }; 857 858 /** 859 * struct bfq_group - per (device, cgroup) data structure. 860 * @entity: schedulable entity to insert into the parent group sched_data. 861 * @sched_data: own sched_data, to contain child entities (they may be 862 * both bfq_queues and bfq_groups). 863 * @bfqd: the bfq_data for the device this group acts upon. 864 * @async_bfqq: array of async queues for all the tasks belonging to 865 * the group, one queue per ioprio value per ioprio_class, 866 * except for the idle class that has only one queue. 867 * @async_idle_bfqq: async queue for the idle class (ioprio is ignored). 868 * @my_entity: pointer to @entity, %NULL for the toplevel group; used 869 * to avoid too many special cases during group creation/ 870 * migration. 871 * @stats: stats for this bfqg. 872 * @active_entities: number of active entities belonging to the group; 873 * unused for the root group. Used to know whether there 874 * are groups with more than one active @bfq_entity 875 * (see the comments to the function 876 * bfq_bfqq_may_idle()). 877 * @rq_pos_tree: rbtree sorted by next_request position, used when 878 * determining if two or more queues have interleaving 879 * requests (see bfq_find_close_cooperator()). 880 * 881 * Each (device, cgroup) pair has its own bfq_group, i.e., for each cgroup 882 * there is a set of bfq_groups, each one collecting the lower-level 883 * entities belonging to the group that are acting on the same device. 884 * 885 * Locking works as follows: 886 * o @bfqd is protected by the queue lock, RCU is used to access it 887 * from the readers. 888 * o All the other fields are protected by the @bfqd queue lock. 889 */ 890 struct bfq_group { 891 /* must be the first member */ 892 struct blkg_policy_data pd; 893 894 /* cached path for this blkg (see comments in bfq_bic_update_cgroup) */ 895 char blkg_path[128]; 896 897 /* reference counter (see comments in bfq_bic_update_cgroup) */ 898 int ref; 899 900 struct bfq_entity entity; 901 struct bfq_sched_data sched_data; 902 903 void *bfqd; 904 905 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; 906 struct bfq_queue *async_idle_bfqq; 907 908 struct bfq_entity *my_entity; 909 910 int active_entities; 911 912 struct rb_root rq_pos_tree; 913 914 struct bfqg_stats stats; 915 }; 916 917 #else 918 struct bfq_group { 919 struct bfq_sched_data sched_data; 920 921 struct bfq_queue *async_bfqq[2][IOPRIO_BE_NR]; 922 struct bfq_queue *async_idle_bfqq; 923 924 struct rb_root rq_pos_tree; 925 }; 926 #endif 927 928 struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); 929 930 /* --------------- main algorithm interface ----------------- */ 931 932 #define BFQ_SERVICE_TREE_INIT ((struct bfq_service_tree) \ 933 { RB_ROOT, RB_ROOT, NULL, NULL, 0, 0 }) 934 935 extern const int bfq_timeout; 936 937 struct bfq_queue *bic_to_bfqq(struct bfq_io_cq *bic, bool is_sync); 938 void bic_set_bfqq(struct bfq_io_cq *bic, struct bfq_queue *bfqq, bool is_sync); 939 struct bfq_data *bic_to_bfqd(struct bfq_io_cq *bic); 940 void bfq_pos_tree_add_move(struct bfq_data *bfqd, struct bfq_queue *bfqq); 941 void bfq_weights_tree_add(struct bfq_data *bfqd, struct bfq_queue *bfqq, 942 struct rb_root_cached *root); 943 void __bfq_weights_tree_remove(struct bfq_data *bfqd, 944 struct bfq_queue *bfqq, 945 struct rb_root_cached *root); 946 void bfq_weights_tree_remove(struct bfq_data *bfqd, 947 struct bfq_queue *bfqq); 948 void bfq_bfqq_expire(struct bfq_data *bfqd, struct bfq_queue *bfqq, 949 bool compensate, enum bfqq_expiration reason); 950 void bfq_put_queue(struct bfq_queue *bfqq); 951 void bfq_end_wr_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); 952 void bfq_schedule_dispatch(struct bfq_data *bfqd); 953 void bfq_put_async_queues(struct bfq_data *bfqd, struct bfq_group *bfqg); 954 955 /* ------------ end of main algorithm interface -------------- */ 956 957 /* ---------------- cgroups-support interface ---------------- */ 958 959 void bfqg_stats_update_io_add(struct bfq_group *bfqg, struct bfq_queue *bfqq, 960 unsigned int op); 961 void bfqg_stats_update_io_remove(struct bfq_group *bfqg, unsigned int op); 962 void bfqg_stats_update_io_merged(struct bfq_group *bfqg, unsigned int op); 963 void bfqg_stats_update_completion(struct bfq_group *bfqg, u64 start_time_ns, 964 u64 io_start_time_ns, unsigned int op); 965 void bfqg_stats_update_dequeue(struct bfq_group *bfqg); 966 void bfqg_stats_set_start_empty_time(struct bfq_group *bfqg); 967 void bfqg_stats_update_idle_time(struct bfq_group *bfqg); 968 void bfqg_stats_set_start_idle_time(struct bfq_group *bfqg); 969 void bfqg_stats_update_avg_queue_size(struct bfq_group *bfqg); 970 void bfq_bfqq_move(struct bfq_data *bfqd, struct bfq_queue *bfqq, 971 struct bfq_group *bfqg); 972 973 void bfq_init_entity(struct bfq_entity *entity, struct bfq_group *bfqg); 974 void bfq_bic_update_cgroup(struct bfq_io_cq *bic, struct bio *bio); 975 void bfq_end_wr_async(struct bfq_data *bfqd); 976 struct bfq_group *bfq_find_set_group(struct bfq_data *bfqd, 977 struct blkcg *blkcg); 978 struct blkcg_gq *bfqg_to_blkg(struct bfq_group *bfqg); 979 struct bfq_group *bfqq_group(struct bfq_queue *bfqq); 980 struct bfq_group *bfq_create_group_hierarchy(struct bfq_data *bfqd, int node); 981 void bfqg_and_blkg_put(struct bfq_group *bfqg); 982 983 #ifdef CONFIG_BFQ_GROUP_IOSCHED 984 extern struct cftype bfq_blkcg_legacy_files[]; 985 extern struct cftype bfq_blkg_files[]; 986 extern struct blkcg_policy blkcg_policy_bfq; 987 #endif 988 989 /* ------------- end of cgroups-support interface ------------- */ 990 991 /* - interface of the internal hierarchical B-WF2Q+ scheduler - */ 992 993 #ifdef CONFIG_BFQ_GROUP_IOSCHED 994 /* both next loops stop at one of the child entities of the root group */ 995 #define for_each_entity(entity) \ 996 for (; entity ; entity = entity->parent) 997 998 /* 999 * For each iteration, compute parent in advance, so as to be safe if 1000 * entity is deallocated during the iteration. Such a deallocation may 1001 * happen as a consequence of a bfq_put_queue that frees the bfq_queue 1002 * containing entity. 1003 */ 1004 #define for_each_entity_safe(entity, parent) \ 1005 for (; entity && ({ parent = entity->parent; 1; }); entity = parent) 1006 1007 #else /* CONFIG_BFQ_GROUP_IOSCHED */ 1008 /* 1009 * Next two macros are fake loops when cgroups support is not 1010 * enabled. I fact, in such a case, there is only one level to go up 1011 * (to reach the root group). 1012 */ 1013 #define for_each_entity(entity) \ 1014 for (; entity ; entity = NULL) 1015 1016 #define for_each_entity_safe(entity, parent) \ 1017 for (parent = NULL; entity ; entity = parent) 1018 #endif /* CONFIG_BFQ_GROUP_IOSCHED */ 1019 1020 struct bfq_group *bfq_bfqq_to_bfqg(struct bfq_queue *bfqq); 1021 struct bfq_queue *bfq_entity_to_bfqq(struct bfq_entity *entity); 1022 unsigned int bfq_tot_busy_queues(struct bfq_data *bfqd); 1023 struct bfq_service_tree *bfq_entity_service_tree(struct bfq_entity *entity); 1024 struct bfq_entity *bfq_entity_of(struct rb_node *node); 1025 unsigned short bfq_ioprio_to_weight(int ioprio); 1026 void bfq_put_idle_entity(struct bfq_service_tree *st, 1027 struct bfq_entity *entity); 1028 struct bfq_service_tree * 1029 __bfq_entity_update_weight_prio(struct bfq_service_tree *old_st, 1030 struct bfq_entity *entity, 1031 bool update_class_too); 1032 void bfq_bfqq_served(struct bfq_queue *bfqq, int served); 1033 void bfq_bfqq_charge_time(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1034 unsigned long time_ms); 1035 bool __bfq_deactivate_entity(struct bfq_entity *entity, 1036 bool ins_into_idle_tree); 1037 bool next_queue_may_preempt(struct bfq_data *bfqd); 1038 struct bfq_queue *bfq_get_next_queue(struct bfq_data *bfqd); 1039 bool __bfq_bfqd_reset_in_service(struct bfq_data *bfqd); 1040 void bfq_deactivate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1041 bool ins_into_idle_tree, bool expiration); 1042 void bfq_activate_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq); 1043 void bfq_requeue_bfqq(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1044 bool expiration); 1045 void bfq_del_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq, 1046 bool expiration); 1047 void bfq_add_bfqq_busy(struct bfq_data *bfqd, struct bfq_queue *bfqq); 1048 1049 /* --------------- end of interface of B-WF2Q+ ---------------- */ 1050 1051 /* Logging facilities. */ 1052 static inline void bfq_pid_to_str(int pid, char *str, int len) 1053 { 1054 if (pid != -1) 1055 snprintf(str, len, "%d", pid); 1056 else 1057 snprintf(str, len, "SHARED-"); 1058 } 1059 1060 #ifdef CONFIG_BFQ_GROUP_IOSCHED 1061 struct bfq_group *bfqq_group(struct bfq_queue *bfqq); 1062 1063 #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ 1064 char pid_str[MAX_PID_STR_LENGTH]; \ 1065 bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ 1066 blk_add_cgroup_trace_msg((bfqd)->queue, \ 1067 bfqg_to_blkg(bfqq_group(bfqq))->blkcg, \ 1068 "bfq%s%c " fmt, pid_str, \ 1069 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', ##args); \ 1070 } while (0) 1071 1072 #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do { \ 1073 blk_add_cgroup_trace_msg((bfqd)->queue, \ 1074 bfqg_to_blkg(bfqg)->blkcg, fmt, ##args); \ 1075 } while (0) 1076 1077 #else /* CONFIG_BFQ_GROUP_IOSCHED */ 1078 1079 #define bfq_log_bfqq(bfqd, bfqq, fmt, args...) do { \ 1080 char pid_str[MAX_PID_STR_LENGTH]; \ 1081 bfq_pid_to_str((bfqq)->pid, pid_str, MAX_PID_STR_LENGTH); \ 1082 blk_add_trace_msg((bfqd)->queue, "bfq%s%c " fmt, pid_str, \ 1083 bfq_bfqq_sync((bfqq)) ? 'S' : 'A', \ 1084 ##args); \ 1085 } while (0) 1086 #define bfq_log_bfqg(bfqd, bfqg, fmt, args...) do {} while (0) 1087 1088 #endif /* CONFIG_BFQ_GROUP_IOSCHED */ 1089 1090 #define bfq_log(bfqd, fmt, args...) \ 1091 blk_add_trace_msg((bfqd)->queue, "bfq " fmt, ##args) 1092 1093 #endif /* _BFQ_H */ 1094