/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include <linux/blk-cgroup.h>
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE_HD (HZ / 10)
#define DFL_THROTL_SLICE_SSD (HZ / 50)
#define MAX_THROTL_SLICE (HZ)
#define DFL_IDLE_THRESHOLD_SSD (1000L) /* 1 ms */
#define DFL_IDLE_THRESHOLD_HD (100L * 1000) /* 100 ms */
#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
/* default latency target is 0, eg, guarantee IO latency by default */
#define DFL_LATENCY_TARGET (0)

#define SKIP_LATENCY (((u64)1) << BLK_STAT_RES_SHIFT)

static struct blkcg_policy blkcg_policy_throtl;

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;

/*
 * To implement hierarchical throttling, throtl_grps form a tree and bios
 * are dispatched upwards level by level until they reach the top and get
 * issued.  When dispatching bios from the children and local group at each
 * level, if the bios are dispatched into a single bio_list, there's a risk
 * of a local or child group which can queue many bios at once filling up
 * the list starving others.
 *
 * To avoid such starvation, dispatched bios are queued separately
 * according to where they came from.  When they are again dispatched to
 * the parent, they're popped in round-robin order so that no single source
 * hogs the dispatch window.
 *
 * throtl_qnode is used to keep the queued bios separated by their sources.
 * Bios are queued to throtl_qnode which in turn is queued to
 * throtl_service_queue and then dispatched in round-robin order.
 *
 * It's also used to track the reference counts on blkg's.  A qnode always
 * belongs to a throtl_grp and gets queued on itself or the parent, so
 * incrementing the reference of the associated throtl_grp when a qnode is
 * queued and decrementing when dequeued is enough to keep the whole blkg
 * tree pinned while bios are in flight.
 */
struct throtl_qnode {
	struct list_head	node;		/* service_queue->queued[] */
	struct bio_list		bios;		/* queued bios */
	struct throtl_grp	*tg;		/* tg this qnode belongs to */
};

struct throtl_service_queue {
	struct throtl_service_queue *parent_sq;	/* the parent service_queue */

	/*
	 * Bios queued directly to this service_queue or dispatched from
	 * children throtl_grp's.
	 */
	struct list_head	queued[2];	/* throtl_qnode [READ/WRITE] */
	unsigned int		nr_queued[2];	/* number of queued bios */

	/*
	 * RB tree of active children throtl_grp's, which are sorted by
	 * their ->disptime.
	 */
	struct rb_root		pending_tree;	/* RB tree of active tgs */
	struct rb_node		*first_pending;	/* first node in the tree */
	unsigned int		nr_pending;	/* # queued in the tree */
	unsigned long		first_pending_disptime;	/* disptime of the first tg */
	struct timer_list	pending_timer;	/* fires on first_pending_disptime */
};

enum tg_state_flags {
	THROTL_TG_PENDING	= 1 << 0,	/* on parent's pending tree */
	THROTL_TG_WAS_EMPTY	= 1 << 1,	/* bio_lists[] became non-empty */
};

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

enum {
	LIMIT_LOW,
	LIMIT_MAX,
	LIMIT_CNT,
};

struct throtl_grp {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* active throtl group service_queue member */
	struct rb_node rb_node;

	/* throtl_data this group belongs to */
	struct throtl_data *td;

	/* this group's service queue */
	struct throtl_service_queue service_queue;

	/*
	 * qnode_on_self is used when bios are directly queued to this
	 * throtl_grp so that local bios compete fairly with bios
	 * dispatched from children.  qnode_on_parent is used when bios are
	 * dispatched from this throtl_grp into its parent and will compete
	 * with the sibling qnode_on_parents and the parent's
	 * qnode_on_self.
	 */
	struct throtl_qnode qnode_on_self[2];
	struct throtl_qnode qnode_on_parent[2];

	/*
	 * Dispatch time in jiffies. This is the estimated time when group
	 * will unthrottle and is ready to dispatch more bio. It is used as
	 * key to sort active groups in service tree.
	 */
	unsigned long disptime;

	unsigned int flags;

	/* are there any throtl rules between this group and td? */
	bool has_rules[2];

	/* internally used bytes per second rate limits */
	uint64_t bps[2][LIMIT_CNT];
	/* user configured bps limits */
	uint64_t bps_conf[2][LIMIT_CNT];

	/* internally used IOPS limits */
	unsigned int iops[2][LIMIT_CNT];
	/* user configured IOPS limits */
	unsigned int iops_conf[2][LIMIT_CNT];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bio's dispatched in current slice */
	unsigned int io_disp[2];

	unsigned long last_low_overflow_time[2];

	uint64_t last_bytes_disp[2];
	unsigned int last_io_disp[2];

	unsigned long last_check_time;

	unsigned long latency_target; /* us */
	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	unsigned long last_finish_time; /* ns / 1024 */
	unsigned long checked_last_finish_time; /* ns / 1024 */
	unsigned long avg_idletime; /* ns / 1024 */
	unsigned long idletime_threshold; /* us */

	unsigned int bio_cnt; /* total bios */
	unsigned int bad_bio_cnt; /* bios exceeding latency threshold */
	unsigned long bio_cnt_reset_time;
};

/* We measure latency for request size from <= 4k to >= 1M */
#define LATENCY_BUCKET_SIZE 9

struct latency_bucket {
	unsigned long total_latency; /* ns / 1024 */
	int samples;
};

struct avg_latency_bucket {
	unsigned long latency; /* ns / 1024 */
	bool valid;
};

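/*
 * Per-request_queue throttling state.  Holds the top-level service_queue
 * that bios are dispatched into, the currently enforced limit index
 * (LIMIT_LOW or LIMIT_MAX) and the latency/idle bookkeeping used to decide
 * when the low limit can be upgraded or must be downgraded.
 */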
struct throtl_data
{
	/* service tree for active throtl groups */
	struct throtl_service_queue service_queue;

	struct request_queue *queue;

	/* Total Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	unsigned int throtl_slice;

	/* Work for dispatching throttled bios */
	struct work_struct dispatch_work;
	unsigned int limit_index;
	bool limit_valid[LIMIT_CNT];

	unsigned long dft_idletime_threshold; /* us */

	unsigned long low_upgrade_time;
	unsigned long low_downgrade_time;

	unsigned int scale;

	struct latency_bucket tmp_buckets[LATENCY_BUCKET_SIZE];
	struct avg_latency_bucket avg_buckets[LATENCY_BUCKET_SIZE];
	struct latency_bucket __percpu *latency_buckets;
	unsigned long last_calculate_time;

	bool track_bio_latency;
};

static void throtl_pending_timer_fn(unsigned long arg);

static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}

static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}

static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
	return pd_to_blkg(&tg->pd);
}

/**
 * sq_to_tg - return the throtl_grp the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * Return the throtl_grp @sq belongs to.  If @sq is the top-level one
 * embedded in throtl_data, %NULL is returned.
 */
static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
{
	if (sq && sq->parent_sq)
		return container_of(sq, struct throtl_grp, service_queue);
	else
		return NULL;
}

/**
 * sq_to_td - return throtl_data the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * A service_queue can be embedded in either a throtl_grp or throtl_data.
 * Determine the associated throtl_data accordingly and return it.
 */
static struct throtl_data *sq_to_td(struct throtl_service_queue *sq)
{
	struct throtl_grp *tg = sq_to_tg(sq);

	if (tg)
		return tg->td;
	else
		return container_of(sq, struct throtl_data, service_queue);
}

/*
 * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to
 * make IO dispatch smoother.
 * Scale up: linearly scale up according to lapsed time since upgrade. For
 *	     every throtl_slice, the limit scales up 1/2 .low limit till the
 *	     limit hits .max limit
 * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit
 */
static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td)
{
	/* arbitrary value to avoid too big scale */
	if (td->scale < 4096 && time_after_eq(jiffies,
	    td->low_upgrade_time + td->scale * td->throtl_slice))
		td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice;

	return low + (low >> 1) * td->scale;
}

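/*
 * Example: with a .low limit of 100MB/s and td->scale == 2, the adjusted
 * limit is 100 + 50 * 2 = 200MB/s.  tg_bps_limit() and tg_iops_limit()
 * below return the limit for the currently enforced limit_index and cap
 * the scaled value at the configured .max limit.
 */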
static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw)
{
	struct blkcg_gq *blkg = tg_to_blkg(tg);
	struct throtl_data *td;
	uint64_t ret;

	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
		return U64_MAX;

	td = tg->td;
	ret = tg->bps[rw][td->limit_index];
	if (ret == 0 && td->limit_index == LIMIT_LOW)
		return tg->bps[rw][LIMIT_MAX];

	if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] &&
	    tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) {
		uint64_t adjusted;

		adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td);
		ret = min(tg->bps[rw][LIMIT_MAX], adjusted);
	}
	return ret;
}

static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw)
{
	struct blkcg_gq *blkg = tg_to_blkg(tg);
	struct throtl_data *td;
	unsigned int ret;

	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent)
		return UINT_MAX;
	td = tg->td;
	ret = tg->iops[rw][td->limit_index];
	if (ret == 0 && tg->td->limit_index == LIMIT_LOW)
		return tg->iops[rw][LIMIT_MAX];

	if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] &&
	    tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) {
		uint64_t adjusted;

		adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td);
		if (adjusted > UINT_MAX)
			adjusted = UINT_MAX;
		ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted);
	}
	return ret;
}

#define request_bucket_index(sectors) \
	clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1)

/**
 * throtl_log - log debug message via blktrace
 * @sq: the service_queue being reported
 * @fmt: printf format string
 * @args: printf args
 *
 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a
 * throtl_grp; otherwise, just "throtl".
 */
#define throtl_log(sq, fmt, args...)	do {				\
	struct throtl_grp *__tg = sq_to_tg((sq));			\
	struct throtl_data *__td = sq_to_td((sq));			\
									\
	(void)__td;							\
	if (likely(!blk_trace_note_message_enabled(__td->queue)))	\
		break;							\
	if ((__tg)) {							\
		char __pbuf[128];					\
									\
		blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf));	\
		blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \
	} else {							\
		blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
	}								\
} while (0)

static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
	INIT_LIST_HEAD(&qn->node);
	bio_list_init(&qn->bios);
	qn->tg = tg;
}

/**
 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
 * @bio: bio being added
 * @qn: qnode to add bio to
 * @queued: the service_queue->queued[] list @qn belongs to
 *
 * Add @bio to @qn and put @qn on @queued if it's not already on.
 * @qn->tg's reference count is bumped when @qn is activated.  See the
 * comment on top of throtl_qnode definition for details.
 */
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
				 struct list_head *queued)
{
	bio_list_add(&qn->bios, bio);
	if (list_empty(&qn->node)) {
		list_add_tail(&qn->node, queued);
		blkg_get(tg_to_blkg(qn->tg));
	}
}

/**
 * throtl_peek_queued - peek the first bio on a qnode list
 * @queued: the qnode list to peek
 */
static struct bio *throtl_peek_queued(struct list_head *queued)
{
	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
	struct bio *bio;

	if (list_empty(queued))
		return NULL;

	bio = bio_list_peek(&qn->bios);
	WARN_ON_ONCE(!bio);
	return bio;
}

/**
 * throtl_pop_queued - pop the first bio from a qnode list
 * @queued: the qnode list to pop a bio from
 * @tg_to_put: optional out argument for throtl_grp to put
 *
 * Pop the first bio from the qnode list @queued.  After popping, the first
 * qnode is removed from @queued if empty or moved to the end of @queued so
 * that the popping order is round-robin.
 *
 * When the first qnode is removed, its associated throtl_grp should be put
 * too.  If @tg_to_put is NULL, this function automatically puts it;
 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
 * responsible for putting it.
 */
static struct bio *throtl_pop_queued(struct list_head *queued,
				     struct throtl_grp **tg_to_put)
{
	struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node);
	struct bio *bio;

	if (list_empty(queued))
		return NULL;

	bio = bio_list_pop(&qn->bios);
	WARN_ON_ONCE(!bio);

	if (bio_list_empty(&qn->bios)) {
		list_del_init(&qn->node);
		if (tg_to_put)
			*tg_to_put = qn->tg;
		else
			blkg_put(tg_to_blkg(qn->tg));
	} else {
		list_move_tail(&qn->node, queued);
	}

	return bio;
}

/* init a service_queue, assumes the caller zeroed it */
static void throtl_service_queue_init(struct throtl_service_queue *sq)
{
	INIT_LIST_HEAD(&sq->queued[0]);
	INIT_LIST_HEAD(&sq->queued[1]);
	sq->pending_tree = RB_ROOT;
	setup_timer(&sq->pending_timer, throtl_pending_timer_fn,
		    (unsigned long)sq);
}

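/*
 * Allocate and set up a new throtl_grp.  Max limits default to "unlimited"
 * (U64_MAX / UINT_MAX) while the zeroed allocation leaves all low limits
 * at 0, i.e. disabled.
 */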
static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node)
{
	struct throtl_grp *tg;
	int rw;

	tg = kzalloc_node(sizeof(*tg), gfp, node);
	if (!tg)
		return NULL;

	throtl_service_queue_init(&tg->service_queue);

	for (rw = READ; rw <= WRITE; rw++) {
		throtl_qnode_init(&tg->qnode_on_self[rw], tg);
		throtl_qnode_init(&tg->qnode_on_parent[rw], tg);
	}

	RB_CLEAR_NODE(&tg->rb_node);
	tg->bps[READ][LIMIT_MAX] = U64_MAX;
	tg->bps[WRITE][LIMIT_MAX] = U64_MAX;
	tg->iops[READ][LIMIT_MAX] = UINT_MAX;
	tg->iops[WRITE][LIMIT_MAX] = UINT_MAX;
	tg->bps_conf[READ][LIMIT_MAX] = U64_MAX;
	tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX;
	tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX;
	tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX;
	/* LIMIT_LOW will have default value 0 */

	tg->latency_target = DFL_LATENCY_TARGET;

	return &tg->pd;
}

static void throtl_pd_init(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	struct blkcg_gq *blkg = tg_to_blkg(tg);
	struct throtl_data *td = blkg->q->td;
	struct throtl_service_queue *sq = &tg->service_queue;

	/*
	 * If on the default hierarchy, we switch to properly hierarchical
	 * behavior where limits on a given throtl_grp are applied to the
	 * whole subtree rather than just the group itself.  e.g. If 16M
	 * read_bps limit is set on the root group, the whole system can't
	 * exceed 16M for the device.
	 *
	 * If not on the default hierarchy, the broken flat hierarchy
	 * behavior is retained where all throtl_grps are treated as if
	 * they're all separate root groups right below throtl_data.
	 * Limits of a group don't interact with limits of other groups
	 * regardless of the position of the group in the hierarchy.
	 */
	sq->parent_sq = &td->service_queue;
	if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent)
		sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue;
	tg->td = td;

	tg->idletime_threshold = td->dft_idletime_threshold;
}

/*
 * Set has_rules[] if @tg or any of its parents have limits configured.
 * This doesn't require walking up to the top of the hierarchy as the
 * parent's has_rules[] is guaranteed to be correct.
 */
static void tg_update_has_rules(struct throtl_grp *tg)
{
	struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq);
	struct throtl_data *td = tg->td;
	int rw;

	for (rw = READ; rw <= WRITE; rw++)
		tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) ||
			(td->limit_valid[td->limit_index] &&
			 (tg_bps_limit(tg, rw) != U64_MAX ||
			  tg_iops_limit(tg, rw) != UINT_MAX));
}

static void throtl_pd_online(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	/*
	 * We don't want new groups to escape the limits of their ancestors.
	 * Update has_rules[] after a new group is brought online.
	 */
	tg_update_has_rules(tg);
}

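/*
 * Recompute td->limit_valid[LIMIT_LOW]: it is true only while at least one
 * group on the queue still has a low limit configured in either direction.
 */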
static void blk_throtl_update_limit_valid(struct throtl_data *td)
{
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;
	bool low_valid = false;

	rcu_read_lock();
	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
		struct throtl_grp *tg = blkg_to_tg(blkg);

		if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] ||
		    tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
			low_valid = true;
	}
	rcu_read_unlock();

	td->limit_valid[LIMIT_LOW] = low_valid;
}

static void throtl_upgrade_state(struct throtl_data *td);
static void throtl_pd_offline(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);

	tg->bps[READ][LIMIT_LOW] = 0;
	tg->bps[WRITE][LIMIT_LOW] = 0;
	tg->iops[READ][LIMIT_LOW] = 0;
	tg->iops[WRITE][LIMIT_LOW] = 0;

	blk_throtl_update_limit_valid(tg->td);

	if (!tg->td->limit_valid[tg->td->limit_index])
		throtl_upgrade_state(tg->td);
}

static void throtl_pd_free(struct blkg_policy_data *pd)
{
	struct throtl_grp *tg = pd_to_tg(pd);

	del_timer_sync(&tg->service_queue.pending_timer);
	kfree(tg);
}

static struct throtl_grp *
throtl_rb_first(struct throtl_service_queue *parent_sq)
{
	/* Service tree is empty */
	if (!parent_sq->nr_pending)
		return NULL;

	if (!parent_sq->first_pending)
		parent_sq->first_pending = rb_first(&parent_sq->pending_tree);

	if (parent_sq->first_pending)
		return rb_entry_tg(parent_sq->first_pending);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n,
			    struct throtl_service_queue *parent_sq)
{
	if (parent_sq->first_pending == n)
		parent_sq->first_pending = NULL;
	rb_erase_init(n, &parent_sq->pending_tree);
	--parent_sq->nr_pending;
}

static void update_min_dispatch_time(struct throtl_service_queue *parent_sq)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(parent_sq);
	if (!tg)
		return;

	parent_sq->first_pending_disptime = tg->disptime;
}

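/*
 * Insert @tg into its parent's pending_tree, sorted by ->disptime.  The
 * leftmost node is cached in ->first_pending so throtl_rb_first() doesn't
 * have to walk the tree every time.
 */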
static void tg_service_queue_add(struct throtl_grp *tg)
{
	struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq;
	struct rb_node **node = &parent_sq->pending_tree.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		parent_sq->first_pending = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &parent_sq->pending_tree);
}

static void __throtl_enqueue_tg(struct throtl_grp *tg)
{
	tg_service_queue_add(tg);
	tg->flags |= THROTL_TG_PENDING;
	tg->service_queue.parent_sq->nr_pending++;
}

static void throtl_enqueue_tg(struct throtl_grp *tg)
{
	if (!(tg->flags & THROTL_TG_PENDING))
		__throtl_enqueue_tg(tg);
}

static void __throtl_dequeue_tg(struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq);
	tg->flags &= ~THROTL_TG_PENDING;
}

static void throtl_dequeue_tg(struct throtl_grp *tg)
{
	if (tg->flags & THROTL_TG_PENDING)
		__throtl_dequeue_tg(tg);
}

/* Call with queue lock held */
static void throtl_schedule_pending_timer(struct throtl_service_queue *sq,
					  unsigned long expires)
{
	unsigned long max_expire = jiffies + 8 * sq_to_tg(sq)->td->throtl_slice;

	/*
	 * Since we are adjusting the throttle limit dynamically, the sleep
	 * time calculated according to previous limit might be invalid. It's
	 * possible the cgroup sleep time is very long and no other cgroups
	 * have IO running so notify the limit changes. Make sure the cgroup
	 * doesn't sleep too long to avoid the missed notification.
	 */
	if (time_after(expires, max_expire))
		expires = max_expire;
	mod_timer(&sq->pending_timer, expires);
	throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu",
		   expires - jiffies, jiffies);
}

/**
 * throtl_schedule_next_dispatch - schedule the next dispatch cycle
 * @sq: the service_queue to schedule dispatch for
 * @force: force scheduling
 *
 * Arm @sq->pending_timer so that the next dispatch cycle starts on the
 * dispatch time of the first pending child.  Returns %true if either timer
 * is armed or there's no pending child left.  %false if the current
 * dispatch window is still open and the caller should continue
 * dispatching.
 *
 * If @force is %true, the dispatch timer is always scheduled and this
 * function is guaranteed to return %true.  This is to be used when the
 * caller can't dispatch itself and needs to invoke pending_timer
 * unconditionally.  Note that forced scheduling is likely to induce short
 * delay before dispatch starts even if @sq->first_pending_disptime is not
 * in the future and thus shouldn't be used in hot paths.
 */
static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq,
					  bool force)
{
	/* any pending children left? */
	if (!sq->nr_pending)
		return true;

	update_min_dispatch_time(sq);

	/* is the next dispatch time in the future? */
	if (force || time_after(sq->first_pending_disptime, jiffies)) {
		throtl_schedule_pending_timer(sq, sq->first_pending_disptime);
		return true;
	}

	/* tell the caller to continue dispatching */
	return false;
}

static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg,
		bool rw, unsigned long start)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;

	/*
	 * Previous slice has expired. We must have trimmed it after last
	 * bio dispatch. That means since start of last slice, we never used
	 * that bandwidth. Do try to make use of that bandwidth while giving
	 * credit.
	 */
	if (time_after_eq(start, tg->slice_start[rw]))
		tg->slice_start[rw] = start;

	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
	throtl_log(&tg->service_queue,
		   "[%c] new slice with credit start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + tg->td->throtl_slice;
	throtl_log(&tg->service_queue,
		   "[%c] new slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw,
					unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw,
				       unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice);
	throtl_log(&tg->service_queue,
		   "[%c] extend slice start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', tg->slice_start[rw],
		   tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool throtl_slice_used(struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return false;

	return true;
}

/* Trim the used slices and adjust slice start accordingly */
static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps are unlimited (-1), then time slice doesn't get
	 * renewed. Don't try to trim the slice if slice is used. A new
	 * slice will start when appropriate.
	 */
	if (throtl_slice_used(tg, rw))
		return;

	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially cgroup limit was very low resulting in high
	 * slice_end, but later limit was bumped up and bio was dispatched
	 * sooner, then we need to reduce slice_end. A high bogus slice_end
	 * is bad because it does not allow new slice to start.
	 */

	throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice);

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / tg->td->throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) /
		HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * tg->td->throtl_slice;

	throtl_log(&tg->service_queue,
		   "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu",
		   rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
		   tg->slice_start[rw], tg->slice_end[rw], jiffies);
}

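/*
 * Check whether one more IO fits into the iops budget of the current slice.
 * If it does, return true; otherwise return false and, if @wait is passed,
 * store an estimate of how many jiffies to wait in *@wait.
 * tg_with_in_bps_limit() below does the same for the byte budget.
 */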
static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio,
				  unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = tg->td->throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a big value as minimum iops can be
	 * 1 then at max jiffy elapsed should be equivalent of 1 second as we
	 * will allow dispatch after 1 second and after that slice should
	 * have been trimmed.
	 */

	tmp = (u64)tg_iops_limit(tg, rw) * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return true;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ) / tg_iops_limit(tg, rw) + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return false;
}

static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio,
				 unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = tg->td->throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice);

	tmp = tg_bps_limit(tg, rw) * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return true;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg_bps_limit(tg, rw));

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return false;
}

/*
 * Returns whether one can dispatch a bio or not. Also returns approx number
 * of jiffies to wait before this bio is within IO rate and can be dispatched
 */
static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio,
			    unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
	 * Currently whole state machine of group depends on first bio
	 * queued in the group bio list. So one should not be calling
	 * this function with a different bio if there are other bios
	 * queued.
	 */
	BUG_ON(tg->service_queue.nr_queued[rw] &&
	       bio != throtl_peek_queued(&tg->service_queue.queued[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg_bps_limit(tg, rw) == U64_MAX &&
	    tg_iops_limit(tg, rw) == UINT_MAX) {
		if (wait)
			*wait = 0;
		return true;
	}

	/*
	 * If previous slice expired, start a new one otherwise renew/extend
	 * existing slice to make sure it is at least throtl_slice interval
	 * long since now. New slice is started only for empty throttle group.
	 * If there is queued bio, that means there should be an active
	 * slice and it should be extended instead.
	 */
	if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw]))
		throtl_start_new_slice(tg, rw);
	else {
		if (time_before(tg->slice_end[rw],
		    jiffies + tg->td->throtl_slice))
			throtl_extend_slice(tg, rw,
				jiffies + tg->td->throtl_slice);
	}

	if (tg_with_in_bps_limit(tg, bio, &bps_wait) &&
	    tg_with_in_iops_limit(tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return true;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(tg, rw, jiffies + max_wait);

	return false;
}

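/*
 * Charge @bio against @tg's budget for the current slice and remember that
 * the bio has passed through blk-throtl.
 */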
static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_iter.bi_size;
	tg->io_disp[rw]++;
	tg->last_bytes_disp[rw] += bio->bi_iter.bi_size;
	tg->last_io_disp[rw]++;

	/*
	 * BIO_THROTTLED is used to prevent the same bio from being
	 * throttled more than once as a throttled bio will go through
	 * blk-throtl the second time when it eventually gets issued.  Set
	 * it when a bio is being charged to a tg.
	 */
	if (!bio_flagged(bio, BIO_THROTTLED))
		bio_set_flag(bio, BIO_THROTTLED);
}

/**
 * throtl_add_bio_tg - add a bio to the specified throtl_grp
 * @bio: bio to add
 * @qn: qnode to use
 * @tg: the target throtl_grp
 *
 * Add @bio to @tg's service_queue using @qn.  If @qn is not specified,
 * tg->qnode_on_self[] is used.
 */
static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn,
			      struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	bool rw = bio_data_dir(bio);

	if (!qn)
		qn = &tg->qnode_on_self[rw];

	/*
	 * If @tg doesn't currently have any bios queued in the same
	 * direction, queueing @bio can change when @tg should be
	 * dispatched.  Mark that @tg was empty.  This is automatically
	 * cleared on the next tg_update_disptime().
	 */
	if (!sq->nr_queued[rw])
		tg->flags |= THROTL_TG_WAS_EMPTY;

	throtl_qnode_add_bio(bio, qn, &sq->queued[rw]);

	sq->nr_queued[rw]++;
	throtl_enqueue_tg(tg);
}

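/*
 * Recompute @tg's dispatch time from the wait times of the first queued
 * read and write bios and re-sort it in the parent's pending_tree.
 */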
static void tg_update_disptime(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	bio = throtl_peek_queued(&sq->queued[READ]);
	if (bio)
		tg_may_dispatch(tg, bio, &read_wait);

	bio = throtl_peek_queued(&sq->queued[WRITE]);
	if (bio)
		tg_may_dispatch(tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(tg);

	/* see throtl_add_bio_tg() */
	tg->flags &= ~THROTL_TG_WAS_EMPTY;
}

static void start_parent_slice_with_credit(struct throtl_grp *child_tg,
					   struct throtl_grp *parent_tg, bool rw)
{
	if (throtl_slice_used(parent_tg, rw)) {
		throtl_start_new_slice_with_credit(parent_tg, rw,
				child_tg->slice_start[rw]);
	}

}

static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	struct throtl_service_queue *parent_sq = sq->parent_sq;
	struct throtl_grp *parent_tg = sq_to_tg(parent_sq);
	struct throtl_grp *tg_to_put = NULL;
	struct bio *bio;

	/*
	 * @bio is being transferred from @tg to @parent_sq.  Popping a bio
	 * from @tg may put its reference and @parent_sq might end up
	 * getting released prematurely.  Remember the tg to put and put it
	 * after @bio is transferred to @parent_sq.
	 */
	bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put);
	sq->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);

	/*
	 * If our parent is another tg, we just need to transfer @bio to
	 * the parent using throtl_add_bio_tg().  If our parent is
	 * @td->service_queue, @bio is ready to be issued.  Put it on its
	 * bio_lists[] and decrease total number queued.  The caller is
	 * responsible for issuing these bios.
	 */
	if (parent_tg) {
		throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg);
		start_parent_slice_with_credit(tg, parent_tg, rw);
	} else {
		throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw],
				     &parent_sq->queued[rw]);
		BUG_ON(tg->td->nr_queued[rw] <= 0);
		tg->td->nr_queued[rw]--;
	}

	throtl_trim_slice(tg, rw);

	if (tg_to_put)
		blkg_put(tg_to_blkg(tg_to_put));
}

static int throtl_dispatch_tg(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum * 3 / 4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = throtl_peek_queued(&sq->queued[READ])) &&
	       tg_may_dispatch(tg, bio, NULL)) {

		tg_dispatch_one_bio(tg, bio_data_dir(bio));
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = throtl_peek_queued(&sq->queued[WRITE])) &&
	       tg_may_dispatch(tg, bio, NULL)) {

		tg_dispatch_one_bio(tg, bio_data_dir(bio));
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

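/*
 * Walk the pending_tree of @parent_sq and dispatch from every child whose
 * disptime has passed, up to throtl_quantum bios per invocation.  Returns
 * the number of bios moved up to @parent_sq.
 */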
static int throtl_select_dispatch(struct throtl_service_queue *parent_sq)
{
	unsigned int nr_disp = 0;

	while (1) {
		struct throtl_grp *tg = throtl_rb_first(parent_sq);
		struct throtl_service_queue *sq = &tg->service_queue;

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		throtl_dequeue_tg(tg);

		nr_disp += throtl_dispatch_tg(tg);

		if (sq->nr_queued[0] || sq->nr_queued[1])
			tg_update_disptime(tg);

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}

static bool throtl_can_upgrade(struct throtl_data *td,
	struct throtl_grp *this_tg);
/**
 * throtl_pending_timer_fn - timer function for service_queue->pending_timer
 * @arg: the throtl_service_queue being serviced
 *
 * This timer is armed when a child throtl_grp with active bio's becomes
 * pending and queued on the service_queue's pending_tree and expires when
 * the first child throtl_grp should be dispatched.  This function
 * dispatches bio's from the children throtl_grps to the parent
 * service_queue.
 *
 * If the parent's parent is another throtl_grp, dispatching is propagated
 * by either arming its pending_timer or repeating dispatch directly.  If
 * the top-level service_tree is reached, throtl_data->dispatch_work is
 * kicked so that the ready bio's are issued.
 */
static void throtl_pending_timer_fn(unsigned long arg)
{
	struct throtl_service_queue *sq = (void *)arg;
	struct throtl_grp *tg = sq_to_tg(sq);
	struct throtl_data *td = sq_to_td(sq);
	struct request_queue *q = td->queue;
	struct throtl_service_queue *parent_sq;
	bool dispatched;
	int ret;

	spin_lock_irq(q->queue_lock);
	if (throtl_can_upgrade(td, NULL))
		throtl_upgrade_state(td);

again:
	parent_sq = sq->parent_sq;
	dispatched = false;

	while (true) {
		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
			   sq->nr_queued[READ] + sq->nr_queued[WRITE],
			   sq->nr_queued[READ], sq->nr_queued[WRITE]);

		ret = throtl_select_dispatch(sq);
		if (ret) {
			throtl_log(sq, "bios disp=%u", ret);
			dispatched = true;
		}

		if (throtl_schedule_next_dispatch(sq, false))
			break;

		/* this dispatch window is still open, relax and repeat */
		spin_unlock_irq(q->queue_lock);
		cpu_relax();
		spin_lock_irq(q->queue_lock);
	}

	if (!dispatched)
		goto out_unlock;

	if (parent_sq) {
		/* @parent_sq is another throtl_grp, propagate dispatch */
		if (tg->flags & THROTL_TG_WAS_EMPTY) {
			tg_update_disptime(tg);
			if (!throtl_schedule_next_dispatch(parent_sq, false)) {
				/* window is already open, repeat dispatching */
				sq = parent_sq;
				tg = sq_to_tg(sq);
				goto again;
			}
		}
	} else {
		/* reached the top-level, queue issuing */
		queue_work(kthrotld_workqueue, &td->dispatch_work);
	}
out_unlock:
	spin_unlock_irq(q->queue_lock);
}

/**
 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
 * @work: work item being executed
 *
 * This function is queued for execution when bio's reach the bio_lists[]
 * of throtl_data->service_queue.  Those bio's are ready and issued by this
 * function.
 */
static void blk_throtl_dispatch_work_fn(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					      dispatch_work);
	struct throtl_service_queue *td_sq = &td->service_queue;
	struct request_queue *q = td->queue;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;
	int rw;

	bio_list_init(&bio_list_on_stack);

	spin_lock_irq(q->queue_lock);
	for (rw = READ; rw <= WRITE; rw++)
		while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL)))
			bio_list_add(&bio_list_on_stack, bio);
	spin_unlock_irq(q->queue_lock);

	if (!bio_list_empty(&bio_list_on_stack)) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
}

static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	u64 v = *(u64 *)((void *)tg + off);

	if (v == U64_MAX)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	unsigned int v = *(unsigned int *)((void *)tg + off);

	if (v == UINT_MAX)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static int tg_print_conf_u64(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}

static int tg_print_conf_uint(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}

static void tg_conf_updated(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	throtl_log(&tg->service_queue,
		   "limit change rbps=%llu wbps=%llu riops=%u wiops=%u",
		   tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE),
		   tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE));

	/*
	 * Update has_rules[] flags for the updated tg's subtree.  A tg is
	 * considered to have rules if either the tg itself or any of its
	 * ancestors has rules.  This identifies groups without any
	 * restrictions in the whole hierarchy and allows them to bypass
	 * blk-throttle.
	 */
	blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg))
		tg_update_has_rules(blkg_to_tg(blkg));

	/*
	 * We're already holding queue_lock and know @tg is valid.  Let's
	 * apply the new config directly.
	 *
	 * Restart the slices for both READ and WRITE.  It might happen
	 * that a group's limits are dropped suddenly and we don't want to
	 * account recently dispatched IO with the new low rate.
	 */
	throtl_start_new_slice(tg, 0);
	throtl_start_new_slice(tg, 1);

	if (tg->flags & THROTL_TG_PENDING) {
		tg_update_disptime(tg);
		throtl_schedule_next_dispatch(sq->parent_sq, true);
	}
}

static ssize_t tg_set_conf(struct kernfs_open_file *of,
			   char *buf, size_t nbytes, loff_t off, bool is_u64)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	int ret;
	u64 v;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
	if (ret)
		return ret;

	ret = -EINVAL;
	if (sscanf(ctx.body, "%llu", &v) != 1)
		goto out_finish;
	if (!v)
		v = U64_MAX;

	tg = blkg_to_tg(ctx.blkg);

	if (is_u64)
		*(u64 *)((void *)tg + of_cft(of)->private) = v;
	else
		*(unsigned int *)((void *)tg + of_cft(of)->private) = v;

	tg_conf_updated(tg);
	ret = 0;
out_finish:
	blkg_conf_finish(&ctx);
	return ret ?: nbytes;
}

static ssize_t tg_set_conf_u64(struct kernfs_open_file *of,
			       char *buf, size_t nbytes, loff_t off)
{
	return tg_set_conf(of, buf, nbytes, off, true);
}

static ssize_t tg_set_conf_uint(struct kernfs_open_file *of,
				char *buf, size_t nbytes, loff_t off)
{
	return tg_set_conf(of, buf, nbytes, off, false);
}

static struct cftype throtl_legacy_files[] = {
	{
		.name = "throttle.read_bps_device",
		.private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]),
		.seq_show = tg_print_conf_u64,
		.write = tg_set_conf_u64,
	},
	{
		.name = "throttle.write_bps_device",
		.private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]),
		.seq_show = tg_print_conf_u64,
		.write = tg_set_conf_u64,
	},
	{
		.name = "throttle.read_iops_device",
		.private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]),
		.seq_show = tg_print_conf_uint,
		.write = tg_set_conf_uint,
	},
	{
		.name = "throttle.write_iops_device",
		.private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]),
		.seq_show = tg_print_conf_uint,
		.write = tg_set_conf_uint,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = (unsigned long)&blkcg_policy_throtl,
		.seq_show = blkg_print_stat_bytes,
	},
	{
		.name = "throttle.io_serviced",
		.private = (unsigned long)&blkcg_policy_throtl,
		.seq_show = blkg_print_stat_ios,
	},
	{ }	/* terminate */
};

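/*
 * Format one line of "io.max" or "io.low" (selected by @off) for @pd's
 * device.  Nothing is printed while every value still has its default, so
 * unconfigured devices don't show up in the file.
 */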
static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd,
			   int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	const char *dname = blkg_dev_name(pd->blkg);
	char bufs[4][21] = { "max", "max", "max", "max" };
	u64 bps_dft;
	unsigned int iops_dft;
	char idle_time[26] = "";
	char latency_time[26] = "";

	if (!dname)
		return 0;

	if (off == LIMIT_LOW) {
		bps_dft = 0;
		iops_dft = 0;
	} else {
		bps_dft = U64_MAX;
		iops_dft = UINT_MAX;
	}

	if (tg->bps_conf[READ][off] == bps_dft &&
	    tg->bps_conf[WRITE][off] == bps_dft &&
	    tg->iops_conf[READ][off] == iops_dft &&
	    tg->iops_conf[WRITE][off] == iops_dft &&
	    (off != LIMIT_LOW ||
	     (tg->idletime_threshold == tg->td->dft_idletime_threshold &&
	      tg->latency_target == DFL_LATENCY_TARGET)))
		return 0;

	if (tg->bps_conf[READ][off] != bps_dft)
		snprintf(bufs[0], sizeof(bufs[0]), "%llu",
			tg->bps_conf[READ][off]);
	if (tg->bps_conf[WRITE][off] != bps_dft)
		snprintf(bufs[1], sizeof(bufs[1]), "%llu",
			tg->bps_conf[WRITE][off]);
	if (tg->iops_conf[READ][off] != iops_dft)
		snprintf(bufs[2], sizeof(bufs[2]), "%u",
			tg->iops_conf[READ][off]);
	if (tg->iops_conf[WRITE][off] != iops_dft)
		snprintf(bufs[3], sizeof(bufs[3]), "%u",
			tg->iops_conf[WRITE][off]);
	if (off == LIMIT_LOW) {
		if (tg->idletime_threshold == ULONG_MAX)
			strcpy(idle_time, " idle=max");
		else
			snprintf(idle_time, sizeof(idle_time), " idle=%lu",
				tg->idletime_threshold);

		if (tg->latency_target == ULONG_MAX)
			strcpy(latency_time, " latency=max");
		else
			snprintf(latency_time, sizeof(latency_time),
				" latency=%lu", tg->latency_target);
	}

	seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n",
		   dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time,
		   latency_time);
	return 0;
}

static int tg_print_limit(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit,
			  &blkcg_policy_throtl, seq_cft(sf)->private, false);
	return 0;
}

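/*
 * Parse writes to "io.max" or "io.low".  The body is "MAJOR:MINOR" followed
 * by space-separated "rbps=", "wbps=", "riops=" and "wiops=" tokens, plus
 * "idle=" and "latency=" for the low limit; the value "max" (stored as
 * U64_MAX) removes a limit.
 */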
static ssize_t tg_set_limit(struct kernfs_open_file *of,
			  char *buf, size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	u64 v[4];
	unsigned long idle_time;
	unsigned long latency_time;
	int ret;
	int index = of_cft(of)->private;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
	if (ret)
		return ret;

	tg = blkg_to_tg(ctx.blkg);

	v[0] = tg->bps_conf[READ][index];
	v[1] = tg->bps_conf[WRITE][index];
	v[2] = tg->iops_conf[READ][index];
	v[3] = tg->iops_conf[WRITE][index];

	idle_time = tg->idletime_threshold;
	latency_time = tg->latency_target;
	while (true) {
		char tok[27];	/* wiops=18446744073709551616 */
		char *p;
		u64 val = U64_MAX;
		int len;

		if (sscanf(ctx.body, "%26s%n", tok, &len) != 1)
			break;
		if (tok[0] == '\0')
			break;
		ctx.body += len;

		ret = -EINVAL;
		p = tok;
		strsep(&p, "=");
		if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max")))
			goto out_finish;

		ret = -ERANGE;
		if (!val)
			goto out_finish;

		ret = -EINVAL;
		if (!strcmp(tok, "rbps"))
			v[0] = val;
		else if (!strcmp(tok, "wbps"))
			v[1] = val;
		else if (!strcmp(tok, "riops"))
			v[2] = min_t(u64, val, UINT_MAX);
		else if (!strcmp(tok, "wiops"))
			v[3] = min_t(u64, val, UINT_MAX);
		else if (off == LIMIT_LOW && !strcmp(tok, "idle"))
			idle_time = val;
		else if (off == LIMIT_LOW && !strcmp(tok, "latency"))
			latency_time = val;
		else
			goto out_finish;
	}

	tg->bps_conf[READ][index] = v[0];
	tg->bps_conf[WRITE][index] = v[1];
	tg->iops_conf[READ][index] = v[2];
	tg->iops_conf[WRITE][index] = v[3];

	if (index == LIMIT_MAX) {
		tg->bps[READ][index] = v[0];
		tg->bps[WRITE][index] = v[1];
		tg->iops[READ][index] = v[2];
		tg->iops[WRITE][index] = v[3];
	}
	tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW],
		tg->bps_conf[READ][LIMIT_MAX]);
	tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW],
		tg->bps_conf[WRITE][LIMIT_MAX]);
	tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW],
		tg->iops_conf[READ][LIMIT_MAX]);
	tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW],
		tg->iops_conf[WRITE][LIMIT_MAX]);

	if (index == LIMIT_LOW) {
		blk_throtl_update_limit_valid(tg->td);
		if (tg->td->limit_valid[LIMIT_LOW])
			tg->td->limit_index = LIMIT_LOW;
		tg->idletime_threshold = (idle_time == ULONG_MAX) ?
			ULONG_MAX : idle_time;
		tg->latency_target = (latency_time == ULONG_MAX) ?
			ULONG_MAX : latency_time;
	}
	tg_conf_updated(tg);
	ret = 0;
out_finish:
	blkg_conf_finish(&ctx);
	return ret ?: nbytes;
}

static struct cftype throtl_files[] = {
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
	{
		.name = "low",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = tg_print_limit,
		.write = tg_set_limit,
		.private = LIMIT_LOW,
	},
#endif
	{
		.name = "max",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = tg_print_limit,
		.write = tg_set_limit,
		.private = LIMIT_MAX,
	},
	{ }	/* terminate */
};

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_work_sync(&td->dispatch_work);
}

static struct blkcg_policy blkcg_policy_throtl = {
	.dfl_cftypes		= throtl_files,
	.legacy_cftypes		= throtl_legacy_files,

	.pd_alloc_fn		= throtl_pd_alloc,
	.pd_init_fn		= throtl_pd_init,
	.pd_online_fn		= throtl_pd_online,
	.pd_offline_fn		= throtl_pd_offline,
	.pd_free_fn		= throtl_pd_free,
};

static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
{
	unsigned long rtime = jiffies, wtime = jiffies;

	if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
		rtime = tg->last_low_overflow_time[READ];
	if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
		wtime = tg->last_low_overflow_time[WRITE];
	return min(rtime, wtime);
}

/* tg should not be an intermediate node */
static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
{
	struct throtl_service_queue *parent_sq;
	struct throtl_grp *parent = tg;
	unsigned long ret = __tg_last_low_overflow_time(tg);

	while (true) {
		parent_sq = parent->service_queue.parent_sq;
		parent = sq_to_tg(parent_sq);
		if (!parent)
			break;

		/*
		 * The parent doesn't have a low limit, so it always reaches
		 * its low limit.  Its overflow time is useless for its
		 * children.
		 */
		if (!parent->bps[READ][LIMIT_LOW] &&
		    !parent->iops[READ][LIMIT_LOW] &&
		    !parent->bps[WRITE][LIMIT_LOW] &&
		    !parent->iops[WRITE][LIMIT_LOW])
			continue;
		if (time_after(__tg_last_low_overflow_time(parent), ret))
			ret = __tg_last_low_overflow_time(parent);
	}
	return ret;
}

static bool throtl_tg_is_idle(struct throtl_grp *tg)
{
	/*
	 * cgroup is idle if:
	 * - single idle is too long, longer than a fixed value (in case user
	 *   configures a too big threshold) or 4 times of slice
	 * - average think time is more than threshold
	 * - IO latency is largely below threshold
	 */
	unsigned long time = jiffies_to_usecs(4 * tg->td->throtl_slice);

	time = min_t(unsigned long, MAX_IDLE_TIME, time);
	return (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
	       tg->avg_idletime > tg->idletime_threshold ||
	       (tg->latency_target && tg->bio_cnt &&
		tg->bad_bio_cnt * 5 < tg->bio_cnt);
}

static bool throtl_tg_can_upgrade(struct throtl_grp *tg)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	bool read_limit, write_limit;

	/*
	 * if cgroup reaches low limit (if low limit is 0, the cgroup always
	 * reaches), it's ok to upgrade to next limit
	 */
	read_limit = tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW];
	write_limit = tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW];
	if (!read_limit && !write_limit)
		return true;
	if (read_limit && sq->nr_queued[READ] &&
	    (!write_limit || sq->nr_queued[WRITE]))
		return true;
	if (write_limit && sq->nr_queued[WRITE] &&
	    (!read_limit || sq->nr_queued[READ]))
		return true;

	if (time_after_eq(jiffies,
		tg_last_low_overflow_time(tg) + tg->td->throtl_slice) &&
	    throtl_tg_is_idle(tg))
		return true;
	return false;
}

static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg)
{
	while (true) {
		if (throtl_tg_can_upgrade(tg))
			return true;
		tg = sq_to_tg(tg->service_queue.parent_sq);
		if (!tg || !tg_to_blkg(tg)->parent)
			return false;
	}
	return false;
}

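/*
 * Return true if the whole queue may switch back to the max limits: we must
 * have spent at least one throtl_slice at LIMIT_LOW since the last
 * downgrade, and every leaf cgroup other than @this_tg (which the caller is
 * already deciding about) must either have reached its low limit or be idle.
 */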
static bool throtl_can_upgrade(struct throtl_data *td,
	struct throtl_grp *this_tg)
{
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	if (td->limit_index != LIMIT_LOW)
		return false;

	if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice))
		return false;

	rcu_read_lock();
	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
		struct throtl_grp *tg = blkg_to_tg(blkg);

		if (tg == this_tg)
			continue;
		if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
			continue;
		if (!throtl_hierarchy_can_upgrade(tg)) {
			rcu_read_unlock();
			return false;
		}
	}
	rcu_read_unlock();
	return true;
}

static void throtl_upgrade_check(struct throtl_grp *tg)
{
	unsigned long now = jiffies;

	if (tg->td->limit_index != LIMIT_LOW)
		return;

	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
		return;

	tg->last_check_time = now;

	if (!time_after_eq(now,
	    __tg_last_low_overflow_time(tg) + tg->td->throtl_slice))
		return;

	if (throtl_can_upgrade(tg->td, NULL))
		throtl_upgrade_state(tg->td);
}

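/*
 * Switch the queue to LIMIT_MAX and restart dispatch everywhere so bios
 * that were held back by the low limits are pushed out immediately.
 */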
static void throtl_upgrade_state(struct throtl_data *td)
{
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	td->limit_index = LIMIT_MAX;
	td->low_upgrade_time = jiffies;
	td->scale = 0;
	rcu_read_lock();
	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) {
		struct throtl_grp *tg = blkg_to_tg(blkg);
		struct throtl_service_queue *sq = &tg->service_queue;

		tg->disptime = jiffies - 1;
		throtl_select_dispatch(sq);
		throtl_schedule_next_dispatch(sq, false);
	}
	rcu_read_unlock();
	throtl_select_dispatch(&td->service_queue);
	throtl_schedule_next_dispatch(&td->service_queue, false);
	queue_work(kthrotld_workqueue, &td->dispatch_work);
}

static void throtl_downgrade_state(struct throtl_data *td, int new)
{
	td->scale /= 2;

	if (td->scale) {
		td->low_upgrade_time = jiffies - td->scale * td->throtl_slice;
		return;
	}

	td->limit_index = new;
	td->low_downgrade_time = jiffies;
}

static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
{
	struct throtl_data *td = tg->td;
	unsigned long now = jiffies;

	/*
	 * If cgroup is below low limit, consider downgrade and throttle other
	 * cgroups
	 */
	if (time_after_eq(now, td->low_upgrade_time + td->throtl_slice) &&
	    time_after_eq(now, tg_last_low_overflow_time(tg) +
					td->throtl_slice) &&
	    (!throtl_tg_is_idle(tg) ||
	     !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
		return true;
	return false;
}

static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
{
	while (true) {
		if (!throtl_tg_can_downgrade(tg))
			return false;
		tg = sq_to_tg(tg->service_queue.parent_sq);
		if (!tg || !tg_to_blkg(tg)->parent)
			break;
	}
	return true;
}

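/*
 * Called from the bio submission path: compute the rates this group saw
 * since the last check, record when a low limit was exceeded and, if the
 * whole hierarchy has stayed below its low limits, drop the queue back to
 * LIMIT_LOW.
 */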
static void throtl_downgrade_check(struct throtl_grp *tg)
{
	uint64_t bps;
	unsigned int iops;
	unsigned long elapsed_time;
	unsigned long now = jiffies;

	if (tg->td->limit_index != LIMIT_MAX ||
	    !tg->td->limit_valid[LIMIT_LOW])
		return;
	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
		return;
	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
		return;

	elapsed_time = now - tg->last_check_time;
	tg->last_check_time = now;

	if (time_before(now, tg_last_low_overflow_time(tg) +
			tg->td->throtl_slice))
		return;

	if (tg->bps[READ][LIMIT_LOW]) {
		bps = tg->last_bytes_disp[READ] * HZ;
		do_div(bps, elapsed_time);
		if (bps >= tg->bps[READ][LIMIT_LOW])
			tg->last_low_overflow_time[READ] = now;
	}

	if (tg->bps[WRITE][LIMIT_LOW]) {
		bps = tg->last_bytes_disp[WRITE] * HZ;
		do_div(bps, elapsed_time);
		if (bps >= tg->bps[WRITE][LIMIT_LOW])
			tg->last_low_overflow_time[WRITE] = now;
	}

	if (tg->iops[READ][LIMIT_LOW]) {
		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
		if (iops >= tg->iops[READ][LIMIT_LOW])
			tg->last_low_overflow_time[READ] = now;
	}

	if (tg->iops[WRITE][LIMIT_LOW]) {
		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
		if (iops >= tg->iops[WRITE][LIMIT_LOW])
			tg->last_low_overflow_time[WRITE] = now;
	}

	/*
	 * If cgroup is below low limit, consider downgrade and throttle other
	 * cgroups
	 */
	if (throtl_hierarchy_can_downgrade(tg))
		throtl_downgrade_state(tg->td, LIMIT_LOW);

	tg->last_bytes_disp[READ] = 0;
	tg->last_bytes_disp[WRITE] = 0;
	tg->last_io_disp[READ] = 0;
	tg->last_io_disp[WRITE] = 0;
}

static void throtl_downgrade_check(struct throtl_grp *tg)
{
	uint64_t bps;
	unsigned int iops;
	unsigned long elapsed_time;
	unsigned long now = jiffies;

	if (tg->td->limit_index != LIMIT_MAX ||
	    !tg->td->limit_valid[LIMIT_LOW])
		return;
	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
		return;
	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
		return;

	elapsed_time = now - tg->last_check_time;
	tg->last_check_time = now;

	if (time_before(now, tg_last_low_overflow_time(tg) +
			tg->td->throtl_slice))
		return;

	if (tg->bps[READ][LIMIT_LOW]) {
		bps = tg->last_bytes_disp[READ] * HZ;
		do_div(bps, elapsed_time);
		if (bps >= tg->bps[READ][LIMIT_LOW])
			tg->last_low_overflow_time[READ] = now;
	}

	if (tg->bps[WRITE][LIMIT_LOW]) {
		bps = tg->last_bytes_disp[WRITE] * HZ;
		do_div(bps, elapsed_time);
		if (bps >= tg->bps[WRITE][LIMIT_LOW])
			tg->last_low_overflow_time[WRITE] = now;
	}

	if (tg->iops[READ][LIMIT_LOW]) {
		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
		if (iops >= tg->iops[READ][LIMIT_LOW])
			tg->last_low_overflow_time[READ] = now;
	}

	if (tg->iops[WRITE][LIMIT_LOW]) {
		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
		if (iops >= tg->iops[WRITE][LIMIT_LOW])
			tg->last_low_overflow_time[WRITE] = now;
	}

	/*
	 * If cgroup is below low limit, consider downgrade and throttle
	 * other cgroups
	 */
	if (throtl_hierarchy_can_downgrade(tg))
		throtl_downgrade_state(tg->td, LIMIT_LOW);

	tg->last_bytes_disp[READ] = 0;
	tg->last_bytes_disp[WRITE] = 0;
	tg->last_io_disp[READ] = 0;
	tg->last_io_disp[WRITE] = 0;
}

static void blk_throtl_update_idletime(struct throtl_grp *tg)
{
	unsigned long now = ktime_get_ns() >> 10;
	unsigned long last_finish_time = tg->last_finish_time;

	if (now <= last_finish_time || last_finish_time == 0 ||
	    last_finish_time == tg->checked_last_finish_time)
		return;

	tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
	tg->checked_last_finish_time = last_finish_time;
}

#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_update_latency_buckets(struct throtl_data *td)
{
	struct avg_latency_bucket avg_latency[LATENCY_BUCKET_SIZE];
	int i, cpu;
	unsigned long last_latency = 0;
	unsigned long latency;

	if (!blk_queue_nonrot(td->queue))
		return;
	if (time_before(jiffies, td->last_calculate_time + HZ))
		return;
	td->last_calculate_time = jiffies;

	memset(avg_latency, 0, sizeof(avg_latency));
	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
		struct latency_bucket *tmp = &td->tmp_buckets[i];

		for_each_possible_cpu(cpu) {
			struct latency_bucket *bucket;

			/* this isn't race free, but ok in practice */
			bucket = per_cpu_ptr(td->latency_buckets, cpu);
			tmp->total_latency += bucket[i].total_latency;
			tmp->samples += bucket[i].samples;
			bucket[i].total_latency = 0;
			bucket[i].samples = 0;
		}

		if (tmp->samples >= 32) {
			int samples = tmp->samples;

			latency = tmp->total_latency;

			tmp->total_latency = 0;
			tmp->samples = 0;
			latency /= samples;
			if (latency == 0)
				continue;
			avg_latency[i].latency = latency;
		}
	}

	for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
		if (!avg_latency[i].latency) {
			if (td->avg_buckets[i].latency < last_latency)
				td->avg_buckets[i].latency = last_latency;
			continue;
		}

		if (!td->avg_buckets[i].valid)
			latency = avg_latency[i].latency;
		else
			latency = (td->avg_buckets[i].latency * 7 +
				avg_latency[i].latency) >> 3;

		td->avg_buckets[i].latency = max(latency, last_latency);
		td->avg_buckets[i].valid = true;
		last_latency = td->avg_buckets[i].latency;
	}
}
#else
static inline void throtl_update_latency_buckets(struct throtl_data *td)
{
}
#endif
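/*
 * Both blk_throtl_update_idletime() and throtl_update_latency_buckets()
 * above rely on the same 7/8 exponential moving average:
 *
 *	avg = (avg * 7 + sample) >> 3
 *
 * Worked example (illustrative values only): with avg_idletime at 1000
 * (ns/1024, roughly 1ms) and a new think-time sample of 2000, the new
 * average is (7000 + 2000) / 8 = 1125, so a single outlier only moves the
 * estimate by 1/8 of the difference. The per-bucket latencies are
 * additionally clamped with max(..., last_latency) so the tracked average
 * latency never decreases as the request-size bucket grows.
 */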
static void blk_throtl_assoc_bio(struct throtl_grp *tg, struct bio *bio)
{
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
	int ret;

	ret = bio_associate_current(bio);
	if (ret == 0 || ret == -EBUSY)
		bio->bi_cg_private = tg;
	blk_stat_set_issue(&bio->bi_issue_stat, bio_sectors(bio));
#else
	bio_associate_current(bio);
#endif
}

bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg,
		    struct bio *bio)
{
	struct throtl_qnode *qn = NULL;
	struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg);
	struct throtl_service_queue *sq;
	bool rw = bio_data_dir(bio);
	bool throttled = false;
	struct throtl_data *td = tg->td;

	WARN_ON_ONCE(!rcu_read_lock_held());

	/* see throtl_charge_bio() */
	if (bio_flagged(bio, BIO_THROTTLED) || !tg->has_rules[rw])
		goto out;

	spin_lock_irq(q->queue_lock);

	throtl_update_latency_buckets(td);

	if (unlikely(blk_queue_bypass(q)))
		goto out_unlock;

	blk_throtl_assoc_bio(tg, bio);
	blk_throtl_update_idletime(tg);

	sq = &tg->service_queue;

again:
	while (true) {
		if (tg->last_low_overflow_time[rw] == 0)
			tg->last_low_overflow_time[rw] = jiffies;
		throtl_downgrade_check(tg);
		throtl_upgrade_check(tg);
		/* throtl is FIFO - if bios are already queued, should queue */
		if (sq->nr_queued[rw])
			break;

		/* if above limits, break to queue */
		if (!tg_may_dispatch(tg, bio, NULL)) {
			tg->last_low_overflow_time[rw] = jiffies;
			if (throtl_can_upgrade(td, tg)) {
				throtl_upgrade_state(td);
				goto again;
			}
			break;
		}

		/* within limits, let's charge and dispatch directly */
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim the slice even when bios are not being
		 * queued, otherwise it might happen that a bio is not queued
		 * for a long time and the slice keeps on extending while trim
		 * is never called. If limits are then reduced suddenly, all
		 * the IO dispatched so far is accounted at the new low rate
		 * and newly queued IO gets a really long dispatch time.
		 *
		 * So keep on trimming the slice even if the bio is not queued.
		 */
		throtl_trim_slice(tg, rw);

		/*
		 * @bio passed through this layer without being throttled.
		 * Climb up the ladder. If we're already at the top, it
		 * can be executed directly.
		 */
		qn = &tg->qnode_on_parent[rw];
		sq = sq->parent_sq;
		tg = sq_to_tg(sq);
		if (!tg)
			goto out_unlock;
	}

	/* out-of-limit, queue to @tg */
	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
		   rw == READ ? 'R' : 'W',
		   tg->bytes_disp[rw], bio->bi_iter.bi_size,
		   tg_bps_limit(tg, rw),
		   tg->io_disp[rw], tg_iops_limit(tg, rw),
		   sq->nr_queued[READ], sq->nr_queued[WRITE]);

	tg->last_low_overflow_time[rw] = jiffies;

	td->nr_queued[rw]++;
	throtl_add_bio_tg(bio, qn, tg);
	throttled = true;

	/*
	 * Update @tg's dispatch time and force schedule dispatch if @tg
	 * was empty before @bio. The forced scheduling isn't likely to
	 * cause undue delay as @bio is likely to be dispatched directly if
	 * its @tg's disptime is not in the future.
	 */
	if (tg->flags & THROTL_TG_WAS_EMPTY) {
		tg_update_disptime(tg);
		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
	}

out_unlock:
	spin_unlock_irq(q->queue_lock);
out:
	/*
	 * As multiple blk-throtls may stack in the same issue path, we
	 * don't want bios to leave with the flag set. Clear the flag if
	 * being issued.
	 */
	if (!throttled)
		bio_clear_flag(bio, BIO_THROTTLED);

#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
	if (throttled || !td->track_bio_latency)
		bio->bi_issue_stat.stat |= SKIP_LATENCY;
#endif
	return throttled;
}
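/*
 * Condensed view of the loop in blk_throtl_bio() above (descriptive only,
 * not a separate code path): starting at the bio's own group, each level
 * either
 *
 *	1. queues the bio behind already-queued bios of the same direction
 *	   (FIFO), or
 *	2. queues it because tg_may_dispatch() says the group is over its
 *	   budget (possibly upgrading to LIMIT_MAX first and retrying), or
 *	3. charges the bio to the group, trims the slice, and climbs to the
 *	   parent service queue.
 *
 * A bio that reaches the top without being queued is issued directly and
 * the function returns false; a queued bio returns true and is re-issued
 * later by the dispatch work, carrying BIO_THROTTLED so it isn't charged
 * a second time.
 */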
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_track_latency(struct throtl_data *td, sector_t size,
	int op, unsigned long time)
{
	struct latency_bucket *latency;
	int index;

	if (!td || td->limit_index != LIMIT_LOW || op != REQ_OP_READ ||
	    !blk_queue_nonrot(td->queue))
		return;

	index = request_bucket_index(size);

	latency = get_cpu_ptr(td->latency_buckets);
	latency[index].total_latency += time;
	latency[index].samples++;
	put_cpu_ptr(td->latency_buckets);
}

void blk_throtl_stat_add(struct request *rq, u64 time_ns)
{
	struct request_queue *q = rq->q;
	struct throtl_data *td = q->td;

	throtl_track_latency(td, blk_stat_size(&rq->issue_stat),
		req_op(rq), time_ns >> 10);
}

void blk_throtl_bio_endio(struct bio *bio)
{
	struct throtl_grp *tg;
	u64 finish_time_ns;
	unsigned long finish_time;
	unsigned long start_time;
	unsigned long lat;

	tg = bio->bi_cg_private;
	if (!tg)
		return;
	bio->bi_cg_private = NULL;

	finish_time_ns = ktime_get_ns();
	tg->last_finish_time = finish_time_ns >> 10;

	start_time = blk_stat_time(&bio->bi_issue_stat) >> 10;
	finish_time = __blk_stat_time(finish_time_ns) >> 10;
	if (!start_time || finish_time <= start_time)
		return;

	lat = finish_time - start_time;
	/* this is only for bio based driver */
	if (!(bio->bi_issue_stat.stat & SKIP_LATENCY))
		throtl_track_latency(tg->td, blk_stat_size(&bio->bi_issue_stat),
			bio_op(bio), lat);

	if (tg->latency_target) {
		int bucket;
		unsigned int threshold;

		bucket = request_bucket_index(
			blk_stat_size(&bio->bi_issue_stat));
		threshold = tg->td->avg_buckets[bucket].latency +
			tg->latency_target;
		if (lat > threshold)
			tg->bad_bio_cnt++;
		/*
		 * Not race free, could get wrong count, which means cgroups
		 * will be throttled
		 */
		tg->bio_cnt++;
	}

	if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
		tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
		tg->bio_cnt /= 2;
		tg->bad_bio_cnt /= 2;
	}
}
#endif

/*
 * Dispatch all bios from all children tg's queued on @parent_sq.  On
 * return, @parent_sq is guaranteed to not have any active children tg's
 * and all bios from previously active tg's are on @parent_sq->bio_lists[].
 */
static void tg_drain_bios(struct throtl_service_queue *parent_sq)
{
	struct throtl_grp *tg;

	while ((tg = throtl_rb_first(parent_sq))) {
		struct throtl_service_queue *sq = &tg->service_queue;
		struct bio *bio;

		throtl_dequeue_tg(tg);

		while ((bio = throtl_peek_queued(&sq->queued[READ])))
			tg_dispatch_one_bio(tg, bio_data_dir(bio));
		while ((bio = throtl_peek_queued(&sq->queued[WRITE])))
			tg_dispatch_one_bio(tg, bio_data_dir(bio));
	}
}
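/*
 * Worked example for the latency-target accounting above (illustrative
 * numbers only): if the bucket for this request size has an average
 * latency of 80us (td->avg_buckets[bucket].latency) and the cgroup
 * configured a latency_target of 20us, any tracked bio in that bucket
 * taking longer than 100us bumps bad_bio_cnt. throtl_tg_is_idle() then
 * treats the cgroup as meeting its target as long as bad_bio_cnt stays
 * below 20% of bio_cnt; both counters are halved every throtl_slice (or
 * once bio_cnt exceeds 1024) so old history decays.
 */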
/**
 * blk_throtl_drain - drain throttled bios
 * @q: request_queue to drain throttled bios for
 *
 * Dispatch all currently throttled bios on @q through ->make_request_fn().
 */
void blk_throtl_drain(struct request_queue *q)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct throtl_data *td = q->td;
	struct blkcg_gq *blkg;
	struct cgroup_subsys_state *pos_css;
	struct bio *bio;
	int rw;

	queue_lockdep_assert_held(q);
	rcu_read_lock();

	/*
	 * Drain each tg while doing post-order walk on the blkg tree, so
	 * that all bios are propagated to td->service_queue.  It'd be
	 * better to walk the service_queue tree directly but the blkg walk
	 * is easier.
	 */
	blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg)
		tg_drain_bios(&blkg_to_tg(blkg)->service_queue);

	/* finally, transfer bios from top-level tg's into the td */
	tg_drain_bios(&td->service_queue);

	rcu_read_unlock();
	spin_unlock_irq(q->queue_lock);

	/* all bios now should be in td->service_queue, issue them */
	for (rw = READ; rw <= WRITE; rw++)
		while ((bio = throtl_pop_queued(&td->service_queue.queued[rw],
						NULL)))
			generic_make_request(bio);

	spin_lock_irq(q->queue_lock);
}

int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	int ret;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;
	td->latency_buckets = __alloc_percpu(sizeof(struct latency_bucket) *
		LATENCY_BUCKET_SIZE, __alignof__(u64));
	if (!td->latency_buckets) {
		kfree(td);
		return -ENOMEM;
	}

	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
	throtl_service_queue_init(&td->service_queue);

	q->td = td;
	td->queue = q;

	td->limit_valid[LIMIT_MAX] = true;
	td->limit_index = LIMIT_MAX;
	td->low_upgrade_time = jiffies;
	td->low_downgrade_time = jiffies;

	/* activate policy */
	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
	if (ret) {
		free_percpu(td->latency_buckets);
		kfree(td);
	}
	return ret;
}

void blk_throtl_exit(struct request_queue *q)
{
	BUG_ON(!q->td);
	throtl_shutdown_wq(q);
	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
	free_percpu(q->td->latency_buckets);
	kfree(q->td);
}

void blk_throtl_register_queue(struct request_queue *q)
{
	struct throtl_data *td;
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	td = q->td;
	BUG_ON(!td);

	if (blk_queue_nonrot(q)) {
		td->throtl_slice = DFL_THROTL_SLICE_SSD;
		td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_SSD;
	} else {
		td->throtl_slice = DFL_THROTL_SLICE_HD;
		td->dft_idletime_threshold = DFL_IDLE_THRESHOLD_HD;
	}
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
	/* if no low limit, use previous default */
	td->throtl_slice = DFL_THROTL_SLICE_HD;
#endif

	td->track_bio_latency = !q->mq_ops && !q->request_fn;
	if (!td->track_bio_latency)
		blk_stat_enable_accounting(q);

	/*
	 * Some tgs are created before the queue is fully initialized, e.g.
	 * the nonrot flag isn't set yet, so fix up their idle thresholds
	 * now.
	 */
	rcu_read_lock();
	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
		struct throtl_grp *tg = blkg_to_tg(blkg);

		tg->idletime_threshold = td->dft_idletime_threshold;
	}
	rcu_read_unlock();
}
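/*
 * Illustrative summary of the defaults picked in blk_throtl_register_queue()
 * above, using the DFL_* macros defined at the top of this file: a
 * non-rotational queue ends up with a 20ms throtl_slice (HZ/50) and a 1ms
 * idle threshold, while a rotational queue gets a 100ms slice and a 100ms
 * idle threshold. Without CONFIG_BLK_DEV_THROTTLING_LOW the slice always
 * falls back to the old 100ms default regardless of device type.
 */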
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
{
	if (!q->td)
		return -EINVAL;
	return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
}

ssize_t blk_throtl_sample_time_store(struct request_queue *q,
	const char *page, size_t count)
{
	unsigned long v;
	unsigned long t;

	if (!q->td)
		return -EINVAL;
	if (kstrtoul(page, 10, &v))
		return -EINVAL;
	t = msecs_to_jiffies(v);
	if (t == 0 || t > MAX_THROTL_SLICE)
		return -EINVAL;
	q->td->throtl_slice = t;
	return count;
}
#endif

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	return blkcg_policy_register(&blkcg_policy_throtl);
}

module_init(throtl_init);
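/*
 * Usage note for the show/store pair above: they back a request_queue
 * sysfs attribute (exposed as "throttle_sample_time" under
 * /sys/block/<dev>/queue/, assuming the usual wiring in blk-sysfs.c when
 * CONFIG_BLK_DEV_THROTTLING_LOW is set). Values are read and written in
 * milliseconds and clamped to at most MAX_THROTL_SLICE (one second), so
 * writing "50" sets a 50ms sampling slice while "0" or anything above
 * 1000 is rejected with -EINVAL.
 */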