// SPDX-License-Identifier: GPL-2.0
/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk.h"
#include "blk-cgroup-rwstat.h"
#include "blk-stat.h"
#include "blk-throttle.h"

/* Max dispatch from a group in 1 round */
#define THROTL_GRP_QUANTUM 8

/* Total max dispatch from all groups in one round */
#define THROTL_QUANTUM 32

/* Throttling is performed over a slice and after that slice is renewed */
#define DFL_THROTL_SLICE_HD (HZ / 10)
#define DFL_THROTL_SLICE_SSD (HZ / 50)
#define MAX_THROTL_SLICE (HZ)
#define MAX_IDLE_TIME (5L * 1000 * 1000) /* 5 s */
#define MIN_THROTL_BPS (320 * 1024)
#define MIN_THROTL_IOPS (10)
#define DFL_LATENCY_TARGET (-1L)
#define DFL_IDLE_THRESHOLD (0)
#define DFL_HD_BASELINE_LATENCY (4000L) /* 4ms */
#define LATENCY_FILTERED_SSD (0)
/*
 * For HD, very small latencies come from sequential IO. Such IO doesn't help
 * determine whether a cgroup's IO is being impacted by others, hence we
 * ignore it.
 */
#define LATENCY_FILTERED_HD (1000L) /* 1ms */

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

/* We measure latency for request size from <= 4k to >= 1M */
#define LATENCY_BUCKET_SIZE 9

struct latency_bucket {
	unsigned long total_latency; /* ns / 1024 */
	int samples;
};

struct avg_latency_bucket {
	unsigned long latency; /* ns / 1024 */
	bool valid;
};

struct throtl_data
{
	/* service tree for active throtl groups */
	struct throtl_service_queue service_queue;

	struct request_queue *queue;

	/* Total number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	unsigned int throtl_slice;

	/* Work for dispatching throttled bios */
	struct work_struct dispatch_work;
	unsigned int limit_index;
	bool limit_valid[LIMIT_CNT];

	unsigned long low_upgrade_time;
	unsigned long low_downgrade_time;

	unsigned int scale;

	struct latency_bucket tmp_buckets[2][LATENCY_BUCKET_SIZE];
	struct avg_latency_bucket avg_buckets[2][LATENCY_BUCKET_SIZE];
	struct latency_bucket __percpu *latency_buckets[2];
	unsigned long last_calculate_time;
	unsigned long filtered_latency;

	bool track_bio_latency;
};

static void throtl_pending_timer_fn(struct timer_list *t);

static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
	return pd_to_blkg(&tg->pd);
}

/**
 * sq_to_tg - return the throtl_grp the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * Return the throtl_grp @sq belongs to. If @sq is the top-level one
 * embedded in throtl_data, %NULL is returned.
 */
static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq)
{
	if (sq && sq->parent_sq)
		return container_of(sq, struct throtl_grp, service_queue);
	else
		return NULL;
}

/**
 * sq_to_td - return throtl_data the specified service queue belongs to
 * @sq: the throtl_service_queue of interest
 *
 * A service_queue can be embedded in either a throtl_grp or throtl_data.
 * Determine the associated throtl_data accordingly and return it.
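 * (The top-level service_queue has no parent_sq, so sq_to_tg() above returns
 * %NULL for it and the containing throtl_data is recovered with
 * container_of() instead.)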
118 */ 119 static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) 120 { 121 struct throtl_grp *tg = sq_to_tg(sq); 122 123 if (tg) 124 return tg->td; 125 else 126 return container_of(sq, struct throtl_data, service_queue); 127 } 128 129 /* 130 * cgroup's limit in LIMIT_MAX is scaled if low limit is set. This scale is to 131 * make the IO dispatch more smooth. 132 * Scale up: linearly scale up according to elapsed time since upgrade. For 133 * every throtl_slice, the limit scales up 1/2 .low limit till the 134 * limit hits .max limit 135 * Scale down: exponentially scale down if a cgroup doesn't hit its .low limit 136 */ 137 static uint64_t throtl_adjusted_limit(uint64_t low, struct throtl_data *td) 138 { 139 /* arbitrary value to avoid too big scale */ 140 if (td->scale < 4096 && time_after_eq(jiffies, 141 td->low_upgrade_time + td->scale * td->throtl_slice)) 142 td->scale = (jiffies - td->low_upgrade_time) / td->throtl_slice; 143 144 return low + (low >> 1) * td->scale; 145 } 146 147 static uint64_t tg_bps_limit(struct throtl_grp *tg, int rw) 148 { 149 struct blkcg_gq *blkg = tg_to_blkg(tg); 150 struct throtl_data *td; 151 uint64_t ret; 152 153 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) 154 return U64_MAX; 155 156 td = tg->td; 157 ret = tg->bps[rw][td->limit_index]; 158 if (ret == 0 && td->limit_index == LIMIT_LOW) { 159 /* intermediate node or iops isn't 0 */ 160 if (!list_empty(&blkg->blkcg->css.children) || 161 tg->iops[rw][td->limit_index]) 162 return U64_MAX; 163 else 164 return MIN_THROTL_BPS; 165 } 166 167 if (td->limit_index == LIMIT_MAX && tg->bps[rw][LIMIT_LOW] && 168 tg->bps[rw][LIMIT_LOW] != tg->bps[rw][LIMIT_MAX]) { 169 uint64_t adjusted; 170 171 adjusted = throtl_adjusted_limit(tg->bps[rw][LIMIT_LOW], td); 172 ret = min(tg->bps[rw][LIMIT_MAX], adjusted); 173 } 174 return ret; 175 } 176 177 static unsigned int tg_iops_limit(struct throtl_grp *tg, int rw) 178 { 179 struct blkcg_gq *blkg = tg_to_blkg(tg); 180 struct throtl_data *td; 181 unsigned int ret; 182 183 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && !blkg->parent) 184 return UINT_MAX; 185 186 td = tg->td; 187 ret = tg->iops[rw][td->limit_index]; 188 if (ret == 0 && tg->td->limit_index == LIMIT_LOW) { 189 /* intermediate node or bps isn't 0 */ 190 if (!list_empty(&blkg->blkcg->css.children) || 191 tg->bps[rw][td->limit_index]) 192 return UINT_MAX; 193 else 194 return MIN_THROTL_IOPS; 195 } 196 197 if (td->limit_index == LIMIT_MAX && tg->iops[rw][LIMIT_LOW] && 198 tg->iops[rw][LIMIT_LOW] != tg->iops[rw][LIMIT_MAX]) { 199 uint64_t adjusted; 200 201 adjusted = throtl_adjusted_limit(tg->iops[rw][LIMIT_LOW], td); 202 if (adjusted > UINT_MAX) 203 adjusted = UINT_MAX; 204 ret = min_t(unsigned int, tg->iops[rw][LIMIT_MAX], adjusted); 205 } 206 return ret; 207 } 208 209 #define request_bucket_index(sectors) \ 210 clamp_t(int, order_base_2(sectors) - 3, 0, LATENCY_BUCKET_SIZE - 1) 211 212 /** 213 * throtl_log - log debug message via blktrace 214 * @sq: the service_queue being reported 215 * @fmt: printf format string 216 * @args: printf args 217 * 218 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a 219 * throtl_grp; otherwise, just "throtl". 220 */ 221 #define throtl_log(sq, fmt, args...) 
	do {							\
		struct throtl_grp *__tg = sq_to_tg((sq));	\
		struct throtl_data *__td = sq_to_td((sq));	\
								\
		(void)__td;					\
		if (likely(!blk_trace_note_message_enabled(__td->queue)))	\
			break;					\
		if ((__tg)) {					\
			blk_add_cgroup_trace_msg(__td->queue,	\
				&tg_to_blkg(__tg)->blkcg->css, "throtl " fmt, ##args);\
		} else {					\
			blk_add_trace_msg(__td->queue, "throtl " fmt, ##args);	\
		}						\
	} while (0)

static inline unsigned int throtl_bio_data_size(struct bio *bio)
{
	/* assume it's one sector */
	if (unlikely(bio_op(bio) == REQ_OP_DISCARD))
		return 512;
	return bio->bi_iter.bi_size;
}

static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg)
{
	INIT_LIST_HEAD(&qn->node);
	bio_list_init(&qn->bios);
	qn->tg = tg;
}

/**
 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it
 * @bio: bio being added
 * @qn: qnode to add bio to
 * @queued: the service_queue->queued[] list @qn belongs to
 *
 * Add @bio to @qn and put @qn on @queued if it's not already on.
 * @qn->tg's reference count is bumped when @qn is activated. See the
 * comment on top of throtl_qnode definition for details.
 */
static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn,
				 struct list_head *queued)
{
	bio_list_add(&qn->bios, bio);
	if (list_empty(&qn->node)) {
		list_add_tail(&qn->node, queued);
		blkg_get(tg_to_blkg(qn->tg));
	}
}

/**
 * throtl_peek_queued - peek the first bio on a qnode list
 * @queued: the qnode list to peek
 */
static struct bio *throtl_peek_queued(struct list_head *queued)
{
	struct throtl_qnode *qn;
	struct bio *bio;

	if (list_empty(queued))
		return NULL;

	qn = list_first_entry(queued, struct throtl_qnode, node);
	bio = bio_list_peek(&qn->bios);
	WARN_ON_ONCE(!bio);
	return bio;
}

/**
 * throtl_pop_queued - pop the first bio from a qnode list
 * @queued: the qnode list to pop a bio from
 * @tg_to_put: optional out argument for throtl_grp to put
 *
 * Pop the first bio from the qnode list @queued. After popping, the first
 * qnode is removed from @queued if empty or moved to the end of @queued so
 * that the popping order is round-robin.
 *
 * When the first qnode is removed, its associated throtl_grp should be put
 * too. If @tg_to_put is NULL, this function automatically puts it;
 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is
 * responsible for putting it.
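 *
 * For example, if @queued holds qnodes for children A and B (in that order),
 * a pop takes the first bio from A and then rotates A behind B, so the next
 * pop serves B. This keeps one child from monopolizing the parent's dispatch
 * capacity while bios drain upwards.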
302 */ 303 static struct bio *throtl_pop_queued(struct list_head *queued, 304 struct throtl_grp **tg_to_put) 305 { 306 struct throtl_qnode *qn; 307 struct bio *bio; 308 309 if (list_empty(queued)) 310 return NULL; 311 312 qn = list_first_entry(queued, struct throtl_qnode, node); 313 bio = bio_list_pop(&qn->bios); 314 WARN_ON_ONCE(!bio); 315 316 if (bio_list_empty(&qn->bios)) { 317 list_del_init(&qn->node); 318 if (tg_to_put) 319 *tg_to_put = qn->tg; 320 else 321 blkg_put(tg_to_blkg(qn->tg)); 322 } else { 323 list_move_tail(&qn->node, queued); 324 } 325 326 return bio; 327 } 328 329 /* init a service_queue, assumes the caller zeroed it */ 330 static void throtl_service_queue_init(struct throtl_service_queue *sq) 331 { 332 INIT_LIST_HEAD(&sq->queued[READ]); 333 INIT_LIST_HEAD(&sq->queued[WRITE]); 334 sq->pending_tree = RB_ROOT_CACHED; 335 timer_setup(&sq->pending_timer, throtl_pending_timer_fn, 0); 336 } 337 338 static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, 339 struct request_queue *q, 340 struct blkcg *blkcg) 341 { 342 struct throtl_grp *tg; 343 int rw; 344 345 tg = kzalloc_node(sizeof(*tg), gfp, q->node); 346 if (!tg) 347 return NULL; 348 349 if (blkg_rwstat_init(&tg->stat_bytes, gfp)) 350 goto err_free_tg; 351 352 if (blkg_rwstat_init(&tg->stat_ios, gfp)) 353 goto err_exit_stat_bytes; 354 355 throtl_service_queue_init(&tg->service_queue); 356 357 for (rw = READ; rw <= WRITE; rw++) { 358 throtl_qnode_init(&tg->qnode_on_self[rw], tg); 359 throtl_qnode_init(&tg->qnode_on_parent[rw], tg); 360 } 361 362 RB_CLEAR_NODE(&tg->rb_node); 363 tg->bps[READ][LIMIT_MAX] = U64_MAX; 364 tg->bps[WRITE][LIMIT_MAX] = U64_MAX; 365 tg->iops[READ][LIMIT_MAX] = UINT_MAX; 366 tg->iops[WRITE][LIMIT_MAX] = UINT_MAX; 367 tg->bps_conf[READ][LIMIT_MAX] = U64_MAX; 368 tg->bps_conf[WRITE][LIMIT_MAX] = U64_MAX; 369 tg->iops_conf[READ][LIMIT_MAX] = UINT_MAX; 370 tg->iops_conf[WRITE][LIMIT_MAX] = UINT_MAX; 371 /* LIMIT_LOW will have default value 0 */ 372 373 tg->latency_target = DFL_LATENCY_TARGET; 374 tg->latency_target_conf = DFL_LATENCY_TARGET; 375 tg->idletime_threshold = DFL_IDLE_THRESHOLD; 376 tg->idletime_threshold_conf = DFL_IDLE_THRESHOLD; 377 378 return &tg->pd; 379 380 err_exit_stat_bytes: 381 blkg_rwstat_exit(&tg->stat_bytes); 382 err_free_tg: 383 kfree(tg); 384 return NULL; 385 } 386 387 static void throtl_pd_init(struct blkg_policy_data *pd) 388 { 389 struct throtl_grp *tg = pd_to_tg(pd); 390 struct blkcg_gq *blkg = tg_to_blkg(tg); 391 struct throtl_data *td = blkg->q->td; 392 struct throtl_service_queue *sq = &tg->service_queue; 393 394 /* 395 * If on the default hierarchy, we switch to properly hierarchical 396 * behavior where limits on a given throtl_grp are applied to the 397 * whole subtree rather than just the group itself. e.g. If 16M 398 * read_bps limit is set on a parent group, summary bps of 399 * parent group and its subtree groups can't exceed 16M for the 400 * device. 401 * 402 * If not on the default hierarchy, the broken flat hierarchy 403 * behavior is retained where all throtl_grps are treated as if 404 * they're all separate root groups right below throtl_data. 405 * Limits of a group don't interact with limits of other groups 406 * regardless of the position of the group in the hierarchy. 407 */ 408 sq->parent_sq = &td->service_queue; 409 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) 410 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; 411 tg->td = td; 412 } 413 414 /* 415 * Set has_rules[] if @tg or any of its parents have limits configured. 
416 * This doesn't require walking up to the top of the hierarchy as the 417 * parent's has_rules[] is guaranteed to be correct. 418 */ 419 static void tg_update_has_rules(struct throtl_grp *tg) 420 { 421 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); 422 struct throtl_data *td = tg->td; 423 int rw; 424 425 for (rw = READ; rw <= WRITE; rw++) { 426 tg->has_rules_iops[rw] = 427 (parent_tg && parent_tg->has_rules_iops[rw]) || 428 (td->limit_valid[td->limit_index] && 429 tg_iops_limit(tg, rw) != UINT_MAX); 430 tg->has_rules_bps[rw] = 431 (parent_tg && parent_tg->has_rules_bps[rw]) || 432 (td->limit_valid[td->limit_index] && 433 (tg_bps_limit(tg, rw) != U64_MAX)); 434 } 435 } 436 437 static void throtl_pd_online(struct blkg_policy_data *pd) 438 { 439 struct throtl_grp *tg = pd_to_tg(pd); 440 /* 441 * We don't want new groups to escape the limits of its ancestors. 442 * Update has_rules[] after a new group is brought online. 443 */ 444 tg_update_has_rules(tg); 445 } 446 447 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 448 static void blk_throtl_update_limit_valid(struct throtl_data *td) 449 { 450 struct cgroup_subsys_state *pos_css; 451 struct blkcg_gq *blkg; 452 bool low_valid = false; 453 454 rcu_read_lock(); 455 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { 456 struct throtl_grp *tg = blkg_to_tg(blkg); 457 458 if (tg->bps[READ][LIMIT_LOW] || tg->bps[WRITE][LIMIT_LOW] || 459 tg->iops[READ][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) { 460 low_valid = true; 461 break; 462 } 463 } 464 rcu_read_unlock(); 465 466 td->limit_valid[LIMIT_LOW] = low_valid; 467 } 468 #else 469 static inline void blk_throtl_update_limit_valid(struct throtl_data *td) 470 { 471 } 472 #endif 473 474 static void throtl_upgrade_state(struct throtl_data *td); 475 static void throtl_pd_offline(struct blkg_policy_data *pd) 476 { 477 struct throtl_grp *tg = pd_to_tg(pd); 478 479 tg->bps[READ][LIMIT_LOW] = 0; 480 tg->bps[WRITE][LIMIT_LOW] = 0; 481 tg->iops[READ][LIMIT_LOW] = 0; 482 tg->iops[WRITE][LIMIT_LOW] = 0; 483 484 blk_throtl_update_limit_valid(tg->td); 485 486 if (!tg->td->limit_valid[tg->td->limit_index]) 487 throtl_upgrade_state(tg->td); 488 } 489 490 static void throtl_pd_free(struct blkg_policy_data *pd) 491 { 492 struct throtl_grp *tg = pd_to_tg(pd); 493 494 del_timer_sync(&tg->service_queue.pending_timer); 495 blkg_rwstat_exit(&tg->stat_bytes); 496 blkg_rwstat_exit(&tg->stat_ios); 497 kfree(tg); 498 } 499 500 static struct throtl_grp * 501 throtl_rb_first(struct throtl_service_queue *parent_sq) 502 { 503 struct rb_node *n; 504 505 n = rb_first_cached(&parent_sq->pending_tree); 506 WARN_ON_ONCE(!n); 507 if (!n) 508 return NULL; 509 return rb_entry_tg(n); 510 } 511 512 static void throtl_rb_erase(struct rb_node *n, 513 struct throtl_service_queue *parent_sq) 514 { 515 rb_erase_cached(n, &parent_sq->pending_tree); 516 RB_CLEAR_NODE(n); 517 } 518 519 static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) 520 { 521 struct throtl_grp *tg; 522 523 tg = throtl_rb_first(parent_sq); 524 if (!tg) 525 return; 526 527 parent_sq->first_pending_disptime = tg->disptime; 528 } 529 530 static void tg_service_queue_add(struct throtl_grp *tg) 531 { 532 struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; 533 struct rb_node **node = &parent_sq->pending_tree.rb_root.rb_node; 534 struct rb_node *parent = NULL; 535 struct throtl_grp *__tg; 536 unsigned long key = tg->disptime; 537 bool leftmost = true; 538 539 while (*node != NULL) { 540 parent = *node; 541 __tg = 
rb_entry_tg(parent); 542 543 if (time_before(key, __tg->disptime)) 544 node = &parent->rb_left; 545 else { 546 node = &parent->rb_right; 547 leftmost = false; 548 } 549 } 550 551 rb_link_node(&tg->rb_node, parent, node); 552 rb_insert_color_cached(&tg->rb_node, &parent_sq->pending_tree, 553 leftmost); 554 } 555 556 static void throtl_enqueue_tg(struct throtl_grp *tg) 557 { 558 if (!(tg->flags & THROTL_TG_PENDING)) { 559 tg_service_queue_add(tg); 560 tg->flags |= THROTL_TG_PENDING; 561 tg->service_queue.parent_sq->nr_pending++; 562 } 563 } 564 565 static void throtl_dequeue_tg(struct throtl_grp *tg) 566 { 567 if (tg->flags & THROTL_TG_PENDING) { 568 struct throtl_service_queue *parent_sq = 569 tg->service_queue.parent_sq; 570 571 throtl_rb_erase(&tg->rb_node, parent_sq); 572 --parent_sq->nr_pending; 573 tg->flags &= ~THROTL_TG_PENDING; 574 } 575 } 576 577 /* Call with queue lock held */ 578 static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, 579 unsigned long expires) 580 { 581 unsigned long max_expire = jiffies + 8 * sq_to_td(sq)->throtl_slice; 582 583 /* 584 * Since we are adjusting the throttle limit dynamically, the sleep 585 * time calculated according to previous limit might be invalid. It's 586 * possible the cgroup sleep time is very long and no other cgroups 587 * have IO running so notify the limit changes. Make sure the cgroup 588 * doesn't sleep too long to avoid the missed notification. 589 */ 590 if (time_after(expires, max_expire)) 591 expires = max_expire; 592 mod_timer(&sq->pending_timer, expires); 593 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", 594 expires - jiffies, jiffies); 595 } 596 597 /** 598 * throtl_schedule_next_dispatch - schedule the next dispatch cycle 599 * @sq: the service_queue to schedule dispatch for 600 * @force: force scheduling 601 * 602 * Arm @sq->pending_timer so that the next dispatch cycle starts on the 603 * dispatch time of the first pending child. Returns %true if either timer 604 * is armed or there's no pending child left. %false if the current 605 * dispatch window is still open and the caller should continue 606 * dispatching. 607 * 608 * If @force is %true, the dispatch timer is always scheduled and this 609 * function is guaranteed to return %true. This is to be used when the 610 * caller can't dispatch itself and needs to invoke pending_timer 611 * unconditionally. Note that forced scheduling is likely to induce short 612 * delay before dispatch starts even if @sq->first_pending_disptime is not 613 * in the future and thus shouldn't be used in hot paths. 614 */ 615 static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, 616 bool force) 617 { 618 /* any pending children left? */ 619 if (!sq->nr_pending) 620 return true; 621 622 update_min_dispatch_time(sq); 623 624 /* is the next dispatch time in the future? */ 625 if (force || time_after(sq->first_pending_disptime, jiffies)) { 626 throtl_schedule_pending_timer(sq, sq->first_pending_disptime); 627 return true; 628 } 629 630 /* tell the caller to continue dispatching */ 631 return false; 632 } 633 634 static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, 635 bool rw, unsigned long start) 636 { 637 tg->bytes_disp[rw] = 0; 638 tg->io_disp[rw] = 0; 639 tg->carryover_bytes[rw] = 0; 640 tg->carryover_ios[rw] = 0; 641 642 /* 643 * Previous slice has expired. We must have trimmed it after last 644 * bio dispatch. That means since start of last slice, we never used 645 * that bandwidth. 
Do try to make use of that bandwidth while giving 646 * credit. 647 */ 648 if (time_after(start, tg->slice_start[rw])) 649 tg->slice_start[rw] = start; 650 651 tg->slice_end[rw] = jiffies + tg->td->throtl_slice; 652 throtl_log(&tg->service_queue, 653 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", 654 rw == READ ? 'R' : 'W', tg->slice_start[rw], 655 tg->slice_end[rw], jiffies); 656 } 657 658 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw, 659 bool clear_carryover) 660 { 661 tg->bytes_disp[rw] = 0; 662 tg->io_disp[rw] = 0; 663 tg->slice_start[rw] = jiffies; 664 tg->slice_end[rw] = jiffies + tg->td->throtl_slice; 665 if (clear_carryover) { 666 tg->carryover_bytes[rw] = 0; 667 tg->carryover_ios[rw] = 0; 668 } 669 670 throtl_log(&tg->service_queue, 671 "[%c] new slice start=%lu end=%lu jiffies=%lu", 672 rw == READ ? 'R' : 'W', tg->slice_start[rw], 673 tg->slice_end[rw], jiffies); 674 } 675 676 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, 677 unsigned long jiffy_end) 678 { 679 tg->slice_end[rw] = roundup(jiffy_end, tg->td->throtl_slice); 680 } 681 682 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, 683 unsigned long jiffy_end) 684 { 685 throtl_set_slice_end(tg, rw, jiffy_end); 686 throtl_log(&tg->service_queue, 687 "[%c] extend slice start=%lu end=%lu jiffies=%lu", 688 rw == READ ? 'R' : 'W', tg->slice_start[rw], 689 tg->slice_end[rw], jiffies); 690 } 691 692 /* Determine if previously allocated or extended slice is complete or not */ 693 static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 694 { 695 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 696 return false; 697 698 return true; 699 } 700 701 /* Trim the used slices and adjust slice start accordingly */ 702 static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) 703 { 704 unsigned long nr_slices, time_elapsed, io_trim; 705 u64 bytes_trim, tmp; 706 707 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); 708 709 /* 710 * If bps are unlimited (-1), then time slice don't get 711 * renewed. Don't try to trim the slice if slice is used. A new 712 * slice will start when appropriate. 713 */ 714 if (throtl_slice_used(tg, rw)) 715 return; 716 717 /* 718 * A bio has been dispatched. Also adjust slice_end. It might happen 719 * that initially cgroup limit was very low resulting in high 720 * slice_end, but later limit was bumped up and bio was dispatched 721 * sooner, then we need to reduce slice_end. A high bogus slice_end 722 * is bad because it does not allow new slice to start. 723 */ 724 725 throtl_set_slice_end(tg, rw, jiffies + tg->td->throtl_slice); 726 727 time_elapsed = jiffies - tg->slice_start[rw]; 728 729 nr_slices = time_elapsed / tg->td->throtl_slice; 730 731 if (!nr_slices) 732 return; 733 tmp = tg_bps_limit(tg, rw) * tg->td->throtl_slice * nr_slices; 734 do_div(tmp, HZ); 735 bytes_trim = tmp; 736 737 io_trim = (tg_iops_limit(tg, rw) * tg->td->throtl_slice * nr_slices) / 738 HZ; 739 740 if (!bytes_trim && !io_trim) 741 return; 742 743 if (tg->bytes_disp[rw] >= bytes_trim) 744 tg->bytes_disp[rw] -= bytes_trim; 745 else 746 tg->bytes_disp[rw] = 0; 747 748 if (tg->io_disp[rw] >= io_trim) 749 tg->io_disp[rw] -= io_trim; 750 else 751 tg->io_disp[rw] = 0; 752 753 tg->slice_start[rw] += nr_slices * tg->td->throtl_slice; 754 755 throtl_log(&tg->service_queue, 756 "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", 757 rw == READ ? 
'R' : 'W', nr_slices, bytes_trim, io_trim, 758 tg->slice_start[rw], tg->slice_end[rw], jiffies); 759 } 760 761 static unsigned int calculate_io_allowed(u32 iops_limit, 762 unsigned long jiffy_elapsed) 763 { 764 unsigned int io_allowed; 765 u64 tmp; 766 767 /* 768 * jiffy_elapsed should not be a big value as minimum iops can be 769 * 1 then at max jiffy elapsed should be equivalent of 1 second as we 770 * will allow dispatch after 1 second and after that slice should 771 * have been trimmed. 772 */ 773 774 tmp = (u64)iops_limit * jiffy_elapsed; 775 do_div(tmp, HZ); 776 777 if (tmp > UINT_MAX) 778 io_allowed = UINT_MAX; 779 else 780 io_allowed = tmp; 781 782 return io_allowed; 783 } 784 785 static u64 calculate_bytes_allowed(u64 bps_limit, unsigned long jiffy_elapsed) 786 { 787 return mul_u64_u64_div_u64(bps_limit, (u64)jiffy_elapsed, (u64)HZ); 788 } 789 790 static void __tg_update_carryover(struct throtl_grp *tg, bool rw) 791 { 792 unsigned long jiffy_elapsed = jiffies - tg->slice_start[rw]; 793 u64 bps_limit = tg_bps_limit(tg, rw); 794 u32 iops_limit = tg_iops_limit(tg, rw); 795 796 /* 797 * If config is updated while bios are still throttled, calculate and 798 * accumulate how many bytes/ios are waited across changes. And 799 * carryover_bytes/ios will be used to calculate new wait time under new 800 * configuration. 801 */ 802 if (bps_limit != U64_MAX) 803 tg->carryover_bytes[rw] += 804 calculate_bytes_allowed(bps_limit, jiffy_elapsed) - 805 tg->bytes_disp[rw]; 806 if (iops_limit != UINT_MAX) 807 tg->carryover_ios[rw] += 808 calculate_io_allowed(iops_limit, jiffy_elapsed) - 809 tg->io_disp[rw]; 810 } 811 812 static void tg_update_carryover(struct throtl_grp *tg) 813 { 814 if (tg->service_queue.nr_queued[READ]) 815 __tg_update_carryover(tg, READ); 816 if (tg->service_queue.nr_queued[WRITE]) 817 __tg_update_carryover(tg, WRITE); 818 819 /* see comments in struct throtl_grp for meaning of these fields. */ 820 throtl_log(&tg->service_queue, "%s: %llu %llu %u %u\n", __func__, 821 tg->carryover_bytes[READ], tg->carryover_bytes[WRITE], 822 tg->carryover_ios[READ], tg->carryover_ios[WRITE]); 823 } 824 825 static unsigned long tg_within_iops_limit(struct throtl_grp *tg, struct bio *bio, 826 u32 iops_limit) 827 { 828 bool rw = bio_data_dir(bio); 829 unsigned int io_allowed; 830 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 831 832 if (iops_limit == UINT_MAX) { 833 return 0; 834 } 835 836 jiffy_elapsed = jiffies - tg->slice_start[rw]; 837 838 /* Round up to the next throttle slice, wait time must be nonzero */ 839 jiffy_elapsed_rnd = roundup(jiffy_elapsed + 1, tg->td->throtl_slice); 840 io_allowed = calculate_io_allowed(iops_limit, jiffy_elapsed_rnd) + 841 tg->carryover_ios[rw]; 842 if (tg->io_disp[rw] + 1 <= io_allowed) { 843 return 0; 844 } 845 846 /* Calc approx time to dispatch */ 847 jiffy_wait = jiffy_elapsed_rnd - jiffy_elapsed; 848 return jiffy_wait; 849 } 850 851 static unsigned long tg_within_bps_limit(struct throtl_grp *tg, struct bio *bio, 852 u64 bps_limit) 853 { 854 bool rw = bio_data_dir(bio); 855 u64 bytes_allowed, extra_bytes; 856 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 857 unsigned int bio_size = throtl_bio_data_size(bio); 858 859 /* no need to throttle if this bio's bytes have been accounted */ 860 if (bps_limit == U64_MAX || bio_flagged(bio, BIO_BPS_THROTTLED)) { 861 return 0; 862 } 863 864 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 865 866 /* Slice has just started. 
Consider one slice interval */ 867 if (!jiffy_elapsed) 868 jiffy_elapsed_rnd = tg->td->throtl_slice; 869 870 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, tg->td->throtl_slice); 871 bytes_allowed = calculate_bytes_allowed(bps_limit, jiffy_elapsed_rnd) + 872 tg->carryover_bytes[rw]; 873 if (tg->bytes_disp[rw] + bio_size <= bytes_allowed) { 874 return 0; 875 } 876 877 /* Calc approx time to dispatch */ 878 extra_bytes = tg->bytes_disp[rw] + bio_size - bytes_allowed; 879 jiffy_wait = div64_u64(extra_bytes * HZ, bps_limit); 880 881 if (!jiffy_wait) 882 jiffy_wait = 1; 883 884 /* 885 * This wait time is without taking into consideration the rounding 886 * up we did. Add that time also. 887 */ 888 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); 889 return jiffy_wait; 890 } 891 892 /* 893 * Returns whether one can dispatch a bio or not. Also returns approx number 894 * of jiffies to wait before this bio is with-in IO rate and can be dispatched 895 */ 896 static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, 897 unsigned long *wait) 898 { 899 bool rw = bio_data_dir(bio); 900 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 901 u64 bps_limit = tg_bps_limit(tg, rw); 902 u32 iops_limit = tg_iops_limit(tg, rw); 903 904 /* 905 * Currently whole state machine of group depends on first bio 906 * queued in the group bio list. So one should not be calling 907 * this function with a different bio if there are other bios 908 * queued. 909 */ 910 BUG_ON(tg->service_queue.nr_queued[rw] && 911 bio != throtl_peek_queued(&tg->service_queue.queued[rw])); 912 913 /* If tg->bps = -1, then BW is unlimited */ 914 if ((bps_limit == U64_MAX && iops_limit == UINT_MAX) || 915 tg->flags & THROTL_TG_CANCELING) { 916 if (wait) 917 *wait = 0; 918 return true; 919 } 920 921 /* 922 * If previous slice expired, start a new one otherwise renew/extend 923 * existing slice to make sure it is at least throtl_slice interval 924 * long since now. New slice is started only for empty throttle group. 925 * If there is queued bio, that means there should be an active 926 * slice and it should be extended instead. 927 */ 928 if (throtl_slice_used(tg, rw) && !(tg->service_queue.nr_queued[rw])) 929 throtl_start_new_slice(tg, rw, true); 930 else { 931 if (time_before(tg->slice_end[rw], 932 jiffies + tg->td->throtl_slice)) 933 throtl_extend_slice(tg, rw, 934 jiffies + tg->td->throtl_slice); 935 } 936 937 bps_wait = tg_within_bps_limit(tg, bio, bps_limit); 938 iops_wait = tg_within_iops_limit(tg, bio, iops_limit); 939 if (bps_wait + iops_wait == 0) { 940 if (wait) 941 *wait = 0; 942 return true; 943 } 944 945 max_wait = max(bps_wait, iops_wait); 946 947 if (wait) 948 *wait = max_wait; 949 950 if (time_before(tg->slice_end[rw], jiffies + max_wait)) 951 throtl_extend_slice(tg, rw, jiffies + max_wait); 952 953 return false; 954 } 955 956 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 957 { 958 bool rw = bio_data_dir(bio); 959 unsigned int bio_size = throtl_bio_data_size(bio); 960 961 /* Charge the bio to the group */ 962 if (!bio_flagged(bio, BIO_BPS_THROTTLED)) { 963 tg->bytes_disp[rw] += bio_size; 964 tg->last_bytes_disp[rw] += bio_size; 965 } 966 967 tg->io_disp[rw]++; 968 tg->last_io_disp[rw]++; 969 } 970 971 /** 972 * throtl_add_bio_tg - add a bio to the specified throtl_grp 973 * @bio: bio to add 974 * @qn: qnode to use 975 * @tg: the target throtl_grp 976 * 977 * Add @bio to @tg's service_queue using @qn. If @qn is not specified, 978 * tg->qnode_on_self[] is used. 
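 * Callers that propagate a bio upwards pass the child's qnode_on_parent[]
 * instead (see tg_dispatch_one_bio()), which keeps bios from different
 * children in separate qnodes on the parent's queued[] list.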
979 */ 980 static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, 981 struct throtl_grp *tg) 982 { 983 struct throtl_service_queue *sq = &tg->service_queue; 984 bool rw = bio_data_dir(bio); 985 986 if (!qn) 987 qn = &tg->qnode_on_self[rw]; 988 989 /* 990 * If @tg doesn't currently have any bios queued in the same 991 * direction, queueing @bio can change when @tg should be 992 * dispatched. Mark that @tg was empty. This is automatically 993 * cleared on the next tg_update_disptime(). 994 */ 995 if (!sq->nr_queued[rw]) 996 tg->flags |= THROTL_TG_WAS_EMPTY; 997 998 throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); 999 1000 sq->nr_queued[rw]++; 1001 throtl_enqueue_tg(tg); 1002 } 1003 1004 static void tg_update_disptime(struct throtl_grp *tg) 1005 { 1006 struct throtl_service_queue *sq = &tg->service_queue; 1007 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; 1008 struct bio *bio; 1009 1010 bio = throtl_peek_queued(&sq->queued[READ]); 1011 if (bio) 1012 tg_may_dispatch(tg, bio, &read_wait); 1013 1014 bio = throtl_peek_queued(&sq->queued[WRITE]); 1015 if (bio) 1016 tg_may_dispatch(tg, bio, &write_wait); 1017 1018 min_wait = min(read_wait, write_wait); 1019 disptime = jiffies + min_wait; 1020 1021 /* Update dispatch time */ 1022 throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); 1023 tg->disptime = disptime; 1024 tg_service_queue_add(tg); 1025 1026 /* see throtl_add_bio_tg() */ 1027 tg->flags &= ~THROTL_TG_WAS_EMPTY; 1028 } 1029 1030 static void start_parent_slice_with_credit(struct throtl_grp *child_tg, 1031 struct throtl_grp *parent_tg, bool rw) 1032 { 1033 if (throtl_slice_used(parent_tg, rw)) { 1034 throtl_start_new_slice_with_credit(parent_tg, rw, 1035 child_tg->slice_start[rw]); 1036 } 1037 1038 } 1039 1040 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) 1041 { 1042 struct throtl_service_queue *sq = &tg->service_queue; 1043 struct throtl_service_queue *parent_sq = sq->parent_sq; 1044 struct throtl_grp *parent_tg = sq_to_tg(parent_sq); 1045 struct throtl_grp *tg_to_put = NULL; 1046 struct bio *bio; 1047 1048 /* 1049 * @bio is being transferred from @tg to @parent_sq. Popping a bio 1050 * from @tg may put its reference and @parent_sq might end up 1051 * getting released prematurely. Remember the tg to put and put it 1052 * after @bio is transferred to @parent_sq. 1053 */ 1054 bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); 1055 sq->nr_queued[rw]--; 1056 1057 throtl_charge_bio(tg, bio); 1058 1059 /* 1060 * If our parent is another tg, we just need to transfer @bio to 1061 * the parent using throtl_add_bio_tg(). If our parent is 1062 * @td->service_queue, @bio is ready to be issued. Put it on its 1063 * bio_lists[] and decrease total number queued. The caller is 1064 * responsible for issuing these bios. 
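 *
 * In either case the bio was already charged to @tg by throtl_charge_bio()
 * above, and when it moves to a parent tg the parent's slice is started from
 * the child's slice_start so that time already spent waiting in the child is
 * credited upwards (see start_parent_slice_with_credit()).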
1065 */ 1066 if (parent_tg) { 1067 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); 1068 start_parent_slice_with_credit(tg, parent_tg, rw); 1069 } else { 1070 bio_set_flag(bio, BIO_BPS_THROTTLED); 1071 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], 1072 &parent_sq->queued[rw]); 1073 BUG_ON(tg->td->nr_queued[rw] <= 0); 1074 tg->td->nr_queued[rw]--; 1075 } 1076 1077 throtl_trim_slice(tg, rw); 1078 1079 if (tg_to_put) 1080 blkg_put(tg_to_blkg(tg_to_put)); 1081 } 1082 1083 static int throtl_dispatch_tg(struct throtl_grp *tg) 1084 { 1085 struct throtl_service_queue *sq = &tg->service_queue; 1086 unsigned int nr_reads = 0, nr_writes = 0; 1087 unsigned int max_nr_reads = THROTL_GRP_QUANTUM * 3 / 4; 1088 unsigned int max_nr_writes = THROTL_GRP_QUANTUM - max_nr_reads; 1089 struct bio *bio; 1090 1091 /* Try to dispatch 75% READS and 25% WRITES */ 1092 1093 while ((bio = throtl_peek_queued(&sq->queued[READ])) && 1094 tg_may_dispatch(tg, bio, NULL)) { 1095 1096 tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1097 nr_reads++; 1098 1099 if (nr_reads >= max_nr_reads) 1100 break; 1101 } 1102 1103 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && 1104 tg_may_dispatch(tg, bio, NULL)) { 1105 1106 tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1107 nr_writes++; 1108 1109 if (nr_writes >= max_nr_writes) 1110 break; 1111 } 1112 1113 return nr_reads + nr_writes; 1114 } 1115 1116 static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) 1117 { 1118 unsigned int nr_disp = 0; 1119 1120 while (1) { 1121 struct throtl_grp *tg; 1122 struct throtl_service_queue *sq; 1123 1124 if (!parent_sq->nr_pending) 1125 break; 1126 1127 tg = throtl_rb_first(parent_sq); 1128 if (!tg) 1129 break; 1130 1131 if (time_before(jiffies, tg->disptime)) 1132 break; 1133 1134 nr_disp += throtl_dispatch_tg(tg); 1135 1136 sq = &tg->service_queue; 1137 if (sq->nr_queued[READ] || sq->nr_queued[WRITE]) 1138 tg_update_disptime(tg); 1139 else 1140 throtl_dequeue_tg(tg); 1141 1142 if (nr_disp >= THROTL_QUANTUM) 1143 break; 1144 } 1145 1146 return nr_disp; 1147 } 1148 1149 static bool throtl_can_upgrade(struct throtl_data *td, 1150 struct throtl_grp *this_tg); 1151 /** 1152 * throtl_pending_timer_fn - timer function for service_queue->pending_timer 1153 * @t: the pending_timer member of the throtl_service_queue being serviced 1154 * 1155 * This timer is armed when a child throtl_grp with active bio's become 1156 * pending and queued on the service_queue's pending_tree and expires when 1157 * the first child throtl_grp should be dispatched. This function 1158 * dispatches bio's from the children throtl_grps to the parent 1159 * service_queue. 1160 * 1161 * If the parent's parent is another throtl_grp, dispatching is propagated 1162 * by either arming its pending_timer or repeating dispatch directly. If 1163 * the top-level service_tree is reached, throtl_data->dispatch_work is 1164 * kicked so that the ready bio's are issued. 
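 *
 * In other words, a throttled bio climbs one level per dispatch: it is moved
 * onto its parent's queued[] list when its dispatch time arrives, and this
 * repeats until it reaches the top-level service_queue, at which point
 * dispatch_work is kicked to actually issue it.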
 */
static void throtl_pending_timer_fn(struct timer_list *t)
{
	struct throtl_service_queue *sq = from_timer(sq, t, pending_timer);
	struct throtl_grp *tg = sq_to_tg(sq);
	struct throtl_data *td = sq_to_td(sq);
	struct throtl_service_queue *parent_sq;
	struct request_queue *q;
	bool dispatched;
	int ret;

	/* throtl_data may be gone, so figure out request queue by blkg */
	if (tg)
		q = tg->pd.blkg->q;
	else
		q = td->queue;

	spin_lock_irq(&q->queue_lock);

	if (!q->root_blkg)
		goto out_unlock;

	if (throtl_can_upgrade(td, NULL))
		throtl_upgrade_state(td);

again:
	parent_sq = sq->parent_sq;
	dispatched = false;

	while (true) {
		throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u",
			   sq->nr_queued[READ] + sq->nr_queued[WRITE],
			   sq->nr_queued[READ], sq->nr_queued[WRITE]);

		ret = throtl_select_dispatch(sq);
		if (ret) {
			throtl_log(sq, "bios disp=%u", ret);
			dispatched = true;
		}

		if (throtl_schedule_next_dispatch(sq, false))
			break;

		/* this dispatch window is still open, relax and repeat */
		spin_unlock_irq(&q->queue_lock);
		cpu_relax();
		spin_lock_irq(&q->queue_lock);
	}

	if (!dispatched)
		goto out_unlock;

	if (parent_sq) {
		/* @parent_sq is another throtl_grp, propagate dispatch */
		if (tg->flags & THROTL_TG_WAS_EMPTY) {
			tg_update_disptime(tg);
			if (!throtl_schedule_next_dispatch(parent_sq, false)) {
				/* window is already open, repeat dispatching */
				sq = parent_sq;
				tg = sq_to_tg(sq);
				goto again;
			}
		}
	} else {
		/* reached the top-level, queue issuing */
		queue_work(kthrotld_workqueue, &td->dispatch_work);
	}
out_unlock:
	spin_unlock_irq(&q->queue_lock);
}

/**
 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work
 * @work: work item being executed
 *
 * This function is queued for execution when bios reach the bio_lists[]
 * of throtl_data->service_queue. Those bios are ready and issued by this
 * function.
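 *
 * The bios are spliced onto an on-stack bio_list while holding the
 * queue_lock and are then submitted outside the lock under a block plug.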
1243 */ 1244 static void blk_throtl_dispatch_work_fn(struct work_struct *work) 1245 { 1246 struct throtl_data *td = container_of(work, struct throtl_data, 1247 dispatch_work); 1248 struct throtl_service_queue *td_sq = &td->service_queue; 1249 struct request_queue *q = td->queue; 1250 struct bio_list bio_list_on_stack; 1251 struct bio *bio; 1252 struct blk_plug plug; 1253 int rw; 1254 1255 bio_list_init(&bio_list_on_stack); 1256 1257 spin_lock_irq(&q->queue_lock); 1258 for (rw = READ; rw <= WRITE; rw++) 1259 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) 1260 bio_list_add(&bio_list_on_stack, bio); 1261 spin_unlock_irq(&q->queue_lock); 1262 1263 if (!bio_list_empty(&bio_list_on_stack)) { 1264 blk_start_plug(&plug); 1265 while ((bio = bio_list_pop(&bio_list_on_stack))) 1266 submit_bio_noacct_nocheck(bio); 1267 blk_finish_plug(&plug); 1268 } 1269 } 1270 1271 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 1272 int off) 1273 { 1274 struct throtl_grp *tg = pd_to_tg(pd); 1275 u64 v = *(u64 *)((void *)tg + off); 1276 1277 if (v == U64_MAX) 1278 return 0; 1279 return __blkg_prfill_u64(sf, pd, v); 1280 } 1281 1282 static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, 1283 int off) 1284 { 1285 struct throtl_grp *tg = pd_to_tg(pd); 1286 unsigned int v = *(unsigned int *)((void *)tg + off); 1287 1288 if (v == UINT_MAX) 1289 return 0; 1290 return __blkg_prfill_u64(sf, pd, v); 1291 } 1292 1293 static int tg_print_conf_u64(struct seq_file *sf, void *v) 1294 { 1295 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, 1296 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1297 return 0; 1298 } 1299 1300 static int tg_print_conf_uint(struct seq_file *sf, void *v) 1301 { 1302 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, 1303 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1304 return 0; 1305 } 1306 1307 static void tg_conf_updated(struct throtl_grp *tg, bool global) 1308 { 1309 struct throtl_service_queue *sq = &tg->service_queue; 1310 struct cgroup_subsys_state *pos_css; 1311 struct blkcg_gq *blkg; 1312 1313 throtl_log(&tg->service_queue, 1314 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", 1315 tg_bps_limit(tg, READ), tg_bps_limit(tg, WRITE), 1316 tg_iops_limit(tg, READ), tg_iops_limit(tg, WRITE)); 1317 1318 /* 1319 * Update has_rules[] flags for the updated tg's subtree. A tg is 1320 * considered to have rules if either the tg itself or any of its 1321 * ancestors has rules. This identifies groups without any 1322 * restrictions in the whole hierarchy and allows them to bypass 1323 * blk-throttle. 1324 */ 1325 blkg_for_each_descendant_pre(blkg, pos_css, 1326 global ? tg->td->queue->root_blkg : tg_to_blkg(tg)) { 1327 struct throtl_grp *this_tg = blkg_to_tg(blkg); 1328 struct throtl_grp *parent_tg; 1329 1330 tg_update_has_rules(this_tg); 1331 /* ignore root/second level */ 1332 if (!cgroup_subsys_on_dfl(io_cgrp_subsys) || !blkg->parent || 1333 !blkg->parent->parent) 1334 continue; 1335 parent_tg = blkg_to_tg(blkg->parent); 1336 /* 1337 * make sure all children has lower idle time threshold and 1338 * higher latency target 1339 */ 1340 this_tg->idletime_threshold = min(this_tg->idletime_threshold, 1341 parent_tg->idletime_threshold); 1342 this_tg->latency_target = max(this_tg->latency_target, 1343 parent_tg->latency_target); 1344 } 1345 1346 /* 1347 * We're already holding queue_lock and know @tg is valid. Let's 1348 * apply the new config directly. 
1349 * 1350 * Restart the slices for both READ and WRITES. It might happen 1351 * that a group's limit are dropped suddenly and we don't want to 1352 * account recently dispatched IO with new low rate. 1353 */ 1354 throtl_start_new_slice(tg, READ, false); 1355 throtl_start_new_slice(tg, WRITE, false); 1356 1357 if (tg->flags & THROTL_TG_PENDING) { 1358 tg_update_disptime(tg); 1359 throtl_schedule_next_dispatch(sq->parent_sq, true); 1360 } 1361 } 1362 1363 static ssize_t tg_set_conf(struct kernfs_open_file *of, 1364 char *buf, size_t nbytes, loff_t off, bool is_u64) 1365 { 1366 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1367 struct blkg_conf_ctx ctx; 1368 struct throtl_grp *tg; 1369 int ret; 1370 u64 v; 1371 1372 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 1373 if (ret) 1374 return ret; 1375 1376 ret = -EINVAL; 1377 if (sscanf(ctx.body, "%llu", &v) != 1) 1378 goto out_finish; 1379 if (!v) 1380 v = U64_MAX; 1381 1382 tg = blkg_to_tg(ctx.blkg); 1383 tg_update_carryover(tg); 1384 1385 if (is_u64) 1386 *(u64 *)((void *)tg + of_cft(of)->private) = v; 1387 else 1388 *(unsigned int *)((void *)tg + of_cft(of)->private) = v; 1389 1390 tg_conf_updated(tg, false); 1391 ret = 0; 1392 out_finish: 1393 blkg_conf_finish(&ctx); 1394 return ret ?: nbytes; 1395 } 1396 1397 static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, 1398 char *buf, size_t nbytes, loff_t off) 1399 { 1400 return tg_set_conf(of, buf, nbytes, off, true); 1401 } 1402 1403 static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, 1404 char *buf, size_t nbytes, loff_t off) 1405 { 1406 return tg_set_conf(of, buf, nbytes, off, false); 1407 } 1408 1409 static int tg_print_rwstat(struct seq_file *sf, void *v) 1410 { 1411 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1412 blkg_prfill_rwstat, &blkcg_policy_throtl, 1413 seq_cft(sf)->private, true); 1414 return 0; 1415 } 1416 1417 static u64 tg_prfill_rwstat_recursive(struct seq_file *sf, 1418 struct blkg_policy_data *pd, int off) 1419 { 1420 struct blkg_rwstat_sample sum; 1421 1422 blkg_rwstat_recursive_sum(pd_to_blkg(pd), &blkcg_policy_throtl, off, 1423 &sum); 1424 return __blkg_prfill_rwstat(sf, pd, &sum); 1425 } 1426 1427 static int tg_print_rwstat_recursive(struct seq_file *sf, void *v) 1428 { 1429 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), 1430 tg_prfill_rwstat_recursive, &blkcg_policy_throtl, 1431 seq_cft(sf)->private, true); 1432 return 0; 1433 } 1434 1435 static struct cftype throtl_legacy_files[] = { 1436 { 1437 .name = "throttle.read_bps_device", 1438 .private = offsetof(struct throtl_grp, bps[READ][LIMIT_MAX]), 1439 .seq_show = tg_print_conf_u64, 1440 .write = tg_set_conf_u64, 1441 }, 1442 { 1443 .name = "throttle.write_bps_device", 1444 .private = offsetof(struct throtl_grp, bps[WRITE][LIMIT_MAX]), 1445 .seq_show = tg_print_conf_u64, 1446 .write = tg_set_conf_u64, 1447 }, 1448 { 1449 .name = "throttle.read_iops_device", 1450 .private = offsetof(struct throtl_grp, iops[READ][LIMIT_MAX]), 1451 .seq_show = tg_print_conf_uint, 1452 .write = tg_set_conf_uint, 1453 }, 1454 { 1455 .name = "throttle.write_iops_device", 1456 .private = offsetof(struct throtl_grp, iops[WRITE][LIMIT_MAX]), 1457 .seq_show = tg_print_conf_uint, 1458 .write = tg_set_conf_uint, 1459 }, 1460 { 1461 .name = "throttle.io_service_bytes", 1462 .private = offsetof(struct throtl_grp, stat_bytes), 1463 .seq_show = tg_print_rwstat, 1464 }, 1465 { 1466 .name = "throttle.io_service_bytes_recursive", 1467 .private = offsetof(struct throtl_grp, stat_bytes), 1468 .seq_show = 
tg_print_rwstat_recursive, 1469 }, 1470 { 1471 .name = "throttle.io_serviced", 1472 .private = offsetof(struct throtl_grp, stat_ios), 1473 .seq_show = tg_print_rwstat, 1474 }, 1475 { 1476 .name = "throttle.io_serviced_recursive", 1477 .private = offsetof(struct throtl_grp, stat_ios), 1478 .seq_show = tg_print_rwstat_recursive, 1479 }, 1480 { } /* terminate */ 1481 }; 1482 1483 static u64 tg_prfill_limit(struct seq_file *sf, struct blkg_policy_data *pd, 1484 int off) 1485 { 1486 struct throtl_grp *tg = pd_to_tg(pd); 1487 const char *dname = blkg_dev_name(pd->blkg); 1488 char bufs[4][21] = { "max", "max", "max", "max" }; 1489 u64 bps_dft; 1490 unsigned int iops_dft; 1491 char idle_time[26] = ""; 1492 char latency_time[26] = ""; 1493 1494 if (!dname) 1495 return 0; 1496 1497 if (off == LIMIT_LOW) { 1498 bps_dft = 0; 1499 iops_dft = 0; 1500 } else { 1501 bps_dft = U64_MAX; 1502 iops_dft = UINT_MAX; 1503 } 1504 1505 if (tg->bps_conf[READ][off] == bps_dft && 1506 tg->bps_conf[WRITE][off] == bps_dft && 1507 tg->iops_conf[READ][off] == iops_dft && 1508 tg->iops_conf[WRITE][off] == iops_dft && 1509 (off != LIMIT_LOW || 1510 (tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD && 1511 tg->latency_target_conf == DFL_LATENCY_TARGET))) 1512 return 0; 1513 1514 if (tg->bps_conf[READ][off] != U64_MAX) 1515 snprintf(bufs[0], sizeof(bufs[0]), "%llu", 1516 tg->bps_conf[READ][off]); 1517 if (tg->bps_conf[WRITE][off] != U64_MAX) 1518 snprintf(bufs[1], sizeof(bufs[1]), "%llu", 1519 tg->bps_conf[WRITE][off]); 1520 if (tg->iops_conf[READ][off] != UINT_MAX) 1521 snprintf(bufs[2], sizeof(bufs[2]), "%u", 1522 tg->iops_conf[READ][off]); 1523 if (tg->iops_conf[WRITE][off] != UINT_MAX) 1524 snprintf(bufs[3], sizeof(bufs[3]), "%u", 1525 tg->iops_conf[WRITE][off]); 1526 if (off == LIMIT_LOW) { 1527 if (tg->idletime_threshold_conf == ULONG_MAX) 1528 strcpy(idle_time, " idle=max"); 1529 else 1530 snprintf(idle_time, sizeof(idle_time), " idle=%lu", 1531 tg->idletime_threshold_conf); 1532 1533 if (tg->latency_target_conf == ULONG_MAX) 1534 strcpy(latency_time, " latency=max"); 1535 else 1536 snprintf(latency_time, sizeof(latency_time), 1537 " latency=%lu", tg->latency_target_conf); 1538 } 1539 1540 seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s%s%s\n", 1541 dname, bufs[0], bufs[1], bufs[2], bufs[3], idle_time, 1542 latency_time); 1543 return 0; 1544 } 1545 1546 static int tg_print_limit(struct seq_file *sf, void *v) 1547 { 1548 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_limit, 1549 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1550 return 0; 1551 } 1552 1553 static ssize_t tg_set_limit(struct kernfs_open_file *of, 1554 char *buf, size_t nbytes, loff_t off) 1555 { 1556 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1557 struct blkg_conf_ctx ctx; 1558 struct throtl_grp *tg; 1559 u64 v[4]; 1560 unsigned long idle_time; 1561 unsigned long latency_time; 1562 int ret; 1563 int index = of_cft(of)->private; 1564 1565 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 1566 if (ret) 1567 return ret; 1568 1569 tg = blkg_to_tg(ctx.blkg); 1570 tg_update_carryover(tg); 1571 1572 v[0] = tg->bps_conf[READ][index]; 1573 v[1] = tg->bps_conf[WRITE][index]; 1574 v[2] = tg->iops_conf[READ][index]; 1575 v[3] = tg->iops_conf[WRITE][index]; 1576 1577 idle_time = tg->idletime_threshold_conf; 1578 latency_time = tg->latency_target_conf; 1579 while (true) { 1580 char tok[27]; /* wiops=18446744073709551616 */ 1581 char *p; 1582 u64 val = U64_MAX; 1583 int len; 1584 1585 if (sscanf(ctx.body, "%26s%n", tok, &len) != 
1) 1586 break; 1587 if (tok[0] == '\0') 1588 break; 1589 ctx.body += len; 1590 1591 ret = -EINVAL; 1592 p = tok; 1593 strsep(&p, "="); 1594 if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) 1595 goto out_finish; 1596 1597 ret = -ERANGE; 1598 if (!val) 1599 goto out_finish; 1600 1601 ret = -EINVAL; 1602 if (!strcmp(tok, "rbps") && val > 1) 1603 v[0] = val; 1604 else if (!strcmp(tok, "wbps") && val > 1) 1605 v[1] = val; 1606 else if (!strcmp(tok, "riops") && val > 1) 1607 v[2] = min_t(u64, val, UINT_MAX); 1608 else if (!strcmp(tok, "wiops") && val > 1) 1609 v[3] = min_t(u64, val, UINT_MAX); 1610 else if (off == LIMIT_LOW && !strcmp(tok, "idle")) 1611 idle_time = val; 1612 else if (off == LIMIT_LOW && !strcmp(tok, "latency")) 1613 latency_time = val; 1614 else 1615 goto out_finish; 1616 } 1617 1618 tg->bps_conf[READ][index] = v[0]; 1619 tg->bps_conf[WRITE][index] = v[1]; 1620 tg->iops_conf[READ][index] = v[2]; 1621 tg->iops_conf[WRITE][index] = v[3]; 1622 1623 if (index == LIMIT_MAX) { 1624 tg->bps[READ][index] = v[0]; 1625 tg->bps[WRITE][index] = v[1]; 1626 tg->iops[READ][index] = v[2]; 1627 tg->iops[WRITE][index] = v[3]; 1628 } 1629 tg->bps[READ][LIMIT_LOW] = min(tg->bps_conf[READ][LIMIT_LOW], 1630 tg->bps_conf[READ][LIMIT_MAX]); 1631 tg->bps[WRITE][LIMIT_LOW] = min(tg->bps_conf[WRITE][LIMIT_LOW], 1632 tg->bps_conf[WRITE][LIMIT_MAX]); 1633 tg->iops[READ][LIMIT_LOW] = min(tg->iops_conf[READ][LIMIT_LOW], 1634 tg->iops_conf[READ][LIMIT_MAX]); 1635 tg->iops[WRITE][LIMIT_LOW] = min(tg->iops_conf[WRITE][LIMIT_LOW], 1636 tg->iops_conf[WRITE][LIMIT_MAX]); 1637 tg->idletime_threshold_conf = idle_time; 1638 tg->latency_target_conf = latency_time; 1639 1640 /* force user to configure all settings for low limit */ 1641 if (!(tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW] || 1642 tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW]) || 1643 tg->idletime_threshold_conf == DFL_IDLE_THRESHOLD || 1644 tg->latency_target_conf == DFL_LATENCY_TARGET) { 1645 tg->bps[READ][LIMIT_LOW] = 0; 1646 tg->bps[WRITE][LIMIT_LOW] = 0; 1647 tg->iops[READ][LIMIT_LOW] = 0; 1648 tg->iops[WRITE][LIMIT_LOW] = 0; 1649 tg->idletime_threshold = DFL_IDLE_THRESHOLD; 1650 tg->latency_target = DFL_LATENCY_TARGET; 1651 } else if (index == LIMIT_LOW) { 1652 tg->idletime_threshold = tg->idletime_threshold_conf; 1653 tg->latency_target = tg->latency_target_conf; 1654 } 1655 1656 blk_throtl_update_limit_valid(tg->td); 1657 if (tg->td->limit_valid[LIMIT_LOW]) { 1658 if (index == LIMIT_LOW) 1659 tg->td->limit_index = LIMIT_LOW; 1660 } else 1661 tg->td->limit_index = LIMIT_MAX; 1662 tg_conf_updated(tg, index == LIMIT_LOW && 1663 tg->td->limit_valid[LIMIT_LOW]); 1664 ret = 0; 1665 out_finish: 1666 blkg_conf_finish(&ctx); 1667 return ret ?: nbytes; 1668 } 1669 1670 static struct cftype throtl_files[] = { 1671 #ifdef CONFIG_BLK_DEV_THROTTLING_LOW 1672 { 1673 .name = "low", 1674 .flags = CFTYPE_NOT_ON_ROOT, 1675 .seq_show = tg_print_limit, 1676 .write = tg_set_limit, 1677 .private = LIMIT_LOW, 1678 }, 1679 #endif 1680 { 1681 .name = "max", 1682 .flags = CFTYPE_NOT_ON_ROOT, 1683 .seq_show = tg_print_limit, 1684 .write = tg_set_limit, 1685 .private = LIMIT_MAX, 1686 }, 1687 { } /* terminate */ 1688 }; 1689 1690 static void throtl_shutdown_wq(struct request_queue *q) 1691 { 1692 struct throtl_data *td = q->td; 1693 1694 cancel_work_sync(&td->dispatch_work); 1695 } 1696 1697 struct blkcg_policy blkcg_policy_throtl = { 1698 .dfl_cftypes = throtl_files, 1699 .legacy_cftypes = throtl_legacy_files, 1700 1701 .pd_alloc_fn = 
throtl_pd_alloc,
	.pd_init_fn = throtl_pd_init,
	.pd_online_fn = throtl_pd_online,
	.pd_offline_fn = throtl_pd_offline,
	.pd_free_fn = throtl_pd_free,
};

void blk_throtl_cancel_bios(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct cgroup_subsys_state *pos_css;
	struct blkcg_gq *blkg;

	spin_lock_irq(&q->queue_lock);
	/*
	 * queue_lock is held, so the rcu lock is not technically needed here.
	 * However, it is still taken to emphasize that the following path
	 * needs RCU protection and to prevent a lockdep warning.
	 */
	rcu_read_lock();
	blkg_for_each_descendant_post(blkg, pos_css, q->root_blkg) {
		struct throtl_grp *tg = blkg_to_tg(blkg);
		struct throtl_service_queue *sq = &tg->service_queue;

		/*
		 * Set the flag to make sure throtl_pending_timer_fn() won't
		 * stop until all throttled bios are dispatched.
		 */
		tg->flags |= THROTL_TG_CANCELING;

		/*
		 * Do not dispatch a cgroup without THROTL_TG_PENDING, or the
		 * cgroup will be inserted into the service queue without
		 * THROTL_TG_PENDING set in tg_update_disptime() below. IO
		 * dispatched from a child in tg_dispatch_one_bio() would then
		 * trigger a double insertion and corrupt the tree.
		 */
		if (!(tg->flags & THROTL_TG_PENDING))
			continue;

		/*
		 * Update disptime after setting the above flag to make sure
		 * throtl_select_dispatch() won't exit without dispatching.
		 */
		tg_update_disptime(tg);

		throtl_schedule_pending_timer(sq, jiffies + 1);
	}
	rcu_read_unlock();
	spin_unlock_irq(&q->queue_lock);
}

#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static unsigned long __tg_last_low_overflow_time(struct throtl_grp *tg)
{
	unsigned long rtime = jiffies, wtime = jiffies;

	if (tg->bps[READ][LIMIT_LOW] || tg->iops[READ][LIMIT_LOW])
		rtime = tg->last_low_overflow_time[READ];
	if (tg->bps[WRITE][LIMIT_LOW] || tg->iops[WRITE][LIMIT_LOW])
		wtime = tg->last_low_overflow_time[WRITE];
	return min(rtime, wtime);
}

static unsigned long tg_last_low_overflow_time(struct throtl_grp *tg)
{
	struct throtl_service_queue *parent_sq;
	struct throtl_grp *parent = tg;
	unsigned long ret = __tg_last_low_overflow_time(tg);

	while (true) {
		parent_sq = parent->service_queue.parent_sq;
		parent = sq_to_tg(parent_sq);
		if (!parent)
			break;

		/*
		 * The parent doesn't have a low limit, so it always reaches
		 * its low limit. Its overflow time is useless for children.
		 */
		if (!parent->bps[READ][LIMIT_LOW] &&
		    !parent->iops[READ][LIMIT_LOW] &&
		    !parent->bps[WRITE][LIMIT_LOW] &&
		    !parent->iops[WRITE][LIMIT_LOW])
			continue;
		if (time_after(__tg_last_low_overflow_time(parent), ret))
			ret = __tg_last_low_overflow_time(parent);
	}
	return ret;
}

static bool throtl_tg_is_idle(struct throtl_grp *tg)
{
	/*
	 * cgroup is idle if:
	 * - a single idle period is too long, longer than a fixed value (in
	 *   case the user configured an overly large threshold) or 4 times
	 *   the idletime threshold
	 * - average think time is more than the threshold
	 * - IO latency is largely below the threshold
	 */
	unsigned long time;
	bool ret;

	time = min_t(unsigned long, MAX_IDLE_TIME, 4 * tg->idletime_threshold);
	ret = tg->latency_target == DFL_LATENCY_TARGET ||
	      tg->idletime_threshold == DFL_IDLE_THRESHOLD ||
	      (ktime_get_ns() >> 10) - tg->last_finish_time > time ||
	      tg->avg_idletime > tg->idletime_threshold ||
	      (tg->latency_target && tg->bio_cnt &&
	       tg->bad_bio_cnt * 5 < tg->bio_cnt);
	throtl_log(&tg->service_queue,
		"avg_idle=%ld, idle_threshold=%ld, bad_bio=%d, total_bio=%d, is_idle=%d, scale=%d",
		tg->avg_idletime, tg->idletime_threshold, tg->bad_bio_cnt,
		tg->bio_cnt, ret, tg->td->scale);
	return ret;
}

static bool throtl_low_limit_reached(struct throtl_grp *tg, int rw)
{
	struct throtl_service_queue *sq = &tg->service_queue;
	bool limit = tg->bps[rw][LIMIT_LOW] || tg->iops[rw][LIMIT_LOW];

	/*
	 * If the low limit is zero, it is always considered reached.
	 * If the low limit is non-zero, we can check whether any request is
	 * queued to determine whether the low limit is reached, as requests
	 * are throttled according to the limit.
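	 *
	 * For example, with a 1MB/s read low limit, a queued read bio means
	 * throttling is already holding the group at its low rate, so the
	 * limit counts as reached; an empty queue means the group is issuing
	 * less than the low limit allows.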
1828 */ 1829 return !limit || sq->nr_queued[rw]; 1830 } 1831 1832 static bool throtl_tg_can_upgrade(struct throtl_grp *tg) 1833 { 1834 /* 1835 * cgroup reaches low limit when low limit of READ and WRITE are 1836 * both reached, it's ok to upgrade to next limit if cgroup reaches 1837 * low limit 1838 */ 1839 if (throtl_low_limit_reached(tg, READ) && 1840 throtl_low_limit_reached(tg, WRITE)) 1841 return true; 1842 1843 if (time_after_eq(jiffies, 1844 tg_last_low_overflow_time(tg) + tg->td->throtl_slice) && 1845 throtl_tg_is_idle(tg)) 1846 return true; 1847 return false; 1848 } 1849 1850 static bool throtl_hierarchy_can_upgrade(struct throtl_grp *tg) 1851 { 1852 while (true) { 1853 if (throtl_tg_can_upgrade(tg)) 1854 return true; 1855 tg = sq_to_tg(tg->service_queue.parent_sq); 1856 if (!tg || !tg_to_blkg(tg)->parent) 1857 return false; 1858 } 1859 return false; 1860 } 1861 1862 static bool throtl_can_upgrade(struct throtl_data *td, 1863 struct throtl_grp *this_tg) 1864 { 1865 struct cgroup_subsys_state *pos_css; 1866 struct blkcg_gq *blkg; 1867 1868 if (td->limit_index != LIMIT_LOW) 1869 return false; 1870 1871 if (time_before(jiffies, td->low_downgrade_time + td->throtl_slice)) 1872 return false; 1873 1874 rcu_read_lock(); 1875 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { 1876 struct throtl_grp *tg = blkg_to_tg(blkg); 1877 1878 if (tg == this_tg) 1879 continue; 1880 if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children)) 1881 continue; 1882 if (!throtl_hierarchy_can_upgrade(tg)) { 1883 rcu_read_unlock(); 1884 return false; 1885 } 1886 } 1887 rcu_read_unlock(); 1888 return true; 1889 } 1890 1891 static void throtl_upgrade_check(struct throtl_grp *tg) 1892 { 1893 unsigned long now = jiffies; 1894 1895 if (tg->td->limit_index != LIMIT_LOW) 1896 return; 1897 1898 if (time_after(tg->last_check_time + tg->td->throtl_slice, now)) 1899 return; 1900 1901 tg->last_check_time = now; 1902 1903 if (!time_after_eq(now, 1904 __tg_last_low_overflow_time(tg) + tg->td->throtl_slice)) 1905 return; 1906 1907 if (throtl_can_upgrade(tg->td, NULL)) 1908 throtl_upgrade_state(tg->td); 1909 } 1910 1911 static void throtl_upgrade_state(struct throtl_data *td) 1912 { 1913 struct cgroup_subsys_state *pos_css; 1914 struct blkcg_gq *blkg; 1915 1916 throtl_log(&td->service_queue, "upgrade to max"); 1917 td->limit_index = LIMIT_MAX; 1918 td->low_upgrade_time = jiffies; 1919 td->scale = 0; 1920 rcu_read_lock(); 1921 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) { 1922 struct throtl_grp *tg = blkg_to_tg(blkg); 1923 struct throtl_service_queue *sq = &tg->service_queue; 1924 1925 tg->disptime = jiffies - 1; 1926 throtl_select_dispatch(sq); 1927 throtl_schedule_next_dispatch(sq, true); 1928 } 1929 rcu_read_unlock(); 1930 throtl_select_dispatch(&td->service_queue); 1931 throtl_schedule_next_dispatch(&td->service_queue, true); 1932 queue_work(kthrotld_workqueue, &td->dispatch_work); 1933 } 1934 1935 static void throtl_downgrade_state(struct throtl_data *td) 1936 { 1937 td->scale /= 2; 1938 1939 throtl_log(&td->service_queue, "downgrade, scale %d", td->scale); 1940 if (td->scale) { 1941 td->low_upgrade_time = jiffies - td->scale * td->throtl_slice; 1942 return; 1943 } 1944 1945 td->limit_index = LIMIT_LOW; 1946 td->low_downgrade_time = jiffies; 1947 } 1948 1949 static bool throtl_tg_can_downgrade(struct throtl_grp *tg) 1950 { 1951 struct throtl_data *td = tg->td; 1952 unsigned long now = jiffies; 1953 1954 /* 1955 * If cgroup is below low limit, consider downgrade and throttle other 1956 
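/**
 * throtl_tg_can_downgrade - check whether @tg could go back to LIMIT_LOW
 * @tg: the throtl_grp of interest
 *
 * Return true if @tg has not reached its low limit for at least one
 * throtl_slice and is either not idle or is an inner node; per the comment
 * below, that is taken as the cue to downgrade and throttle the other
 * cgroups.
 */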
static bool throtl_tg_can_downgrade(struct throtl_grp *tg)
{
	struct throtl_data *td = tg->td;
	unsigned long now = jiffies;

	/*
	 * If cgroup is below low limit, consider downgrade and throttle other
	 * cgroups
	 */
	if (time_after_eq(now, tg_last_low_overflow_time(tg) +
			  td->throtl_slice) &&
	    (!throtl_tg_is_idle(tg) ||
	     !list_empty(&tg_to_blkg(tg)->blkcg->css.children)))
		return true;
	return false;
}

static bool throtl_hierarchy_can_downgrade(struct throtl_grp *tg)
{
	struct throtl_data *td = tg->td;

	if (time_before(jiffies, td->low_upgrade_time + td->throtl_slice))
		return false;

	while (true) {
		if (!throtl_tg_can_downgrade(tg))
			return false;
		tg = sq_to_tg(tg->service_queue.parent_sq);
		if (!tg || !tg_to_blkg(tg)->parent)
			break;
	}
	return true;
}

static void throtl_downgrade_check(struct throtl_grp *tg)
{
	uint64_t bps;
	unsigned int iops;
	unsigned long elapsed_time;
	unsigned long now = jiffies;

	if (tg->td->limit_index != LIMIT_MAX ||
	    !tg->td->limit_valid[LIMIT_LOW])
		return;
	if (!list_empty(&tg_to_blkg(tg)->blkcg->css.children))
		return;
	if (time_after(tg->last_check_time + tg->td->throtl_slice, now))
		return;

	elapsed_time = now - tg->last_check_time;
	tg->last_check_time = now;

	if (time_before(now, tg_last_low_overflow_time(tg) +
			tg->td->throtl_slice))
		return;

	if (tg->bps[READ][LIMIT_LOW]) {
		bps = tg->last_bytes_disp[READ] * HZ;
		do_div(bps, elapsed_time);
		if (bps >= tg->bps[READ][LIMIT_LOW])
			tg->last_low_overflow_time[READ] = now;
	}

	if (tg->bps[WRITE][LIMIT_LOW]) {
		bps = tg->last_bytes_disp[WRITE] * HZ;
		do_div(bps, elapsed_time);
		if (bps >= tg->bps[WRITE][LIMIT_LOW])
			tg->last_low_overflow_time[WRITE] = now;
	}

	if (tg->iops[READ][LIMIT_LOW]) {
		iops = tg->last_io_disp[READ] * HZ / elapsed_time;
		if (iops >= tg->iops[READ][LIMIT_LOW])
			tg->last_low_overflow_time[READ] = now;
	}

	if (tg->iops[WRITE][LIMIT_LOW]) {
		iops = tg->last_io_disp[WRITE] * HZ / elapsed_time;
		if (iops >= tg->iops[WRITE][LIMIT_LOW])
			tg->last_low_overflow_time[WRITE] = now;
	}

	/*
	 * If cgroup is below low limit, consider downgrade and throttle other
	 * cgroups
	 */
	if (throtl_hierarchy_can_downgrade(tg))
		throtl_downgrade_state(tg->td);

	tg->last_bytes_disp[READ] = 0;
	tg->last_bytes_disp[WRITE] = 0;
	tg->last_io_disp[READ] = 0;
	tg->last_io_disp[WRITE] = 0;
}

static void blk_throtl_update_idletime(struct throtl_grp *tg)
{
	unsigned long now;
	unsigned long last_finish_time = tg->last_finish_time;

	if (last_finish_time == 0)
		return;

	now = ktime_get_ns() >> 10;
	if (now <= last_finish_time ||
	    last_finish_time == tg->checked_last_finish_time)
		return;

	tg->avg_idletime = (tg->avg_idletime * 7 + now - last_finish_time) >> 3;
	tg->checked_last_finish_time = last_finish_time;
}
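/**
 * throtl_update_latency_buckets - refresh the per-size average latency table
 * @td: the throtl_data of interest
 *
 * At most once per second (and only for non-rotational queues with a valid
 * LIMIT_LOW configuration), fold the per-cpu latency samples into
 * td->tmp_buckets and, once a bucket has gathered at least 32 samples, feed
 * the new average into td->avg_buckets with an EWMA (7/8 old, 1/8 new).
 * The averages are kept non-decreasing across request-size buckets so a
 * sparsely sampled bucket never reports a lower latency than a smaller
 * request size does.
 */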
static void throtl_update_latency_buckets(struct throtl_data *td)
{
	struct avg_latency_bucket avg_latency[2][LATENCY_BUCKET_SIZE];
	int i, cpu, rw;
	unsigned long last_latency[2] = { 0 };
	unsigned long latency[2];

	if (!blk_queue_nonrot(td->queue) || !td->limit_valid[LIMIT_LOW])
		return;
	if (time_before(jiffies, td->last_calculate_time + HZ))
		return;
	td->last_calculate_time = jiffies;

	memset(avg_latency, 0, sizeof(avg_latency));
	for (rw = READ; rw <= WRITE; rw++) {
		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
			struct latency_bucket *tmp = &td->tmp_buckets[rw][i];

			for_each_possible_cpu(cpu) {
				struct latency_bucket *bucket;

				/* this isn't race free, but ok in practice */
				bucket = per_cpu_ptr(td->latency_buckets[rw],
						     cpu);
				tmp->total_latency += bucket[i].total_latency;
				tmp->samples += bucket[i].samples;
				bucket[i].total_latency = 0;
				bucket[i].samples = 0;
			}

			if (tmp->samples >= 32) {
				int samples = tmp->samples;

				latency[rw] = tmp->total_latency;

				tmp->total_latency = 0;
				tmp->samples = 0;
				latency[rw] /= samples;
				if (latency[rw] == 0)
					continue;
				avg_latency[rw][i].latency = latency[rw];
			}
		}
	}

	for (rw = READ; rw <= WRITE; rw++) {
		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
			if (!avg_latency[rw][i].latency) {
				if (td->avg_buckets[rw][i].latency < last_latency[rw])
					td->avg_buckets[rw][i].latency =
						last_latency[rw];
				continue;
			}

			if (!td->avg_buckets[rw][i].valid)
				latency[rw] = avg_latency[rw][i].latency;
			else
				latency[rw] = (td->avg_buckets[rw][i].latency * 7 +
					avg_latency[rw][i].latency) >> 3;

			td->avg_buckets[rw][i].latency = max(latency[rw],
				last_latency[rw]);
			td->avg_buckets[rw][i].valid = true;
			last_latency[rw] = td->avg_buckets[rw][i].latency;
		}
	}

	for (i = 0; i < LATENCY_BUCKET_SIZE; i++)
		throtl_log(&td->service_queue,
			"Latency bucket %d: read latency=%ld, read valid=%d, "
			"write latency=%ld, write valid=%d", i,
			td->avg_buckets[READ][i].latency,
			td->avg_buckets[READ][i].valid,
			td->avg_buckets[WRITE][i].latency,
			td->avg_buckets[WRITE][i].valid);
}
#else
static inline void throtl_update_latency_buckets(struct throtl_data *td)
{
}

static void blk_throtl_update_idletime(struct throtl_grp *tg)
{
}

static void throtl_downgrade_check(struct throtl_grp *tg)
{
}

static void throtl_upgrade_check(struct throtl_grp *tg)
{
}

static bool throtl_can_upgrade(struct throtl_data *td,
	struct throtl_grp *this_tg)
{
	return false;
}

static void throtl_upgrade_state(struct throtl_data *td)
{
}
#endif
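/**
 * __blk_throtl_bio - throttle @bio if it exceeds the configured limits
 * @bio: the bio being submitted
 *
 * Walk up from @bio's blkcg group, charging every level that still has
 * budget.  The bio is queued at the first level that already has bios
 * queued (to preserve FIFO ordering) or whose bps/iops limit it would
 * exceed, and true is returned; if every level has budget the bio passes
 * through and false is returned so the caller can issue it directly.
 */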
bool __blk_throtl_bio(struct bio *bio)
{
	struct request_queue *q = bdev_get_queue(bio->bi_bdev);
	struct blkcg_gq *blkg = bio->bi_blkg;
	struct throtl_qnode *qn = NULL;
	struct throtl_grp *tg = blkg_to_tg(blkg);
	struct throtl_service_queue *sq;
	bool rw = bio_data_dir(bio);
	bool throttled = false;
	struct throtl_data *td = tg->td;

	rcu_read_lock();

	if (!cgroup_subsys_on_dfl(io_cgrp_subsys)) {
		blkg_rwstat_add(&tg->stat_bytes, bio->bi_opf,
				bio->bi_iter.bi_size);
		blkg_rwstat_add(&tg->stat_ios, bio->bi_opf, 1);
	}

	spin_lock_irq(&q->queue_lock);

	throtl_update_latency_buckets(td);

	blk_throtl_update_idletime(tg);

	sq = &tg->service_queue;

again:
	while (true) {
		if (tg->last_low_overflow_time[rw] == 0)
			tg->last_low_overflow_time[rw] = jiffies;
		throtl_downgrade_check(tg);
		throtl_upgrade_check(tg);
		/* throtl is FIFO - if bios are already queued, should queue */
		if (sq->nr_queued[rw])
			break;

		/* if above limits, break to queue */
		if (!tg_may_dispatch(tg, bio, NULL)) {
			tg->last_low_overflow_time[rw] = jiffies;
			if (throtl_can_upgrade(td, tg)) {
				throtl_upgrade_state(td);
				goto again;
			}
			break;
		}

		/* within limits, let's charge and dispatch directly */
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim the slice even when bios are not being
		 * queued, otherwise it might happen that a bio is not queued
		 * for a long time and the slice keeps on extending while trim
		 * is not called for a long time.  If limits are then reduced
		 * suddenly, we take into account all the IO dispatched so far
		 * at the new low rate and newly queued IO gets a really long
		 * dispatch time.
		 *
		 * So keep on trimming the slice even if the bio is not queued.
		 */
		throtl_trim_slice(tg, rw);

		/*
		 * @bio passed through this layer without being throttled.
		 * Climb up the ladder.  If we're already at the top, it
		 * can be executed directly.
		 */
		qn = &tg->qnode_on_parent[rw];
		sq = sq->parent_sq;
		tg = sq_to_tg(sq);
		if (!tg) {
			bio_set_flag(bio, BIO_BPS_THROTTLED);
			goto out_unlock;
		}
	}

	/* out-of-limit, queue to @tg */
	throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d",
		   rw == READ ? 'R' : 'W',
		   tg->bytes_disp[rw], bio->bi_iter.bi_size,
		   tg_bps_limit(tg, rw),
		   tg->io_disp[rw], tg_iops_limit(tg, rw),
		   sq->nr_queued[READ], sq->nr_queued[WRITE]);

	tg->last_low_overflow_time[rw] = jiffies;

	td->nr_queued[rw]++;
	throtl_add_bio_tg(bio, qn, tg);
	throttled = true;

	/*
	 * Update @tg's dispatch time and force schedule dispatch if @tg
	 * was empty before @bio.  The forced scheduling isn't likely to
	 * cause undue delay as @bio is likely to be dispatched directly if
	 * its @tg's disptime is not in the future.
	 */
	if (tg->flags & THROTL_TG_WAS_EMPTY) {
		tg_update_disptime(tg);
		throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true);
	}

out_unlock:
#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
	if (throttled || !td->track_bio_latency)
		bio->bi_issue.value |= BIO_ISSUE_THROTL_SKIP_LATENCY;
#endif
	spin_unlock_irq(&q->queue_lock);

	rcu_read_unlock();
	return throttled;
}

#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
static void throtl_track_latency(struct throtl_data *td, sector_t size,
				 enum req_op op, unsigned long time)
{
	const bool rw = op_is_write(op);
	struct latency_bucket *latency;
	int index;

	if (!td || td->limit_index != LIMIT_LOW ||
	    !(op == REQ_OP_READ || op == REQ_OP_WRITE) ||
	    !blk_queue_nonrot(td->queue))
		return;

	index = request_bucket_index(size);

	latency = get_cpu_ptr(td->latency_buckets[rw]);
	latency[index].total_latency += time;
	latency[index].samples++;
	put_cpu_ptr(td->latency_buckets[rw]);
}

void blk_throtl_stat_add(struct request *rq, u64 time_ns)
{
	struct request_queue *q = rq->q;
	struct throtl_data *td = q->td;

	throtl_track_latency(td, blk_rq_stats_sectors(rq), req_op(rq),
			     time_ns >> 10);
}

void blk_throtl_bio_endio(struct bio *bio)
{
	struct blkcg_gq *blkg;
	struct throtl_grp *tg;
	u64 finish_time_ns;
	unsigned long finish_time;
	unsigned long start_time;
	unsigned long lat;
	int rw = bio_data_dir(bio);

	blkg = bio->bi_blkg;
	if (!blkg)
		return;
	tg = blkg_to_tg(blkg);
	if (!tg->td->limit_valid[LIMIT_LOW])
		return;

	finish_time_ns = ktime_get_ns();
	tg->last_finish_time = finish_time_ns >> 10;

	start_time = bio_issue_time(&bio->bi_issue) >> 10;
	finish_time = __bio_issue_time(finish_time_ns) >> 10;
	if (!start_time || finish_time <= start_time)
		return;

	lat = finish_time - start_time;
	/* this is only for bio-based drivers */
	if (!(bio->bi_issue.value & BIO_ISSUE_THROTL_SKIP_LATENCY))
		throtl_track_latency(tg->td, bio_issue_size(&bio->bi_issue),
				     bio_op(bio), lat);

	if (tg->latency_target && lat >= tg->td->filtered_latency) {
		int bucket;
		unsigned int threshold;

		bucket = request_bucket_index(bio_issue_size(&bio->bi_issue));
		threshold = tg->td->avg_buckets[rw][bucket].latency +
			tg->latency_target;
		if (lat > threshold)
			tg->bad_bio_cnt++;
		/*
		 * Not race free, so we could get a wrong count, which means
		 * cgroups may be throttled
		 */
		tg->bio_cnt++;
	}

	if (time_after(jiffies, tg->bio_cnt_reset_time) || tg->bio_cnt > 1024) {
		tg->bio_cnt_reset_time = tg->td->throtl_slice + jiffies;
		tg->bio_cnt /= 2;
		tg->bad_bio_cnt /= 2;
	}
}
#endif
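/**
 * blk_throtl_init - set up throttling state for @disk
 * @disk: the gendisk being initialized
 *
 * Allocate the per-queue throtl_data together with its per-cpu latency
 * buckets and activate the blk-throttle policy on the queue.  Returns 0 on
 * success or a negative errno; partial allocations are freed on failure.
 */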
int blk_throtl_init(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct throtl_data *td;
	int ret;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;
	td->latency_buckets[READ] = __alloc_percpu(sizeof(struct latency_bucket) *
		LATENCY_BUCKET_SIZE, __alignof__(u64));
	if (!td->latency_buckets[READ]) {
		kfree(td);
		return -ENOMEM;
	}
	td->latency_buckets[WRITE] = __alloc_percpu(sizeof(struct latency_bucket) *
		LATENCY_BUCKET_SIZE, __alignof__(u64));
	if (!td->latency_buckets[WRITE]) {
		free_percpu(td->latency_buckets[READ]);
		kfree(td);
		return -ENOMEM;
	}

	INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn);
	throtl_service_queue_init(&td->service_queue);

	q->td = td;
	td->queue = q;

	td->limit_valid[LIMIT_MAX] = true;
	td->limit_index = LIMIT_MAX;
	td->low_upgrade_time = jiffies;
	td->low_downgrade_time = jiffies;

	/* activate policy */
	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
	if (ret) {
		free_percpu(td->latency_buckets[READ]);
		free_percpu(td->latency_buckets[WRITE]);
		kfree(td);
	}
	return ret;
}

void blk_throtl_exit(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;

	BUG_ON(!q->td);
	del_timer_sync(&q->td->service_queue.pending_timer);
	throtl_shutdown_wq(q);
	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
	free_percpu(q->td->latency_buckets[READ]);
	free_percpu(q->td->latency_buckets[WRITE]);
	kfree(q->td);
}

void blk_throtl_register(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct throtl_data *td;
	int i;

	td = q->td;
	BUG_ON(!td);

	if (blk_queue_nonrot(q)) {
		td->throtl_slice = DFL_THROTL_SLICE_SSD;
		td->filtered_latency = LATENCY_FILTERED_SSD;
	} else {
		td->throtl_slice = DFL_THROTL_SLICE_HD;
		td->filtered_latency = LATENCY_FILTERED_HD;
		for (i = 0; i < LATENCY_BUCKET_SIZE; i++) {
			td->avg_buckets[READ][i].latency = DFL_HD_BASELINE_LATENCY;
			td->avg_buckets[WRITE][i].latency = DFL_HD_BASELINE_LATENCY;
		}
	}
#ifndef CONFIG_BLK_DEV_THROTTLING_LOW
	/* if no low limit, use previous default */
	td->throtl_slice = DFL_THROTL_SLICE_HD;
#endif

	td->track_bio_latency = !queue_is_mq(q);
	if (!td->track_bio_latency)
		blk_stat_enable_accounting(q);
}

#ifdef CONFIG_BLK_DEV_THROTTLING_LOW
ssize_t blk_throtl_sample_time_show(struct request_queue *q, char *page)
{
	if (!q->td)
		return -EINVAL;
	return sprintf(page, "%u\n", jiffies_to_msecs(q->td->throtl_slice));
}

ssize_t blk_throtl_sample_time_store(struct request_queue *q,
	const char *page, size_t count)
{
	unsigned long v;
	unsigned long t;

	if (!q->td)
		return -EINVAL;
	if (kstrtoul(page, 10, &v))
		return -EINVAL;
	t = msecs_to_jiffies(v);
	if (t == 0 || t > MAX_THROTL_SLICE)
		return -EINVAL;
	q->td->throtl_slice = t;
	return count;
}
#endif

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	return blkcg_policy_register(&blkcg_policy_throtl);
}

module_init(throtl_init);