1 /* 2 * Interface for controlling IO bandwidth on a request queue 3 * 4 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com> 5 */ 6 7 #include <linux/module.h> 8 #include <linux/slab.h> 9 #include <linux/blkdev.h> 10 #include <linux/bio.h> 11 #include <linux/blktrace_api.h> 12 #include <linux/blk-cgroup.h> 13 #include "blk.h" 14 15 /* Max dispatch from a group in 1 round */ 16 static int throtl_grp_quantum = 8; 17 18 /* Total max dispatch from all groups in one round */ 19 static int throtl_quantum = 32; 20 21 /* Throttling is performed over 100ms slice and after that slice is renewed */ 22 static unsigned long throtl_slice = HZ/10; /* 100 ms */ 23 24 static struct blkcg_policy blkcg_policy_throtl; 25 26 /* A workqueue to queue throttle related work */ 27 static struct workqueue_struct *kthrotld_workqueue; 28 29 /* 30 * To implement hierarchical throttling, throtl_grps form a tree and bios 31 * are dispatched upwards level by level until they reach the top and get 32 * issued. When dispatching bios from the children and local group at each 33 * level, if the bios are dispatched into a single bio_list, there's a risk 34 * of a local or child group which can queue many bios at once filling up 35 * the list starving others. 36 * 37 * To avoid such starvation, dispatched bios are queued separately 38 * according to where they came from. When they are again dispatched to 39 * the parent, they're popped in round-robin order so that no single source 40 * hogs the dispatch window. 41 * 42 * throtl_qnode is used to keep the queued bios separated by their sources. 43 * Bios are queued to throtl_qnode which in turn is queued to 44 * throtl_service_queue and then dispatched in round-robin order. 45 * 46 * It's also used to track the reference counts on blkg's. A qnode always 47 * belongs to a throtl_grp and gets queued on itself or the parent, so 48 * incrementing the reference of the associated throtl_grp when a qnode is 49 * queued and decrementing when dequeued is enough to keep the whole blkg 50 * tree pinned while bios are in flight. 51 */ 52 struct throtl_qnode { 53 struct list_head node; /* service_queue->queued[] */ 54 struct bio_list bios; /* queued bios */ 55 struct throtl_grp *tg; /* tg this qnode belongs to */ 56 }; 57 58 struct throtl_service_queue { 59 struct throtl_service_queue *parent_sq; /* the parent service_queue */ 60 61 /* 62 * Bios queued directly to this service_queue or dispatched from 63 * children throtl_grp's. 64 */ 65 struct list_head queued[2]; /* throtl_qnode [READ/WRITE] */ 66 unsigned int nr_queued[2]; /* number of queued bios */ 67 68 /* 69 * RB tree of active children throtl_grp's, which are sorted by 70 * their ->disptime. 71 */ 72 struct rb_root pending_tree; /* RB tree of active tgs */ 73 struct rb_node *first_pending; /* first node in the tree */ 74 unsigned int nr_pending; /* # queued in the tree */ 75 unsigned long first_pending_disptime; /* disptime of the first tg */ 76 struct timer_list pending_timer; /* fires on first_pending_disptime */ 77 }; 78 79 enum tg_state_flags { 80 THROTL_TG_PENDING = 1 << 0, /* on parent's pending tree */ 81 THROTL_TG_WAS_EMPTY = 1 << 1, /* bio_lists[] became non-empty */ 82 }; 83 84 #define rb_entry_tg(node) rb_entry((node), struct throtl_grp, rb_node) 85 86 struct throtl_grp { 87 /* must be the first member */ 88 struct blkg_policy_data pd; 89 90 /* active throtl group service_queue member */ 91 struct rb_node rb_node; 92 93 /* throtl_data this group belongs to */ 94 struct throtl_data *td; 95 96 /* this group's service queue */ 97 struct throtl_service_queue service_queue; 98 99 /* 100 * qnode_on_self is used when bios are directly queued to this 101 * throtl_grp so that local bios compete fairly with bios 102 * dispatched from children. qnode_on_parent is used when bios are 103 * dispatched from this throtl_grp into its parent and will compete 104 * with the sibling qnode_on_parents and the parent's 105 * qnode_on_self. 106 */ 107 struct throtl_qnode qnode_on_self[2]; 108 struct throtl_qnode qnode_on_parent[2]; 109 110 /* 111 * Dispatch time in jiffies. This is the estimated time when group 112 * will unthrottle and is ready to dispatch more bio. It is used as 113 * key to sort active groups in service tree. 114 */ 115 unsigned long disptime; 116 117 unsigned int flags; 118 119 /* are there any throtl rules between this group and td? */ 120 bool has_rules[2]; 121 122 /* bytes per second rate limits */ 123 uint64_t bps[2]; 124 125 /* IOPS limits */ 126 unsigned int iops[2]; 127 128 /* Number of bytes disptached in current slice */ 129 uint64_t bytes_disp[2]; 130 /* Number of bio's dispatched in current slice */ 131 unsigned int io_disp[2]; 132 133 /* When did we start a new slice */ 134 unsigned long slice_start[2]; 135 unsigned long slice_end[2]; 136 }; 137 138 struct throtl_data 139 { 140 /* service tree for active throtl groups */ 141 struct throtl_service_queue service_queue; 142 143 struct request_queue *queue; 144 145 /* Total Number of queued bios on READ and WRITE lists */ 146 unsigned int nr_queued[2]; 147 148 /* 149 * number of total undestroyed groups 150 */ 151 unsigned int nr_undestroyed_grps; 152 153 /* Work for dispatching throttled bios */ 154 struct work_struct dispatch_work; 155 }; 156 157 static void throtl_pending_timer_fn(unsigned long arg); 158 159 static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd) 160 { 161 return pd ? container_of(pd, struct throtl_grp, pd) : NULL; 162 } 163 164 static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg) 165 { 166 return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl)); 167 } 168 169 static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg) 170 { 171 return pd_to_blkg(&tg->pd); 172 } 173 174 /** 175 * sq_to_tg - return the throl_grp the specified service queue belongs to 176 * @sq: the throtl_service_queue of interest 177 * 178 * Return the throtl_grp @sq belongs to. If @sq is the top-level one 179 * embedded in throtl_data, %NULL is returned. 180 */ 181 static struct throtl_grp *sq_to_tg(struct throtl_service_queue *sq) 182 { 183 if (sq && sq->parent_sq) 184 return container_of(sq, struct throtl_grp, service_queue); 185 else 186 return NULL; 187 } 188 189 /** 190 * sq_to_td - return throtl_data the specified service queue belongs to 191 * @sq: the throtl_service_queue of interest 192 * 193 * A service_queue can be embeded in either a throtl_grp or throtl_data. 194 * Determine the associated throtl_data accordingly and return it. 195 */ 196 static struct throtl_data *sq_to_td(struct throtl_service_queue *sq) 197 { 198 struct throtl_grp *tg = sq_to_tg(sq); 199 200 if (tg) 201 return tg->td; 202 else 203 return container_of(sq, struct throtl_data, service_queue); 204 } 205 206 /** 207 * throtl_log - log debug message via blktrace 208 * @sq: the service_queue being reported 209 * @fmt: printf format string 210 * @args: printf args 211 * 212 * The messages are prefixed with "throtl BLKG_NAME" if @sq belongs to a 213 * throtl_grp; otherwise, just "throtl". 214 */ 215 #define throtl_log(sq, fmt, args...) do { \ 216 struct throtl_grp *__tg = sq_to_tg((sq)); \ 217 struct throtl_data *__td = sq_to_td((sq)); \ 218 \ 219 (void)__td; \ 220 if (likely(!blk_trace_note_message_enabled(__td->queue))) \ 221 break; \ 222 if ((__tg)) { \ 223 char __pbuf[128]; \ 224 \ 225 blkg_path(tg_to_blkg(__tg), __pbuf, sizeof(__pbuf)); \ 226 blk_add_trace_msg(__td->queue, "throtl %s " fmt, __pbuf, ##args); \ 227 } else { \ 228 blk_add_trace_msg(__td->queue, "throtl " fmt, ##args); \ 229 } \ 230 } while (0) 231 232 static void throtl_qnode_init(struct throtl_qnode *qn, struct throtl_grp *tg) 233 { 234 INIT_LIST_HEAD(&qn->node); 235 bio_list_init(&qn->bios); 236 qn->tg = tg; 237 } 238 239 /** 240 * throtl_qnode_add_bio - add a bio to a throtl_qnode and activate it 241 * @bio: bio being added 242 * @qn: qnode to add bio to 243 * @queued: the service_queue->queued[] list @qn belongs to 244 * 245 * Add @bio to @qn and put @qn on @queued if it's not already on. 246 * @qn->tg's reference count is bumped when @qn is activated. See the 247 * comment on top of throtl_qnode definition for details. 248 */ 249 static void throtl_qnode_add_bio(struct bio *bio, struct throtl_qnode *qn, 250 struct list_head *queued) 251 { 252 bio_list_add(&qn->bios, bio); 253 if (list_empty(&qn->node)) { 254 list_add_tail(&qn->node, queued); 255 blkg_get(tg_to_blkg(qn->tg)); 256 } 257 } 258 259 /** 260 * throtl_peek_queued - peek the first bio on a qnode list 261 * @queued: the qnode list to peek 262 */ 263 static struct bio *throtl_peek_queued(struct list_head *queued) 264 { 265 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); 266 struct bio *bio; 267 268 if (list_empty(queued)) 269 return NULL; 270 271 bio = bio_list_peek(&qn->bios); 272 WARN_ON_ONCE(!bio); 273 return bio; 274 } 275 276 /** 277 * throtl_pop_queued - pop the first bio form a qnode list 278 * @queued: the qnode list to pop a bio from 279 * @tg_to_put: optional out argument for throtl_grp to put 280 * 281 * Pop the first bio from the qnode list @queued. After popping, the first 282 * qnode is removed from @queued if empty or moved to the end of @queued so 283 * that the popping order is round-robin. 284 * 285 * When the first qnode is removed, its associated throtl_grp should be put 286 * too. If @tg_to_put is NULL, this function automatically puts it; 287 * otherwise, *@tg_to_put is set to the throtl_grp to put and the caller is 288 * responsible for putting it. 289 */ 290 static struct bio *throtl_pop_queued(struct list_head *queued, 291 struct throtl_grp **tg_to_put) 292 { 293 struct throtl_qnode *qn = list_first_entry(queued, struct throtl_qnode, node); 294 struct bio *bio; 295 296 if (list_empty(queued)) 297 return NULL; 298 299 bio = bio_list_pop(&qn->bios); 300 WARN_ON_ONCE(!bio); 301 302 if (bio_list_empty(&qn->bios)) { 303 list_del_init(&qn->node); 304 if (tg_to_put) 305 *tg_to_put = qn->tg; 306 else 307 blkg_put(tg_to_blkg(qn->tg)); 308 } else { 309 list_move_tail(&qn->node, queued); 310 } 311 312 return bio; 313 } 314 315 /* init a service_queue, assumes the caller zeroed it */ 316 static void throtl_service_queue_init(struct throtl_service_queue *sq) 317 { 318 INIT_LIST_HEAD(&sq->queued[0]); 319 INIT_LIST_HEAD(&sq->queued[1]); 320 sq->pending_tree = RB_ROOT; 321 setup_timer(&sq->pending_timer, throtl_pending_timer_fn, 322 (unsigned long)sq); 323 } 324 325 static struct blkg_policy_data *throtl_pd_alloc(gfp_t gfp, int node) 326 { 327 struct throtl_grp *tg; 328 int rw; 329 330 tg = kzalloc_node(sizeof(*tg), gfp, node); 331 if (!tg) 332 return NULL; 333 334 throtl_service_queue_init(&tg->service_queue); 335 336 for (rw = READ; rw <= WRITE; rw++) { 337 throtl_qnode_init(&tg->qnode_on_self[rw], tg); 338 throtl_qnode_init(&tg->qnode_on_parent[rw], tg); 339 } 340 341 RB_CLEAR_NODE(&tg->rb_node); 342 tg->bps[READ] = -1; 343 tg->bps[WRITE] = -1; 344 tg->iops[READ] = -1; 345 tg->iops[WRITE] = -1; 346 347 return &tg->pd; 348 } 349 350 static void throtl_pd_init(struct blkg_policy_data *pd) 351 { 352 struct throtl_grp *tg = pd_to_tg(pd); 353 struct blkcg_gq *blkg = tg_to_blkg(tg); 354 struct throtl_data *td = blkg->q->td; 355 struct throtl_service_queue *sq = &tg->service_queue; 356 357 /* 358 * If on the default hierarchy, we switch to properly hierarchical 359 * behavior where limits on a given throtl_grp are applied to the 360 * whole subtree rather than just the group itself. e.g. If 16M 361 * read_bps limit is set on the root group, the whole system can't 362 * exceed 16M for the device. 363 * 364 * If not on the default hierarchy, the broken flat hierarchy 365 * behavior is retained where all throtl_grps are treated as if 366 * they're all separate root groups right below throtl_data. 367 * Limits of a group don't interact with limits of other groups 368 * regardless of the position of the group in the hierarchy. 369 */ 370 sq->parent_sq = &td->service_queue; 371 if (cgroup_subsys_on_dfl(io_cgrp_subsys) && blkg->parent) 372 sq->parent_sq = &blkg_to_tg(blkg->parent)->service_queue; 373 tg->td = td; 374 } 375 376 /* 377 * Set has_rules[] if @tg or any of its parents have limits configured. 378 * This doesn't require walking up to the top of the hierarchy as the 379 * parent's has_rules[] is guaranteed to be correct. 380 */ 381 static void tg_update_has_rules(struct throtl_grp *tg) 382 { 383 struct throtl_grp *parent_tg = sq_to_tg(tg->service_queue.parent_sq); 384 int rw; 385 386 for (rw = READ; rw <= WRITE; rw++) 387 tg->has_rules[rw] = (parent_tg && parent_tg->has_rules[rw]) || 388 (tg->bps[rw] != -1 || tg->iops[rw] != -1); 389 } 390 391 static void throtl_pd_online(struct blkg_policy_data *pd) 392 { 393 /* 394 * We don't want new groups to escape the limits of its ancestors. 395 * Update has_rules[] after a new group is brought online. 396 */ 397 tg_update_has_rules(pd_to_tg(pd)); 398 } 399 400 static void throtl_pd_free(struct blkg_policy_data *pd) 401 { 402 struct throtl_grp *tg = pd_to_tg(pd); 403 404 del_timer_sync(&tg->service_queue.pending_timer); 405 kfree(tg); 406 } 407 408 static struct throtl_grp * 409 throtl_rb_first(struct throtl_service_queue *parent_sq) 410 { 411 /* Service tree is empty */ 412 if (!parent_sq->nr_pending) 413 return NULL; 414 415 if (!parent_sq->first_pending) 416 parent_sq->first_pending = rb_first(&parent_sq->pending_tree); 417 418 if (parent_sq->first_pending) 419 return rb_entry_tg(parent_sq->first_pending); 420 421 return NULL; 422 } 423 424 static void rb_erase_init(struct rb_node *n, struct rb_root *root) 425 { 426 rb_erase(n, root); 427 RB_CLEAR_NODE(n); 428 } 429 430 static void throtl_rb_erase(struct rb_node *n, 431 struct throtl_service_queue *parent_sq) 432 { 433 if (parent_sq->first_pending == n) 434 parent_sq->first_pending = NULL; 435 rb_erase_init(n, &parent_sq->pending_tree); 436 --parent_sq->nr_pending; 437 } 438 439 static void update_min_dispatch_time(struct throtl_service_queue *parent_sq) 440 { 441 struct throtl_grp *tg; 442 443 tg = throtl_rb_first(parent_sq); 444 if (!tg) 445 return; 446 447 parent_sq->first_pending_disptime = tg->disptime; 448 } 449 450 static void tg_service_queue_add(struct throtl_grp *tg) 451 { 452 struct throtl_service_queue *parent_sq = tg->service_queue.parent_sq; 453 struct rb_node **node = &parent_sq->pending_tree.rb_node; 454 struct rb_node *parent = NULL; 455 struct throtl_grp *__tg; 456 unsigned long key = tg->disptime; 457 int left = 1; 458 459 while (*node != NULL) { 460 parent = *node; 461 __tg = rb_entry_tg(parent); 462 463 if (time_before(key, __tg->disptime)) 464 node = &parent->rb_left; 465 else { 466 node = &parent->rb_right; 467 left = 0; 468 } 469 } 470 471 if (left) 472 parent_sq->first_pending = &tg->rb_node; 473 474 rb_link_node(&tg->rb_node, parent, node); 475 rb_insert_color(&tg->rb_node, &parent_sq->pending_tree); 476 } 477 478 static void __throtl_enqueue_tg(struct throtl_grp *tg) 479 { 480 tg_service_queue_add(tg); 481 tg->flags |= THROTL_TG_PENDING; 482 tg->service_queue.parent_sq->nr_pending++; 483 } 484 485 static void throtl_enqueue_tg(struct throtl_grp *tg) 486 { 487 if (!(tg->flags & THROTL_TG_PENDING)) 488 __throtl_enqueue_tg(tg); 489 } 490 491 static void __throtl_dequeue_tg(struct throtl_grp *tg) 492 { 493 throtl_rb_erase(&tg->rb_node, tg->service_queue.parent_sq); 494 tg->flags &= ~THROTL_TG_PENDING; 495 } 496 497 static void throtl_dequeue_tg(struct throtl_grp *tg) 498 { 499 if (tg->flags & THROTL_TG_PENDING) 500 __throtl_dequeue_tg(tg); 501 } 502 503 /* Call with queue lock held */ 504 static void throtl_schedule_pending_timer(struct throtl_service_queue *sq, 505 unsigned long expires) 506 { 507 mod_timer(&sq->pending_timer, expires); 508 throtl_log(sq, "schedule timer. delay=%lu jiffies=%lu", 509 expires - jiffies, jiffies); 510 } 511 512 /** 513 * throtl_schedule_next_dispatch - schedule the next dispatch cycle 514 * @sq: the service_queue to schedule dispatch for 515 * @force: force scheduling 516 * 517 * Arm @sq->pending_timer so that the next dispatch cycle starts on the 518 * dispatch time of the first pending child. Returns %true if either timer 519 * is armed or there's no pending child left. %false if the current 520 * dispatch window is still open and the caller should continue 521 * dispatching. 522 * 523 * If @force is %true, the dispatch timer is always scheduled and this 524 * function is guaranteed to return %true. This is to be used when the 525 * caller can't dispatch itself and needs to invoke pending_timer 526 * unconditionally. Note that forced scheduling is likely to induce short 527 * delay before dispatch starts even if @sq->first_pending_disptime is not 528 * in the future and thus shouldn't be used in hot paths. 529 */ 530 static bool throtl_schedule_next_dispatch(struct throtl_service_queue *sq, 531 bool force) 532 { 533 /* any pending children left? */ 534 if (!sq->nr_pending) 535 return true; 536 537 update_min_dispatch_time(sq); 538 539 /* is the next dispatch time in the future? */ 540 if (force || time_after(sq->first_pending_disptime, jiffies)) { 541 throtl_schedule_pending_timer(sq, sq->first_pending_disptime); 542 return true; 543 } 544 545 /* tell the caller to continue dispatching */ 546 return false; 547 } 548 549 static inline void throtl_start_new_slice_with_credit(struct throtl_grp *tg, 550 bool rw, unsigned long start) 551 { 552 tg->bytes_disp[rw] = 0; 553 tg->io_disp[rw] = 0; 554 555 /* 556 * Previous slice has expired. We must have trimmed it after last 557 * bio dispatch. That means since start of last slice, we never used 558 * that bandwidth. Do try to make use of that bandwidth while giving 559 * credit. 560 */ 561 if (time_after_eq(start, tg->slice_start[rw])) 562 tg->slice_start[rw] = start; 563 564 tg->slice_end[rw] = jiffies + throtl_slice; 565 throtl_log(&tg->service_queue, 566 "[%c] new slice with credit start=%lu end=%lu jiffies=%lu", 567 rw == READ ? 'R' : 'W', tg->slice_start[rw], 568 tg->slice_end[rw], jiffies); 569 } 570 571 static inline void throtl_start_new_slice(struct throtl_grp *tg, bool rw) 572 { 573 tg->bytes_disp[rw] = 0; 574 tg->io_disp[rw] = 0; 575 tg->slice_start[rw] = jiffies; 576 tg->slice_end[rw] = jiffies + throtl_slice; 577 throtl_log(&tg->service_queue, 578 "[%c] new slice start=%lu end=%lu jiffies=%lu", 579 rw == READ ? 'R' : 'W', tg->slice_start[rw], 580 tg->slice_end[rw], jiffies); 581 } 582 583 static inline void throtl_set_slice_end(struct throtl_grp *tg, bool rw, 584 unsigned long jiffy_end) 585 { 586 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 587 } 588 589 static inline void throtl_extend_slice(struct throtl_grp *tg, bool rw, 590 unsigned long jiffy_end) 591 { 592 tg->slice_end[rw] = roundup(jiffy_end, throtl_slice); 593 throtl_log(&tg->service_queue, 594 "[%c] extend slice start=%lu end=%lu jiffies=%lu", 595 rw == READ ? 'R' : 'W', tg->slice_start[rw], 596 tg->slice_end[rw], jiffies); 597 } 598 599 /* Determine if previously allocated or extended slice is complete or not */ 600 static bool throtl_slice_used(struct throtl_grp *tg, bool rw) 601 { 602 if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw])) 603 return false; 604 605 return 1; 606 } 607 608 /* Trim the used slices and adjust slice start accordingly */ 609 static inline void throtl_trim_slice(struct throtl_grp *tg, bool rw) 610 { 611 unsigned long nr_slices, time_elapsed, io_trim; 612 u64 bytes_trim, tmp; 613 614 BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw])); 615 616 /* 617 * If bps are unlimited (-1), then time slice don't get 618 * renewed. Don't try to trim the slice if slice is used. A new 619 * slice will start when appropriate. 620 */ 621 if (throtl_slice_used(tg, rw)) 622 return; 623 624 /* 625 * A bio has been dispatched. Also adjust slice_end. It might happen 626 * that initially cgroup limit was very low resulting in high 627 * slice_end, but later limit was bumped up and bio was dispached 628 * sooner, then we need to reduce slice_end. A high bogus slice_end 629 * is bad because it does not allow new slice to start. 630 */ 631 632 throtl_set_slice_end(tg, rw, jiffies + throtl_slice); 633 634 time_elapsed = jiffies - tg->slice_start[rw]; 635 636 nr_slices = time_elapsed / throtl_slice; 637 638 if (!nr_slices) 639 return; 640 tmp = tg->bps[rw] * throtl_slice * nr_slices; 641 do_div(tmp, HZ); 642 bytes_trim = tmp; 643 644 io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ; 645 646 if (!bytes_trim && !io_trim) 647 return; 648 649 if (tg->bytes_disp[rw] >= bytes_trim) 650 tg->bytes_disp[rw] -= bytes_trim; 651 else 652 tg->bytes_disp[rw] = 0; 653 654 if (tg->io_disp[rw] >= io_trim) 655 tg->io_disp[rw] -= io_trim; 656 else 657 tg->io_disp[rw] = 0; 658 659 tg->slice_start[rw] += nr_slices * throtl_slice; 660 661 throtl_log(&tg->service_queue, 662 "[%c] trim slice nr=%lu bytes=%llu io=%lu start=%lu end=%lu jiffies=%lu", 663 rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim, 664 tg->slice_start[rw], tg->slice_end[rw], jiffies); 665 } 666 667 static bool tg_with_in_iops_limit(struct throtl_grp *tg, struct bio *bio, 668 unsigned long *wait) 669 { 670 bool rw = bio_data_dir(bio); 671 unsigned int io_allowed; 672 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 673 u64 tmp; 674 675 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 676 677 /* Slice has just started. Consider one slice interval */ 678 if (!jiffy_elapsed) 679 jiffy_elapsed_rnd = throtl_slice; 680 681 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); 682 683 /* 684 * jiffy_elapsed_rnd should not be a big value as minimum iops can be 685 * 1 then at max jiffy elapsed should be equivalent of 1 second as we 686 * will allow dispatch after 1 second and after that slice should 687 * have been trimmed. 688 */ 689 690 tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd; 691 do_div(tmp, HZ); 692 693 if (tmp > UINT_MAX) 694 io_allowed = UINT_MAX; 695 else 696 io_allowed = tmp; 697 698 if (tg->io_disp[rw] + 1 <= io_allowed) { 699 if (wait) 700 *wait = 0; 701 return true; 702 } 703 704 /* Calc approx time to dispatch */ 705 jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1; 706 707 if (jiffy_wait > jiffy_elapsed) 708 jiffy_wait = jiffy_wait - jiffy_elapsed; 709 else 710 jiffy_wait = 1; 711 712 if (wait) 713 *wait = jiffy_wait; 714 return 0; 715 } 716 717 static bool tg_with_in_bps_limit(struct throtl_grp *tg, struct bio *bio, 718 unsigned long *wait) 719 { 720 bool rw = bio_data_dir(bio); 721 u64 bytes_allowed, extra_bytes, tmp; 722 unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd; 723 724 jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw]; 725 726 /* Slice has just started. Consider one slice interval */ 727 if (!jiffy_elapsed) 728 jiffy_elapsed_rnd = throtl_slice; 729 730 jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice); 731 732 tmp = tg->bps[rw] * jiffy_elapsed_rnd; 733 do_div(tmp, HZ); 734 bytes_allowed = tmp; 735 736 if (tg->bytes_disp[rw] + bio->bi_iter.bi_size <= bytes_allowed) { 737 if (wait) 738 *wait = 0; 739 return true; 740 } 741 742 /* Calc approx time to dispatch */ 743 extra_bytes = tg->bytes_disp[rw] + bio->bi_iter.bi_size - bytes_allowed; 744 jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]); 745 746 if (!jiffy_wait) 747 jiffy_wait = 1; 748 749 /* 750 * This wait time is without taking into consideration the rounding 751 * up we did. Add that time also. 752 */ 753 jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed); 754 if (wait) 755 *wait = jiffy_wait; 756 return 0; 757 } 758 759 /* 760 * Returns whether one can dispatch a bio or not. Also returns approx number 761 * of jiffies to wait before this bio is with-in IO rate and can be dispatched 762 */ 763 static bool tg_may_dispatch(struct throtl_grp *tg, struct bio *bio, 764 unsigned long *wait) 765 { 766 bool rw = bio_data_dir(bio); 767 unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0; 768 769 /* 770 * Currently whole state machine of group depends on first bio 771 * queued in the group bio list. So one should not be calling 772 * this function with a different bio if there are other bios 773 * queued. 774 */ 775 BUG_ON(tg->service_queue.nr_queued[rw] && 776 bio != throtl_peek_queued(&tg->service_queue.queued[rw])); 777 778 /* If tg->bps = -1, then BW is unlimited */ 779 if (tg->bps[rw] == -1 && tg->iops[rw] == -1) { 780 if (wait) 781 *wait = 0; 782 return true; 783 } 784 785 /* 786 * If previous slice expired, start a new one otherwise renew/extend 787 * existing slice to make sure it is at least throtl_slice interval 788 * long since now. 789 */ 790 if (throtl_slice_used(tg, rw)) 791 throtl_start_new_slice(tg, rw); 792 else { 793 if (time_before(tg->slice_end[rw], jiffies + throtl_slice)) 794 throtl_extend_slice(tg, rw, jiffies + throtl_slice); 795 } 796 797 if (tg_with_in_bps_limit(tg, bio, &bps_wait) && 798 tg_with_in_iops_limit(tg, bio, &iops_wait)) { 799 if (wait) 800 *wait = 0; 801 return 1; 802 } 803 804 max_wait = max(bps_wait, iops_wait); 805 806 if (wait) 807 *wait = max_wait; 808 809 if (time_before(tg->slice_end[rw], jiffies + max_wait)) 810 throtl_extend_slice(tg, rw, jiffies + max_wait); 811 812 return 0; 813 } 814 815 static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio) 816 { 817 bool rw = bio_data_dir(bio); 818 819 /* Charge the bio to the group */ 820 tg->bytes_disp[rw] += bio->bi_iter.bi_size; 821 tg->io_disp[rw]++; 822 823 /* 824 * REQ_THROTTLED is used to prevent the same bio to be throttled 825 * more than once as a throttled bio will go through blk-throtl the 826 * second time when it eventually gets issued. Set it when a bio 827 * is being charged to a tg. 828 */ 829 if (!(bio->bi_rw & REQ_THROTTLED)) 830 bio->bi_rw |= REQ_THROTTLED; 831 } 832 833 /** 834 * throtl_add_bio_tg - add a bio to the specified throtl_grp 835 * @bio: bio to add 836 * @qn: qnode to use 837 * @tg: the target throtl_grp 838 * 839 * Add @bio to @tg's service_queue using @qn. If @qn is not specified, 840 * tg->qnode_on_self[] is used. 841 */ 842 static void throtl_add_bio_tg(struct bio *bio, struct throtl_qnode *qn, 843 struct throtl_grp *tg) 844 { 845 struct throtl_service_queue *sq = &tg->service_queue; 846 bool rw = bio_data_dir(bio); 847 848 if (!qn) 849 qn = &tg->qnode_on_self[rw]; 850 851 /* 852 * If @tg doesn't currently have any bios queued in the same 853 * direction, queueing @bio can change when @tg should be 854 * dispatched. Mark that @tg was empty. This is automatically 855 * cleaered on the next tg_update_disptime(). 856 */ 857 if (!sq->nr_queued[rw]) 858 tg->flags |= THROTL_TG_WAS_EMPTY; 859 860 throtl_qnode_add_bio(bio, qn, &sq->queued[rw]); 861 862 sq->nr_queued[rw]++; 863 throtl_enqueue_tg(tg); 864 } 865 866 static void tg_update_disptime(struct throtl_grp *tg) 867 { 868 struct throtl_service_queue *sq = &tg->service_queue; 869 unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime; 870 struct bio *bio; 871 872 if ((bio = throtl_peek_queued(&sq->queued[READ]))) 873 tg_may_dispatch(tg, bio, &read_wait); 874 875 if ((bio = throtl_peek_queued(&sq->queued[WRITE]))) 876 tg_may_dispatch(tg, bio, &write_wait); 877 878 min_wait = min(read_wait, write_wait); 879 disptime = jiffies + min_wait; 880 881 /* Update dispatch time */ 882 throtl_dequeue_tg(tg); 883 tg->disptime = disptime; 884 throtl_enqueue_tg(tg); 885 886 /* see throtl_add_bio_tg() */ 887 tg->flags &= ~THROTL_TG_WAS_EMPTY; 888 } 889 890 static void start_parent_slice_with_credit(struct throtl_grp *child_tg, 891 struct throtl_grp *parent_tg, bool rw) 892 { 893 if (throtl_slice_used(parent_tg, rw)) { 894 throtl_start_new_slice_with_credit(parent_tg, rw, 895 child_tg->slice_start[rw]); 896 } 897 898 } 899 900 static void tg_dispatch_one_bio(struct throtl_grp *tg, bool rw) 901 { 902 struct throtl_service_queue *sq = &tg->service_queue; 903 struct throtl_service_queue *parent_sq = sq->parent_sq; 904 struct throtl_grp *parent_tg = sq_to_tg(parent_sq); 905 struct throtl_grp *tg_to_put = NULL; 906 struct bio *bio; 907 908 /* 909 * @bio is being transferred from @tg to @parent_sq. Popping a bio 910 * from @tg may put its reference and @parent_sq might end up 911 * getting released prematurely. Remember the tg to put and put it 912 * after @bio is transferred to @parent_sq. 913 */ 914 bio = throtl_pop_queued(&sq->queued[rw], &tg_to_put); 915 sq->nr_queued[rw]--; 916 917 throtl_charge_bio(tg, bio); 918 919 /* 920 * If our parent is another tg, we just need to transfer @bio to 921 * the parent using throtl_add_bio_tg(). If our parent is 922 * @td->service_queue, @bio is ready to be issued. Put it on its 923 * bio_lists[] and decrease total number queued. The caller is 924 * responsible for issuing these bios. 925 */ 926 if (parent_tg) { 927 throtl_add_bio_tg(bio, &tg->qnode_on_parent[rw], parent_tg); 928 start_parent_slice_with_credit(tg, parent_tg, rw); 929 } else { 930 throtl_qnode_add_bio(bio, &tg->qnode_on_parent[rw], 931 &parent_sq->queued[rw]); 932 BUG_ON(tg->td->nr_queued[rw] <= 0); 933 tg->td->nr_queued[rw]--; 934 } 935 936 throtl_trim_slice(tg, rw); 937 938 if (tg_to_put) 939 blkg_put(tg_to_blkg(tg_to_put)); 940 } 941 942 static int throtl_dispatch_tg(struct throtl_grp *tg) 943 { 944 struct throtl_service_queue *sq = &tg->service_queue; 945 unsigned int nr_reads = 0, nr_writes = 0; 946 unsigned int max_nr_reads = throtl_grp_quantum*3/4; 947 unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads; 948 struct bio *bio; 949 950 /* Try to dispatch 75% READS and 25% WRITES */ 951 952 while ((bio = throtl_peek_queued(&sq->queued[READ])) && 953 tg_may_dispatch(tg, bio, NULL)) { 954 955 tg_dispatch_one_bio(tg, bio_data_dir(bio)); 956 nr_reads++; 957 958 if (nr_reads >= max_nr_reads) 959 break; 960 } 961 962 while ((bio = throtl_peek_queued(&sq->queued[WRITE])) && 963 tg_may_dispatch(tg, bio, NULL)) { 964 965 tg_dispatch_one_bio(tg, bio_data_dir(bio)); 966 nr_writes++; 967 968 if (nr_writes >= max_nr_writes) 969 break; 970 } 971 972 return nr_reads + nr_writes; 973 } 974 975 static int throtl_select_dispatch(struct throtl_service_queue *parent_sq) 976 { 977 unsigned int nr_disp = 0; 978 979 while (1) { 980 struct throtl_grp *tg = throtl_rb_first(parent_sq); 981 struct throtl_service_queue *sq = &tg->service_queue; 982 983 if (!tg) 984 break; 985 986 if (time_before(jiffies, tg->disptime)) 987 break; 988 989 throtl_dequeue_tg(tg); 990 991 nr_disp += throtl_dispatch_tg(tg); 992 993 if (sq->nr_queued[0] || sq->nr_queued[1]) 994 tg_update_disptime(tg); 995 996 if (nr_disp >= throtl_quantum) 997 break; 998 } 999 1000 return nr_disp; 1001 } 1002 1003 /** 1004 * throtl_pending_timer_fn - timer function for service_queue->pending_timer 1005 * @arg: the throtl_service_queue being serviced 1006 * 1007 * This timer is armed when a child throtl_grp with active bio's become 1008 * pending and queued on the service_queue's pending_tree and expires when 1009 * the first child throtl_grp should be dispatched. This function 1010 * dispatches bio's from the children throtl_grps to the parent 1011 * service_queue. 1012 * 1013 * If the parent's parent is another throtl_grp, dispatching is propagated 1014 * by either arming its pending_timer or repeating dispatch directly. If 1015 * the top-level service_tree is reached, throtl_data->dispatch_work is 1016 * kicked so that the ready bio's are issued. 1017 */ 1018 static void throtl_pending_timer_fn(unsigned long arg) 1019 { 1020 struct throtl_service_queue *sq = (void *)arg; 1021 struct throtl_grp *tg = sq_to_tg(sq); 1022 struct throtl_data *td = sq_to_td(sq); 1023 struct request_queue *q = td->queue; 1024 struct throtl_service_queue *parent_sq; 1025 bool dispatched; 1026 int ret; 1027 1028 spin_lock_irq(q->queue_lock); 1029 again: 1030 parent_sq = sq->parent_sq; 1031 dispatched = false; 1032 1033 while (true) { 1034 throtl_log(sq, "dispatch nr_queued=%u read=%u write=%u", 1035 sq->nr_queued[READ] + sq->nr_queued[WRITE], 1036 sq->nr_queued[READ], sq->nr_queued[WRITE]); 1037 1038 ret = throtl_select_dispatch(sq); 1039 if (ret) { 1040 throtl_log(sq, "bios disp=%u", ret); 1041 dispatched = true; 1042 } 1043 1044 if (throtl_schedule_next_dispatch(sq, false)) 1045 break; 1046 1047 /* this dispatch windows is still open, relax and repeat */ 1048 spin_unlock_irq(q->queue_lock); 1049 cpu_relax(); 1050 spin_lock_irq(q->queue_lock); 1051 } 1052 1053 if (!dispatched) 1054 goto out_unlock; 1055 1056 if (parent_sq) { 1057 /* @parent_sq is another throl_grp, propagate dispatch */ 1058 if (tg->flags & THROTL_TG_WAS_EMPTY) { 1059 tg_update_disptime(tg); 1060 if (!throtl_schedule_next_dispatch(parent_sq, false)) { 1061 /* window is already open, repeat dispatching */ 1062 sq = parent_sq; 1063 tg = sq_to_tg(sq); 1064 goto again; 1065 } 1066 } 1067 } else { 1068 /* reached the top-level, queue issueing */ 1069 queue_work(kthrotld_workqueue, &td->dispatch_work); 1070 } 1071 out_unlock: 1072 spin_unlock_irq(q->queue_lock); 1073 } 1074 1075 /** 1076 * blk_throtl_dispatch_work_fn - work function for throtl_data->dispatch_work 1077 * @work: work item being executed 1078 * 1079 * This function is queued for execution when bio's reach the bio_lists[] 1080 * of throtl_data->service_queue. Those bio's are ready and issued by this 1081 * function. 1082 */ 1083 static void blk_throtl_dispatch_work_fn(struct work_struct *work) 1084 { 1085 struct throtl_data *td = container_of(work, struct throtl_data, 1086 dispatch_work); 1087 struct throtl_service_queue *td_sq = &td->service_queue; 1088 struct request_queue *q = td->queue; 1089 struct bio_list bio_list_on_stack; 1090 struct bio *bio; 1091 struct blk_plug plug; 1092 int rw; 1093 1094 bio_list_init(&bio_list_on_stack); 1095 1096 spin_lock_irq(q->queue_lock); 1097 for (rw = READ; rw <= WRITE; rw++) 1098 while ((bio = throtl_pop_queued(&td_sq->queued[rw], NULL))) 1099 bio_list_add(&bio_list_on_stack, bio); 1100 spin_unlock_irq(q->queue_lock); 1101 1102 if (!bio_list_empty(&bio_list_on_stack)) { 1103 blk_start_plug(&plug); 1104 while((bio = bio_list_pop(&bio_list_on_stack))) 1105 generic_make_request(bio); 1106 blk_finish_plug(&plug); 1107 } 1108 } 1109 1110 static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd, 1111 int off) 1112 { 1113 struct throtl_grp *tg = pd_to_tg(pd); 1114 u64 v = *(u64 *)((void *)tg + off); 1115 1116 if (v == -1) 1117 return 0; 1118 return __blkg_prfill_u64(sf, pd, v); 1119 } 1120 1121 static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd, 1122 int off) 1123 { 1124 struct throtl_grp *tg = pd_to_tg(pd); 1125 unsigned int v = *(unsigned int *)((void *)tg + off); 1126 1127 if (v == -1) 1128 return 0; 1129 return __blkg_prfill_u64(sf, pd, v); 1130 } 1131 1132 static int tg_print_conf_u64(struct seq_file *sf, void *v) 1133 { 1134 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_u64, 1135 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1136 return 0; 1137 } 1138 1139 static int tg_print_conf_uint(struct seq_file *sf, void *v) 1140 { 1141 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_conf_uint, 1142 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1143 return 0; 1144 } 1145 1146 static void tg_conf_updated(struct throtl_grp *tg) 1147 { 1148 struct throtl_service_queue *sq = &tg->service_queue; 1149 struct cgroup_subsys_state *pos_css; 1150 struct blkcg_gq *blkg; 1151 1152 throtl_log(&tg->service_queue, 1153 "limit change rbps=%llu wbps=%llu riops=%u wiops=%u", 1154 tg->bps[READ], tg->bps[WRITE], 1155 tg->iops[READ], tg->iops[WRITE]); 1156 1157 /* 1158 * Update has_rules[] flags for the updated tg's subtree. A tg is 1159 * considered to have rules if either the tg itself or any of its 1160 * ancestors has rules. This identifies groups without any 1161 * restrictions in the whole hierarchy and allows them to bypass 1162 * blk-throttle. 1163 */ 1164 blkg_for_each_descendant_pre(blkg, pos_css, tg_to_blkg(tg)) 1165 tg_update_has_rules(blkg_to_tg(blkg)); 1166 1167 /* 1168 * We're already holding queue_lock and know @tg is valid. Let's 1169 * apply the new config directly. 1170 * 1171 * Restart the slices for both READ and WRITES. It might happen 1172 * that a group's limit are dropped suddenly and we don't want to 1173 * account recently dispatched IO with new low rate. 1174 */ 1175 throtl_start_new_slice(tg, 0); 1176 throtl_start_new_slice(tg, 1); 1177 1178 if (tg->flags & THROTL_TG_PENDING) { 1179 tg_update_disptime(tg); 1180 throtl_schedule_next_dispatch(sq->parent_sq, true); 1181 } 1182 } 1183 1184 static ssize_t tg_set_conf(struct kernfs_open_file *of, 1185 char *buf, size_t nbytes, loff_t off, bool is_u64) 1186 { 1187 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1188 struct blkg_conf_ctx ctx; 1189 struct throtl_grp *tg; 1190 int ret; 1191 u64 v; 1192 1193 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 1194 if (ret) 1195 return ret; 1196 1197 ret = -EINVAL; 1198 if (sscanf(ctx.body, "%llu", &v) != 1) 1199 goto out_finish; 1200 if (!v) 1201 v = -1; 1202 1203 tg = blkg_to_tg(ctx.blkg); 1204 1205 if (is_u64) 1206 *(u64 *)((void *)tg + of_cft(of)->private) = v; 1207 else 1208 *(unsigned int *)((void *)tg + of_cft(of)->private) = v; 1209 1210 tg_conf_updated(tg); 1211 ret = 0; 1212 out_finish: 1213 blkg_conf_finish(&ctx); 1214 return ret ?: nbytes; 1215 } 1216 1217 static ssize_t tg_set_conf_u64(struct kernfs_open_file *of, 1218 char *buf, size_t nbytes, loff_t off) 1219 { 1220 return tg_set_conf(of, buf, nbytes, off, true); 1221 } 1222 1223 static ssize_t tg_set_conf_uint(struct kernfs_open_file *of, 1224 char *buf, size_t nbytes, loff_t off) 1225 { 1226 return tg_set_conf(of, buf, nbytes, off, false); 1227 } 1228 1229 static struct cftype throtl_legacy_files[] = { 1230 { 1231 .name = "throttle.read_bps_device", 1232 .private = offsetof(struct throtl_grp, bps[READ]), 1233 .seq_show = tg_print_conf_u64, 1234 .write = tg_set_conf_u64, 1235 }, 1236 { 1237 .name = "throttle.write_bps_device", 1238 .private = offsetof(struct throtl_grp, bps[WRITE]), 1239 .seq_show = tg_print_conf_u64, 1240 .write = tg_set_conf_u64, 1241 }, 1242 { 1243 .name = "throttle.read_iops_device", 1244 .private = offsetof(struct throtl_grp, iops[READ]), 1245 .seq_show = tg_print_conf_uint, 1246 .write = tg_set_conf_uint, 1247 }, 1248 { 1249 .name = "throttle.write_iops_device", 1250 .private = offsetof(struct throtl_grp, iops[WRITE]), 1251 .seq_show = tg_print_conf_uint, 1252 .write = tg_set_conf_uint, 1253 }, 1254 { 1255 .name = "throttle.io_service_bytes", 1256 .private = (unsigned long)&blkcg_policy_throtl, 1257 .seq_show = blkg_print_stat_bytes, 1258 }, 1259 { 1260 .name = "throttle.io_serviced", 1261 .private = (unsigned long)&blkcg_policy_throtl, 1262 .seq_show = blkg_print_stat_ios, 1263 }, 1264 { } /* terminate */ 1265 }; 1266 1267 static u64 tg_prfill_max(struct seq_file *sf, struct blkg_policy_data *pd, 1268 int off) 1269 { 1270 struct throtl_grp *tg = pd_to_tg(pd); 1271 const char *dname = blkg_dev_name(pd->blkg); 1272 char bufs[4][21] = { "max", "max", "max", "max" }; 1273 1274 if (!dname) 1275 return 0; 1276 if (tg->bps[READ] == -1 && tg->bps[WRITE] == -1 && 1277 tg->iops[READ] == -1 && tg->iops[WRITE] == -1) 1278 return 0; 1279 1280 if (tg->bps[READ] != -1) 1281 snprintf(bufs[0], sizeof(bufs[0]), "%llu", tg->bps[READ]); 1282 if (tg->bps[WRITE] != -1) 1283 snprintf(bufs[1], sizeof(bufs[1]), "%llu", tg->bps[WRITE]); 1284 if (tg->iops[READ] != -1) 1285 snprintf(bufs[2], sizeof(bufs[2]), "%u", tg->iops[READ]); 1286 if (tg->iops[WRITE] != -1) 1287 snprintf(bufs[3], sizeof(bufs[3]), "%u", tg->iops[WRITE]); 1288 1289 seq_printf(sf, "%s rbps=%s wbps=%s riops=%s wiops=%s\n", 1290 dname, bufs[0], bufs[1], bufs[2], bufs[3]); 1291 return 0; 1292 } 1293 1294 static int tg_print_max(struct seq_file *sf, void *v) 1295 { 1296 blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)), tg_prfill_max, 1297 &blkcg_policy_throtl, seq_cft(sf)->private, false); 1298 return 0; 1299 } 1300 1301 static ssize_t tg_set_max(struct kernfs_open_file *of, 1302 char *buf, size_t nbytes, loff_t off) 1303 { 1304 struct blkcg *blkcg = css_to_blkcg(of_css(of)); 1305 struct blkg_conf_ctx ctx; 1306 struct throtl_grp *tg; 1307 u64 v[4]; 1308 int ret; 1309 1310 ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx); 1311 if (ret) 1312 return ret; 1313 1314 tg = blkg_to_tg(ctx.blkg); 1315 1316 v[0] = tg->bps[READ]; 1317 v[1] = tg->bps[WRITE]; 1318 v[2] = tg->iops[READ]; 1319 v[3] = tg->iops[WRITE]; 1320 1321 while (true) { 1322 char tok[27]; /* wiops=18446744073709551616 */ 1323 char *p; 1324 u64 val = -1; 1325 int len; 1326 1327 if (sscanf(ctx.body, "%26s%n", tok, &len) != 1) 1328 break; 1329 if (tok[0] == '\0') 1330 break; 1331 ctx.body += len; 1332 1333 ret = -EINVAL; 1334 p = tok; 1335 strsep(&p, "="); 1336 if (!p || (sscanf(p, "%llu", &val) != 1 && strcmp(p, "max"))) 1337 goto out_finish; 1338 1339 ret = -ERANGE; 1340 if (!val) 1341 goto out_finish; 1342 1343 ret = -EINVAL; 1344 if (!strcmp(tok, "rbps")) 1345 v[0] = val; 1346 else if (!strcmp(tok, "wbps")) 1347 v[1] = val; 1348 else if (!strcmp(tok, "riops")) 1349 v[2] = min_t(u64, val, UINT_MAX); 1350 else if (!strcmp(tok, "wiops")) 1351 v[3] = min_t(u64, val, UINT_MAX); 1352 else 1353 goto out_finish; 1354 } 1355 1356 tg->bps[READ] = v[0]; 1357 tg->bps[WRITE] = v[1]; 1358 tg->iops[READ] = v[2]; 1359 tg->iops[WRITE] = v[3]; 1360 1361 tg_conf_updated(tg); 1362 ret = 0; 1363 out_finish: 1364 blkg_conf_finish(&ctx); 1365 return ret ?: nbytes; 1366 } 1367 1368 static struct cftype throtl_files[] = { 1369 { 1370 .name = "max", 1371 .flags = CFTYPE_NOT_ON_ROOT, 1372 .seq_show = tg_print_max, 1373 .write = tg_set_max, 1374 }, 1375 { } /* terminate */ 1376 }; 1377 1378 static void throtl_shutdown_wq(struct request_queue *q) 1379 { 1380 struct throtl_data *td = q->td; 1381 1382 cancel_work_sync(&td->dispatch_work); 1383 } 1384 1385 static struct blkcg_policy blkcg_policy_throtl = { 1386 .dfl_cftypes = throtl_files, 1387 .legacy_cftypes = throtl_legacy_files, 1388 1389 .pd_alloc_fn = throtl_pd_alloc, 1390 .pd_init_fn = throtl_pd_init, 1391 .pd_online_fn = throtl_pd_online, 1392 .pd_free_fn = throtl_pd_free, 1393 }; 1394 1395 bool blk_throtl_bio(struct request_queue *q, struct blkcg_gq *blkg, 1396 struct bio *bio) 1397 { 1398 struct throtl_qnode *qn = NULL; 1399 struct throtl_grp *tg = blkg_to_tg(blkg ?: q->root_blkg); 1400 struct throtl_service_queue *sq; 1401 bool rw = bio_data_dir(bio); 1402 bool throttled = false; 1403 1404 WARN_ON_ONCE(!rcu_read_lock_held()); 1405 1406 /* see throtl_charge_bio() */ 1407 if ((bio->bi_rw & REQ_THROTTLED) || !tg->has_rules[rw]) 1408 goto out; 1409 1410 spin_lock_irq(q->queue_lock); 1411 1412 if (unlikely(blk_queue_bypass(q))) 1413 goto out_unlock; 1414 1415 sq = &tg->service_queue; 1416 1417 while (true) { 1418 /* throtl is FIFO - if bios are already queued, should queue */ 1419 if (sq->nr_queued[rw]) 1420 break; 1421 1422 /* if above limits, break to queue */ 1423 if (!tg_may_dispatch(tg, bio, NULL)) 1424 break; 1425 1426 /* within limits, let's charge and dispatch directly */ 1427 throtl_charge_bio(tg, bio); 1428 1429 /* 1430 * We need to trim slice even when bios are not being queued 1431 * otherwise it might happen that a bio is not queued for 1432 * a long time and slice keeps on extending and trim is not 1433 * called for a long time. Now if limits are reduced suddenly 1434 * we take into account all the IO dispatched so far at new 1435 * low rate and * newly queued IO gets a really long dispatch 1436 * time. 1437 * 1438 * So keep on trimming slice even if bio is not queued. 1439 */ 1440 throtl_trim_slice(tg, rw); 1441 1442 /* 1443 * @bio passed through this layer without being throttled. 1444 * Climb up the ladder. If we''re already at the top, it 1445 * can be executed directly. 1446 */ 1447 qn = &tg->qnode_on_parent[rw]; 1448 sq = sq->parent_sq; 1449 tg = sq_to_tg(sq); 1450 if (!tg) 1451 goto out_unlock; 1452 } 1453 1454 /* out-of-limit, queue to @tg */ 1455 throtl_log(sq, "[%c] bio. bdisp=%llu sz=%u bps=%llu iodisp=%u iops=%u queued=%d/%d", 1456 rw == READ ? 'R' : 'W', 1457 tg->bytes_disp[rw], bio->bi_iter.bi_size, tg->bps[rw], 1458 tg->io_disp[rw], tg->iops[rw], 1459 sq->nr_queued[READ], sq->nr_queued[WRITE]); 1460 1461 bio_associate_current(bio); 1462 tg->td->nr_queued[rw]++; 1463 throtl_add_bio_tg(bio, qn, tg); 1464 throttled = true; 1465 1466 /* 1467 * Update @tg's dispatch time and force schedule dispatch if @tg 1468 * was empty before @bio. The forced scheduling isn't likely to 1469 * cause undue delay as @bio is likely to be dispatched directly if 1470 * its @tg's disptime is not in the future. 1471 */ 1472 if (tg->flags & THROTL_TG_WAS_EMPTY) { 1473 tg_update_disptime(tg); 1474 throtl_schedule_next_dispatch(tg->service_queue.parent_sq, true); 1475 } 1476 1477 out_unlock: 1478 spin_unlock_irq(q->queue_lock); 1479 out: 1480 /* 1481 * As multiple blk-throtls may stack in the same issue path, we 1482 * don't want bios to leave with the flag set. Clear the flag if 1483 * being issued. 1484 */ 1485 if (!throttled) 1486 bio->bi_rw &= ~REQ_THROTTLED; 1487 return throttled; 1488 } 1489 1490 /* 1491 * Dispatch all bios from all children tg's queued on @parent_sq. On 1492 * return, @parent_sq is guaranteed to not have any active children tg's 1493 * and all bios from previously active tg's are on @parent_sq->bio_lists[]. 1494 */ 1495 static void tg_drain_bios(struct throtl_service_queue *parent_sq) 1496 { 1497 struct throtl_grp *tg; 1498 1499 while ((tg = throtl_rb_first(parent_sq))) { 1500 struct throtl_service_queue *sq = &tg->service_queue; 1501 struct bio *bio; 1502 1503 throtl_dequeue_tg(tg); 1504 1505 while ((bio = throtl_peek_queued(&sq->queued[READ]))) 1506 tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1507 while ((bio = throtl_peek_queued(&sq->queued[WRITE]))) 1508 tg_dispatch_one_bio(tg, bio_data_dir(bio)); 1509 } 1510 } 1511 1512 /** 1513 * blk_throtl_drain - drain throttled bios 1514 * @q: request_queue to drain throttled bios for 1515 * 1516 * Dispatch all currently throttled bios on @q through ->make_request_fn(). 1517 */ 1518 void blk_throtl_drain(struct request_queue *q) 1519 __releases(q->queue_lock) __acquires(q->queue_lock) 1520 { 1521 struct throtl_data *td = q->td; 1522 struct blkcg_gq *blkg; 1523 struct cgroup_subsys_state *pos_css; 1524 struct bio *bio; 1525 int rw; 1526 1527 queue_lockdep_assert_held(q); 1528 rcu_read_lock(); 1529 1530 /* 1531 * Drain each tg while doing post-order walk on the blkg tree, so 1532 * that all bios are propagated to td->service_queue. It'd be 1533 * better to walk service_queue tree directly but blkg walk is 1534 * easier. 1535 */ 1536 blkg_for_each_descendant_post(blkg, pos_css, td->queue->root_blkg) 1537 tg_drain_bios(&blkg_to_tg(blkg)->service_queue); 1538 1539 /* finally, transfer bios from top-level tg's into the td */ 1540 tg_drain_bios(&td->service_queue); 1541 1542 rcu_read_unlock(); 1543 spin_unlock_irq(q->queue_lock); 1544 1545 /* all bios now should be in td->service_queue, issue them */ 1546 for (rw = READ; rw <= WRITE; rw++) 1547 while ((bio = throtl_pop_queued(&td->service_queue.queued[rw], 1548 NULL))) 1549 generic_make_request(bio); 1550 1551 spin_lock_irq(q->queue_lock); 1552 } 1553 1554 int blk_throtl_init(struct request_queue *q) 1555 { 1556 struct throtl_data *td; 1557 int ret; 1558 1559 td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node); 1560 if (!td) 1561 return -ENOMEM; 1562 1563 INIT_WORK(&td->dispatch_work, blk_throtl_dispatch_work_fn); 1564 throtl_service_queue_init(&td->service_queue); 1565 1566 q->td = td; 1567 td->queue = q; 1568 1569 /* activate policy */ 1570 ret = blkcg_activate_policy(q, &blkcg_policy_throtl); 1571 if (ret) 1572 kfree(td); 1573 return ret; 1574 } 1575 1576 void blk_throtl_exit(struct request_queue *q) 1577 { 1578 BUG_ON(!q->td); 1579 throtl_shutdown_wq(q); 1580 blkcg_deactivate_policy(q, &blkcg_policy_throtl); 1581 kfree(q->td); 1582 } 1583 1584 static int __init throtl_init(void) 1585 { 1586 kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0); 1587 if (!kthrotld_workqueue) 1588 panic("Failed to create kthrotld\n"); 1589 1590 return blkcg_policy_register(&blkcg_policy_throtl); 1591 } 1592 1593 module_init(throtl_init); 1594