/*
 * Interface for controlling IO bandwidth on a request queue
 *
 * Copyright (C) 2010 Vivek Goyal <vgoyal@redhat.com>
 */

#include <linux/module.h>
#include <linux/slab.h>
#include <linux/blkdev.h>
#include <linux/bio.h>
#include <linux/blktrace_api.h>
#include "blk-cgroup.h"
#include "blk.h"

/* Max dispatch from a group in 1 round */
static int throtl_grp_quantum = 8;

/* Total max dispatch from all groups in one round */
static int throtl_quantum = 32;

/* Throttling is performed over a 100ms slice and after that the slice is renewed */
static unsigned long throtl_slice = HZ/10;	/* 100 ms */

static struct blkcg_policy blkcg_policy_throtl;

/* A workqueue to queue throttle related work */
static struct workqueue_struct *kthrotld_workqueue;
static void throtl_schedule_delayed_work(struct throtl_data *td,
					 unsigned long delay);

struct throtl_rb_root {
	struct rb_root rb;
	struct rb_node *left;
	unsigned int count;
	unsigned long min_disptime;
};

#define THROTL_RB_ROOT	(struct throtl_rb_root) { .rb = RB_ROOT, .left = NULL, \
			.count = 0, .min_disptime = 0}

#define rb_entry_tg(node)	rb_entry((node), struct throtl_grp, rb_node)

/* Per-cpu group stats */
struct tg_stats_cpu {
	/* total bytes transferred */
	struct blkg_rwstat		service_bytes;
	/* total IOs serviced, post merge */
	struct blkg_rwstat		serviced;
};

struct throtl_grp {
	/* must be the first member */
	struct blkg_policy_data pd;

	/* active throtl group service_tree member */
	struct rb_node rb_node;

	/*
	 * Dispatch time in jiffies. This is the estimated time when the
	 * group will be unthrottled and ready to dispatch more bios. It is
	 * used as the key to sort active groups in the service tree.
	 */
	unsigned long disptime;

	unsigned int flags;

	/* Two lists for READ and WRITE */
	struct bio_list bio_lists[2];

	/* Number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/* bytes per second rate limits */
	uint64_t bps[2];

	/* IOPS limits */
	unsigned int iops[2];

	/* Number of bytes dispatched in current slice */
	uint64_t bytes_disp[2];
	/* Number of bio's dispatched in current slice */
	unsigned int io_disp[2];

	/* When did we start a new slice */
	unsigned long slice_start[2];
	unsigned long slice_end[2];

	/* Some throttle limits got updated for the group */
	int limits_changed;

	/* Per cpu stats pointer */
	struct tg_stats_cpu __percpu *stats_cpu;

	/* List of tgs waiting for per cpu stats memory to be allocated */
	struct list_head stats_alloc_node;
};

struct throtl_data
{
	/* service tree for active throtl groups */
	struct throtl_rb_root tg_service_tree;

	struct request_queue *queue;

	/* Total number of queued bios on READ and WRITE lists */
	unsigned int nr_queued[2];

	/*
	 * number of total undestroyed groups
	 */
	unsigned int nr_undestroyed_grps;

	/* Work for dispatching throttled bios */
	struct delayed_work throtl_work;

	int limits_changed;
};

/* list and work item to allocate percpu group stats */
static DEFINE_SPINLOCK(tg_stats_alloc_lock);
static LIST_HEAD(tg_stats_alloc_list);

static void tg_stats_alloc_fn(struct work_struct *);
static DECLARE_DELAYED_WORK(tg_stats_alloc_work, tg_stats_alloc_fn);

static inline struct throtl_grp *pd_to_tg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct throtl_grp, pd) : NULL;
}

static inline struct throtl_grp *blkg_to_tg(struct blkcg_gq *blkg)
{
	return pd_to_tg(blkg_to_pd(blkg, &blkcg_policy_throtl));
}

static inline struct blkcg_gq *tg_to_blkg(struct throtl_grp *tg)
{
	return pd_to_blkg(&tg->pd);
}

static inline struct throtl_grp *td_root_tg(struct throtl_data *td)
{
	return blkg_to_tg(td->queue->root_blkg);
}

enum tg_state_flags {
	THROTL_TG_FLAG_on_rr = 0,	/* on round-robin busy list */
};

#define THROTL_TG_FNS(name)						\
static inline void throtl_mark_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags |= (1 << THROTL_TG_FLAG_##name);			\
}									\
static inline void throtl_clear_tg_##name(struct throtl_grp *tg)	\
{									\
	(tg)->flags &= ~(1 << THROTL_TG_FLAG_##name);			\
}									\
static inline int throtl_tg_##name(const struct throtl_grp *tg)	\
{									\
	return ((tg)->flags & (1 << THROTL_TG_FLAG_##name)) != 0;	\
}
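
/*
 * THROTL_TG_FNS(on_rr) below generates throtl_mark_tg_on_rr(),
 * throtl_clear_tg_on_rr() and throtl_tg_on_rr(), which set, clear and
 * test the THROTL_TG_FLAG_on_rr bit in tg->flags respectively.
 */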
THROTL_TG_FNS(on_rr);

#define throtl_log_tg(td, tg, fmt, args...)	do {			\
	char __pbuf[128];						\
									\
	blkg_path(tg_to_blkg(tg), __pbuf, sizeof(__pbuf));		\
	blk_add_trace_msg((td)->queue, "throtl %s " fmt, __pbuf, ##args); \
} while (0)

#define throtl_log(td, fmt, args...)	\
	blk_add_trace_msg((td)->queue, "throtl " fmt, ##args)

static inline unsigned int total_nr_queued(struct throtl_data *td)
{
	return td->nr_queued[0] + td->nr_queued[1];
}

/*
 * Worker for allocating per cpu stat for tgs. This is scheduled on the
 * system_wq once there are some groups on the alloc_list waiting for
 * allocation.
 */
static void tg_stats_alloc_fn(struct work_struct *work)
{
	static struct tg_stats_cpu *stats_cpu;	/* this fn is non-reentrant */
	struct delayed_work *dwork = to_delayed_work(work);
	bool empty = false;

alloc_stats:
	if (!stats_cpu) {
		stats_cpu = alloc_percpu(struct tg_stats_cpu);
		if (!stats_cpu) {
			/* allocation failed, try again after some time */
			schedule_delayed_work(dwork, msecs_to_jiffies(10));
			return;
		}
	}

	spin_lock_irq(&tg_stats_alloc_lock);

	if (!list_empty(&tg_stats_alloc_list)) {
		struct throtl_grp *tg = list_first_entry(&tg_stats_alloc_list,
							 struct throtl_grp,
							 stats_alloc_node);
		swap(tg->stats_cpu, stats_cpu);
		list_del_init(&tg->stats_alloc_node);
	}

	empty = list_empty(&tg_stats_alloc_list);
	spin_unlock_irq(&tg_stats_alloc_lock);
	if (!empty)
		goto alloc_stats;
}
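
/*
 * blkcg policy callbacks: throtl_pd_init(), throtl_pd_exit() and
 * throtl_pd_reset_stats() below are hooked up through blkcg_policy_throtl
 * and run when a blkcg_gq gains or loses this policy's per-group data.
 * Percpu stats cannot be allocated from the IO path, so init only queues
 * the group on tg_stats_alloc_list and lets the worker above do the
 * allocation.
 */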
static void throtl_pd_init(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	unsigned long flags;

	RB_CLEAR_NODE(&tg->rb_node);
	bio_list_init(&tg->bio_lists[0]);
	bio_list_init(&tg->bio_lists[1]);
	tg->limits_changed = false;

	tg->bps[READ] = -1;
	tg->bps[WRITE] = -1;
	tg->iops[READ] = -1;
	tg->iops[WRITE] = -1;

	/*
	 * Ugh... We need to perform per-cpu allocation for tg->stats_cpu
	 * but percpu allocator can't be called from IO path.  Queue tg on
	 * tg_stats_alloc_list and allocate from work item.
	 */
	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
	list_add(&tg->stats_alloc_node, &tg_stats_alloc_list);
	schedule_delayed_work(&tg_stats_alloc_work, 0);
	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);
}

static void throtl_pd_exit(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	unsigned long flags;

	spin_lock_irqsave(&tg_stats_alloc_lock, flags);
	list_del_init(&tg->stats_alloc_node);
	spin_unlock_irqrestore(&tg_stats_alloc_lock, flags);

	free_percpu(tg->stats_cpu);
}

static void throtl_pd_reset_stats(struct blkcg_gq *blkg)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	int cpu;

	if (tg->stats_cpu == NULL)
		return;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		blkg_rwstat_reset(&sc->service_bytes);
		blkg_rwstat_reset(&sc->serviced);
	}
}

static struct throtl_grp *throtl_lookup_tg(struct throtl_data *td,
					   struct blkcg *blkcg)
{
	/*
	 * This is the common case when there are no blkcgs. Avoid lookup
	 * in this case
	 */
	if (blkcg == &blkcg_root)
		return td_root_tg(td);

	return blkg_to_tg(blkg_lookup(blkcg, td->queue));
}

static struct throtl_grp *throtl_lookup_create_tg(struct throtl_data *td,
						  struct blkcg *blkcg)
{
	struct request_queue *q = td->queue;
	struct throtl_grp *tg = NULL;

	/*
	 * This is the common case when there are no blkcgs. Avoid lookup
	 * in this case
	 */
	if (blkcg == &blkcg_root) {
		tg = td_root_tg(td);
	} else {
		struct blkcg_gq *blkg;

		blkg = blkg_lookup_create(blkcg, q);

		/* if %NULL and @q is alive, fall back to root_tg */
		if (!IS_ERR(blkg))
			tg = blkg_to_tg(blkg);
		else if (!blk_queue_dying(q))
			tg = td_root_tg(td);
	}

	return tg;
}

static struct throtl_grp *throtl_rb_first(struct throtl_rb_root *root)
{
	/* Service tree is empty */
	if (!root->count)
		return NULL;

	if (!root->left)
		root->left = rb_first(&root->rb);

	if (root->left)
		return rb_entry_tg(root->left);

	return NULL;
}

static void rb_erase_init(struct rb_node *n, struct rb_root *root)
{
	rb_erase(n, root);
	RB_CLEAR_NODE(n);
}

static void throtl_rb_erase(struct rb_node *n, struct throtl_rb_root *root)
{
	if (root->left == n)
		root->left = NULL;
	rb_erase_init(n, &root->rb);
	--root->count;
}

static void update_min_dispatch_time(struct throtl_rb_root *st)
{
	struct throtl_grp *tg;

	tg = throtl_rb_first(st);
	if (!tg)
		return;

	st->min_disptime = tg->disptime;
}
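
/*
 * The service tree is an rbtree of active groups keyed by tg->disptime:
 * groups that may dispatch earliest sort to the left.  ->left caches the
 * leftmost node so the next group to dispatch is found without walking
 * the tree, and ->min_disptime mirrors its key for scheduling the
 * dispatch work.  For example, a group whose disptime lies 20 jiffies in
 * the future is inserted to the right of one that is due now.
 */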
static void
tg_service_tree_add(struct throtl_rb_root *st, struct throtl_grp *tg)
{
	struct rb_node **node = &st->rb.rb_node;
	struct rb_node *parent = NULL;
	struct throtl_grp *__tg;
	unsigned long key = tg->disptime;
	int left = 1;

	while (*node != NULL) {
		parent = *node;
		__tg = rb_entry_tg(parent);

		if (time_before(key, __tg->disptime))
			node = &parent->rb_left;
		else {
			node = &parent->rb_right;
			left = 0;
		}
	}

	if (left)
		st->left = &tg->rb_node;

	rb_link_node(&tg->rb_node, parent, node);
	rb_insert_color(&tg->rb_node, &st->rb);
}

static void __throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	tg_service_tree_add(st, tg);
	throtl_mark_tg_on_rr(tg);
	st->count++;
}

static void throtl_enqueue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (!throtl_tg_on_rr(tg))
		__throtl_enqueue_tg(td, tg);
}

static void __throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	throtl_rb_erase(&tg->rb_node, &td->tg_service_tree);
	throtl_clear_tg_on_rr(tg);
}

static void throtl_dequeue_tg(struct throtl_data *td, struct throtl_grp *tg)
{
	if (throtl_tg_on_rr(tg))
		__throtl_dequeue_tg(td, tg);
}

static void throtl_schedule_next_dispatch(struct throtl_data *td)
{
	struct throtl_rb_root *st = &td->tg_service_tree;

	/*
	 * If there are more bios pending, schedule more work.
	 */
	if (!total_nr_queued(td))
		return;

	BUG_ON(!st->count);

	update_min_dispatch_time(st);

	if (time_before_eq(st->min_disptime, jiffies))
		throtl_schedule_delayed_work(td, 0);
	else
		throtl_schedule_delayed_work(td, (st->min_disptime - jiffies));
}

static inline void
throtl_start_new_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	tg->bytes_disp[rw] = 0;
	tg->io_disp[rw] = 0;
	tg->slice_start[rw] = jiffies;
	tg->slice_end[rw] = jiffies + throtl_slice;
	throtl_log_tg(td, tg, "[%c] new slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

static inline void throtl_set_slice_end(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
}

static inline void throtl_extend_slice(struct throtl_data *td,
		struct throtl_grp *tg, bool rw, unsigned long jiffy_end)
{
	tg->slice_end[rw] = roundup(jiffy_end, throtl_slice);
	throtl_log_tg(td, tg, "[%c] extend slice start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', tg->slice_start[rw],
			tg->slice_end[rw], jiffies);
}

/* Determine if previously allocated or extended slice is complete or not */
static bool
throtl_slice_used(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	if (time_in_range(jiffies, tg->slice_start[rw], tg->slice_end[rw]))
		return 0;

	return 1;
}
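
/*
 * Illustration of the trim arithmetic below (assuming HZ=1000, so
 * throtl_slice is 100 jiffies): with bps[rw] = 1048576 and two full
 * slices elapsed, bytes_trim = 1048576 * 100 * 2 / 1000 = 209715 bytes
 * is subtracted from bytes_disp[rw] and slice_start[rw] advances by
 * 200 jiffies, so only consumption beyond the configured rate carries
 * over into the renewed slice.
 */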
/* Trim the used slices and adjust slice start accordingly */
static inline void
throtl_trim_slice(struct throtl_data *td, struct throtl_grp *tg, bool rw)
{
	unsigned long nr_slices, time_elapsed, io_trim;
	u64 bytes_trim, tmp;

	BUG_ON(time_before(tg->slice_end[rw], tg->slice_start[rw]));

	/*
	 * If bps are unlimited (-1), then the time slice doesn't get
	 * renewed. Don't try to trim the slice if the slice is used up.
	 * A new slice will start when appropriate.
	 */
	if (throtl_slice_used(td, tg, rw))
		return;

	/*
	 * A bio has been dispatched. Also adjust slice_end. It might happen
	 * that initially the cgroup limit was very low resulting in a high
	 * slice_end, but later the limit was bumped up and the bio was
	 * dispatched sooner, then we need to reduce slice_end. A high bogus
	 * slice_end is bad because it does not allow a new slice to start.
	 */

	throtl_set_slice_end(td, tg, rw, jiffies + throtl_slice);

	time_elapsed = jiffies - tg->slice_start[rw];

	nr_slices = time_elapsed / throtl_slice;

	if (!nr_slices)
		return;
	tmp = tg->bps[rw] * throtl_slice * nr_slices;
	do_div(tmp, HZ);
	bytes_trim = tmp;

	io_trim = (tg->iops[rw] * throtl_slice * nr_slices)/HZ;

	if (!bytes_trim && !io_trim)
		return;

	if (tg->bytes_disp[rw] >= bytes_trim)
		tg->bytes_disp[rw] -= bytes_trim;
	else
		tg->bytes_disp[rw] = 0;

	if (tg->io_disp[rw] >= io_trim)
		tg->io_disp[rw] -= io_trim;
	else
		tg->io_disp[rw] = 0;

	tg->slice_start[rw] += nr_slices * throtl_slice;

	throtl_log_tg(td, tg, "[%c] trim slice nr=%lu bytes=%llu io=%lu"
			" start=%lu end=%lu jiffies=%lu",
			rw == READ ? 'R' : 'W', nr_slices, bytes_trim, io_trim,
			tg->slice_start[rw], tg->slice_end[rw], jiffies);
}

static bool tg_with_in_iops_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned int io_allowed;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;
	u64 tmp;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	/*
	 * jiffy_elapsed_rnd should not be a large value: the minimum iops
	 * is 1, so at most the elapsed time is equivalent to 1 second, as
	 * we allow a dispatch after 1 second and by then the slice should
	 * have been trimmed.
	 */

	tmp = (u64)tg->iops[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);

	if (tmp > UINT_MAX)
		io_allowed = UINT_MAX;
	else
		io_allowed = tmp;

	if (tg->io_disp[rw] + 1 <= io_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	jiffy_wait = ((tg->io_disp[rw] + 1) * HZ)/tg->iops[rw] + 1;

	if (jiffy_wait > jiffy_elapsed)
		jiffy_wait = jiffy_wait - jiffy_elapsed;
	else
		jiffy_wait = 1;

	if (wait)
		*wait = jiffy_wait;
	return 0;
}
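
/*
 * Same check for the bandwidth limit.  Worked example: with
 * bps[rw] = 1048576 and the queued bio exceeding bytes_allowed by
 * extra_bytes = 524288, the computed wait is 524288 * HZ / 1048576 =
 * HZ/2 jiffies, i.e. the bio may be dispatched roughly half a second
 * later (plus the slice-rounding correction added below).
 */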
static bool tg_with_in_bps_limit(struct throtl_data *td, struct throtl_grp *tg,
		struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	u64 bytes_allowed, extra_bytes, tmp;
	unsigned long jiffy_elapsed, jiffy_wait, jiffy_elapsed_rnd;

	jiffy_elapsed = jiffy_elapsed_rnd = jiffies - tg->slice_start[rw];

	/* Slice has just started. Consider one slice interval */
	if (!jiffy_elapsed)
		jiffy_elapsed_rnd = throtl_slice;

	jiffy_elapsed_rnd = roundup(jiffy_elapsed_rnd, throtl_slice);

	tmp = tg->bps[rw] * jiffy_elapsed_rnd;
	do_div(tmp, HZ);
	bytes_allowed = tmp;

	if (tg->bytes_disp[rw] + bio->bi_size <= bytes_allowed) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/* Calc approx time to dispatch */
	extra_bytes = tg->bytes_disp[rw] + bio->bi_size - bytes_allowed;
	jiffy_wait = div64_u64(extra_bytes * HZ, tg->bps[rw]);

	if (!jiffy_wait)
		jiffy_wait = 1;

	/*
	 * This wait time is without taking into consideration the rounding
	 * up we did. Add that time also.
	 */
	jiffy_wait = jiffy_wait + (jiffy_elapsed_rnd - jiffy_elapsed);
	if (wait)
		*wait = jiffy_wait;
	return 0;
}

static bool tg_no_rule_group(struct throtl_grp *tg, bool rw) {
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1)
		return 1;
	return 0;
}

/*
 * Returns whether one can dispatch a bio or not. Also returns the approx
 * number of jiffies to wait before this bio is within the IO rate and can
 * be dispatched.
 */
static bool tg_may_dispatch(struct throtl_data *td, struct throtl_grp *tg,
				struct bio *bio, unsigned long *wait)
{
	bool rw = bio_data_dir(bio);
	unsigned long bps_wait = 0, iops_wait = 0, max_wait = 0;

	/*
	 * Currently the whole state machine of the group depends on the
	 * first bio queued in the group bio list. So one should not be
	 * calling this function with a different bio if there are other
	 * bios queued.
	 */
	BUG_ON(tg->nr_queued[rw] && bio != bio_list_peek(&tg->bio_lists[rw]));

	/* If tg->bps = -1, then BW is unlimited */
	if (tg->bps[rw] == -1 && tg->iops[rw] == -1) {
		if (wait)
			*wait = 0;
		return 1;
	}

	/*
	 * If the previous slice expired, start a new one, otherwise
	 * renew/extend the existing slice to make sure it is at least
	 * throtl_slice interval long since now.
	 */
	if (throtl_slice_used(td, tg, rw))
		throtl_start_new_slice(td, tg, rw);
	else {
		if (time_before(tg->slice_end[rw], jiffies + throtl_slice))
			throtl_extend_slice(td, tg, rw, jiffies + throtl_slice);
	}

	if (tg_with_in_bps_limit(td, tg, bio, &bps_wait)
	    && tg_with_in_iops_limit(td, tg, bio, &iops_wait)) {
		if (wait)
			*wait = 0;
		return 1;
	}

	max_wait = max(bps_wait, iops_wait);

	if (wait)
		*wait = max_wait;

	if (time_before(tg->slice_end[rw], jiffies + max_wait))
		throtl_extend_slice(td, tg, rw, jiffies + max_wait);

	return 0;
}

static void throtl_update_dispatch_stats(struct blkcg_gq *blkg, u64 bytes,
					 int rw)
{
	struct throtl_grp *tg = blkg_to_tg(blkg);
	struct tg_stats_cpu *stats_cpu;
	unsigned long flags;

	/* If per cpu stats are not allocated yet, don't do any accounting. */
	if (tg->stats_cpu == NULL)
		return;

	/*
	 * Disabling interrupts to provide mutual exclusion between two
	 * writes on the same cpu. It probably is not needed for 64bit. Not
	 * optimizing that case yet.
	 */
	local_irq_save(flags);

	stats_cpu = this_cpu_ptr(tg->stats_cpu);

	blkg_rwstat_add(&stats_cpu->serviced, rw, 1);
	blkg_rwstat_add(&stats_cpu->service_bytes, rw, bytes);

	local_irq_restore(flags);
}
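
/*
 * Charging happens against the current slice: bytes_disp[] and io_disp[]
 * accumulate what has been dispatched so far, and the tg_with_in_*_limit()
 * checks above compare those counters against the allowance for the
 * elapsed slice time.  The per-cpu dispatch stats are updated as well so
 * the cgroup stat files can report totals.
 */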
static void throtl_charge_bio(struct throtl_grp *tg, struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	/* Charge the bio to the group */
	tg->bytes_disp[rw] += bio->bi_size;
	tg->io_disp[rw]++;

	throtl_update_dispatch_stats(tg_to_blkg(tg), bio->bi_size, bio->bi_rw);
}

static void throtl_add_bio_tg(struct throtl_data *td, struct throtl_grp *tg,
			struct bio *bio)
{
	bool rw = bio_data_dir(bio);

	bio_list_add(&tg->bio_lists[rw], bio);
	/* Take a bio reference on tg */
	blkg_get(tg_to_blkg(tg));
	tg->nr_queued[rw]++;
	td->nr_queued[rw]++;
	throtl_enqueue_tg(td, tg);
}

static void tg_update_disptime(struct throtl_data *td, struct throtl_grp *tg)
{
	unsigned long read_wait = -1, write_wait = -1, min_wait = -1, disptime;
	struct bio *bio;

	if ((bio = bio_list_peek(&tg->bio_lists[READ])))
		tg_may_dispatch(td, tg, bio, &read_wait);

	if ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
		tg_may_dispatch(td, tg, bio, &write_wait);

	min_wait = min(read_wait, write_wait);
	disptime = jiffies + min_wait;

	/* Update dispatch time */
	throtl_dequeue_tg(td, tg);
	tg->disptime = disptime;
	throtl_enqueue_tg(td, tg);
}

static void tg_dispatch_one_bio(struct throtl_data *td, struct throtl_grp *tg,
				bool rw, struct bio_list *bl)
{
	struct bio *bio;

	bio = bio_list_pop(&tg->bio_lists[rw]);
	tg->nr_queued[rw]--;
	/* Drop bio reference on blkg */
	blkg_put(tg_to_blkg(tg));

	BUG_ON(td->nr_queued[rw] <= 0);
	td->nr_queued[rw]--;

	throtl_charge_bio(tg, bio);
	bio_list_add(bl, bio);
	bio->bi_rw |= REQ_THROTTLED;

	throtl_trim_slice(td, tg, rw);
}
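
/*
 * Per-group dispatch is split roughly 75%/25% between reads and writes:
 * with the default throtl_grp_quantum of 8, at most 6 reads and 2 writes
 * are moved to the dispatch list in one call below, as long as
 * tg_may_dispatch() keeps agreeing.
 */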
static int throtl_dispatch_tg(struct throtl_data *td, struct throtl_grp *tg,
				struct bio_list *bl)
{
	unsigned int nr_reads = 0, nr_writes = 0;
	unsigned int max_nr_reads = throtl_grp_quantum*3/4;
	unsigned int max_nr_writes = throtl_grp_quantum - max_nr_reads;
	struct bio *bio;

	/* Try to dispatch 75% READS and 25% WRITES */

	while ((bio = bio_list_peek(&tg->bio_lists[READ]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_reads++;

		if (nr_reads >= max_nr_reads)
			break;
	}

	while ((bio = bio_list_peek(&tg->bio_lists[WRITE]))
		&& tg_may_dispatch(td, tg, bio, NULL)) {

		tg_dispatch_one_bio(td, tg, bio_data_dir(bio), bl);
		nr_writes++;

		if (nr_writes >= max_nr_writes)
			break;
	}

	return nr_reads + nr_writes;
}

static int throtl_select_dispatch(struct throtl_data *td, struct bio_list *bl)
{
	unsigned int nr_disp = 0;
	struct throtl_grp *tg;
	struct throtl_rb_root *st = &td->tg_service_tree;

	while (1) {
		tg = throtl_rb_first(st);

		if (!tg)
			break;

		if (time_before(jiffies, tg->disptime))
			break;

		throtl_dequeue_tg(td, tg);

		nr_disp += throtl_dispatch_tg(td, tg, bl);

		if (tg->nr_queued[0] || tg->nr_queued[1]) {
			tg_update_disptime(td, tg);
			throtl_enqueue_tg(td, tg);
		}

		if (nr_disp >= throtl_quantum)
			break;
	}

	return nr_disp;
}

static void throtl_process_limit_change(struct throtl_data *td)
{
	struct request_queue *q = td->queue;
	struct blkcg_gq *blkg, *n;

	if (!td->limits_changed)
		return;

	xchg(&td->limits_changed, false);

	throtl_log(td, "limits changed");

	list_for_each_entry_safe(blkg, n, &q->blkg_list, q_node) {
		struct throtl_grp *tg = blkg_to_tg(blkg);

		if (!tg->limits_changed)
			continue;

		if (!xchg(&tg->limits_changed, false))
			continue;

		throtl_log_tg(td, tg, "limit change rbps=%llu wbps=%llu"
			" riops=%u wiops=%u", tg->bps[READ], tg->bps[WRITE],
			tg->iops[READ], tg->iops[WRITE]);

		/*
		 * Restart the slices for both READ and WRITE. It might
		 * happen that a group's limits are dropped suddenly and we
		 * don't want to account recently dispatched IO at the new
		 * low rate.
		 */
		throtl_start_new_slice(td, tg, 0);
		throtl_start_new_slice(td, tg, 1);

		if (throtl_tg_on_rr(tg))
			tg_update_disptime(td, tg);
	}
}
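
/*
 * The dispatch work below runs from kthrotld_workqueue (see
 * throtl_schedule_delayed_work()): it applies any pending limit changes,
 * moves up to throtl_quantum bios onto an on-stack list under the queue
 * lock, reschedules itself for the next earliest disptime, and then
 * issues the collected bios outside the lock under a block plug.
 */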
/* Dispatch throttled bios. Should be called without queue lock held. */
static int throtl_dispatch(struct request_queue *q)
{
	struct throtl_data *td = q->td;
	unsigned int nr_disp = 0;
	struct bio_list bio_list_on_stack;
	struct bio *bio;
	struct blk_plug plug;

	spin_lock_irq(q->queue_lock);

	throtl_process_limit_change(td);

	if (!total_nr_queued(td))
		goto out;

	bio_list_init(&bio_list_on_stack);

	throtl_log(td, "dispatch nr_queued=%u read=%u write=%u",
			total_nr_queued(td), td->nr_queued[READ],
			td->nr_queued[WRITE]);

	nr_disp = throtl_select_dispatch(td, &bio_list_on_stack);

	if (nr_disp)
		throtl_log(td, "bios disp=%u", nr_disp);

	throtl_schedule_next_dispatch(td);
out:
	spin_unlock_irq(q->queue_lock);

	/*
	 * If we dispatched some requests, unplug the queue to ensure
	 * immediate dispatch.
	 */
	if (nr_disp) {
		blk_start_plug(&plug);
		while ((bio = bio_list_pop(&bio_list_on_stack)))
			generic_make_request(bio);
		blk_finish_plug(&plug);
	}
	return nr_disp;
}

void blk_throtl_work(struct work_struct *work)
{
	struct throtl_data *td = container_of(work, struct throtl_data,
					throtl_work.work);
	struct request_queue *q = td->queue;

	throtl_dispatch(q);
}

/* Call with queue lock held */
static void
throtl_schedule_delayed_work(struct throtl_data *td, unsigned long delay)
{
	struct delayed_work *dwork = &td->throtl_work;

	/* schedule work if limits changed even if no bio is queued */
	if (total_nr_queued(td) || td->limits_changed) {
		mod_delayed_work(kthrotld_workqueue, dwork, delay);
		throtl_log(td, "schedule work. delay=%lu jiffies=%lu",
				delay, jiffies);
	}
}

static u64 tg_prfill_cpu_rwstat(struct seq_file *sf,
				struct blkg_policy_data *pd, int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	struct blkg_rwstat rwstat = { }, tmp;
	int i, cpu;

	for_each_possible_cpu(cpu) {
		struct tg_stats_cpu *sc = per_cpu_ptr(tg->stats_cpu, cpu);

		tmp = blkg_rwstat_read((void *)sc + off);
		for (i = 0; i < BLKG_RWSTAT_NR; i++)
			rwstat.cnt[i] += tmp.cnt[i];
	}

	return __blkg_prfill_rwstat(sf, pd, &rwstat);
}

static int tg_print_cpu_rwstat(struct cgroup *cgrp, struct cftype *cft,
			       struct seq_file *sf)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);

	blkcg_print_blkgs(sf, blkcg, tg_prfill_cpu_rwstat, &blkcg_policy_throtl,
			  cft->private, true);
	return 0;
}

static u64 tg_prfill_conf_u64(struct seq_file *sf, struct blkg_policy_data *pd,
			      int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	u64 v = *(u64 *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static u64 tg_prfill_conf_uint(struct seq_file *sf, struct blkg_policy_data *pd,
			       int off)
{
	struct throtl_grp *tg = pd_to_tg(pd);
	unsigned int v = *(unsigned int *)((void *)tg + off);

	if (v == -1)
		return 0;
	return __blkg_prfill_u64(sf, pd, v);
}

static int tg_print_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			     struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_u64,
			  &blkcg_policy_throtl, cft->private, false);
	return 0;
}

static int tg_print_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			      struct seq_file *sf)
{
	blkcg_print_blkgs(sf, cgroup_to_blkcg(cgrp), tg_prfill_conf_uint,
			  &blkcg_policy_throtl, cft->private, false);
	return 0;
}

static int tg_set_conf(struct cgroup *cgrp, struct cftype *cft, const char *buf,
		       bool is_u64)
{
	struct blkcg *blkcg = cgroup_to_blkcg(cgrp);
	struct blkg_conf_ctx ctx;
	struct throtl_grp *tg;
	struct throtl_data *td;
	int ret;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_throtl, buf, &ctx);
	if (ret)
		return ret;

	tg = blkg_to_tg(ctx.blkg);
	td = ctx.blkg->q->td;

	if (!ctx.v)
		ctx.v = -1;

	if (is_u64)
		*(u64 *)((void *)tg + cft->private) = ctx.v;
	else
		*(unsigned int *)((void *)tg + cft->private) = ctx.v;

	/* XXX: we don't need the following deferred processing */
	xchg(&tg->limits_changed, true);
	xchg(&td->limits_changed, true);
	throtl_schedule_delayed_work(td, 0);

	blkg_conf_finish(&ctx);
	return 0;
}

static int tg_set_conf_u64(struct cgroup *cgrp, struct cftype *cft,
			   const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, true);
}

static int tg_set_conf_uint(struct cgroup *cgrp, struct cftype *cft,
			    const char *buf)
{
	return tg_set_conf(cgrp, cft, buf, false);
}
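
/*
 * The cftypes below surface the limits through the blkio cgroup (they are
 * typically visible as blkio.throttle.read_bps_device and friends).  Each
 * write takes a "major:minor value" pair parsed by blkg_conf_prep(); a
 * value of 0 is stored as -1, i.e. the limit is removed.  For example,
 * writing "8:16 1048576" to the read_bps_device file would cap reads on
 * device 8:16 at 1 MiB/s for that cgroup (device numbers illustrative).
 */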
static struct cftype throtl_files[] = {
	{
		.name = "throttle.read_bps_device",
		.private = offsetof(struct throtl_grp, bps[READ]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_bps_device",
		.private = offsetof(struct throtl_grp, bps[WRITE]),
		.read_seq_string = tg_print_conf_u64,
		.write_string = tg_set_conf_u64,
		.max_write_len = 256,
	},
	{
		.name = "throttle.read_iops_device",
		.private = offsetof(struct throtl_grp, iops[READ]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.write_iops_device",
		.private = offsetof(struct throtl_grp, iops[WRITE]),
		.read_seq_string = tg_print_conf_uint,
		.write_string = tg_set_conf_uint,
		.max_write_len = 256,
	},
	{
		.name = "throttle.io_service_bytes",
		.private = offsetof(struct tg_stats_cpu, service_bytes),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{
		.name = "throttle.io_serviced",
		.private = offsetof(struct tg_stats_cpu, serviced),
		.read_seq_string = tg_print_cpu_rwstat,
	},
	{ }	/* terminate */
};

static void throtl_shutdown_wq(struct request_queue *q)
{
	struct throtl_data *td = q->td;

	cancel_delayed_work_sync(&td->throtl_work);
}

static struct blkcg_policy blkcg_policy_throtl = {
	.pd_size		= sizeof(struct throtl_grp),
	.cftypes		= throtl_files,

	.pd_init_fn		= throtl_pd_init,
	.pd_exit_fn		= throtl_pd_exit,
	.pd_reset_stats_fn	= throtl_pd_reset_stats,
};
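
/*
 * Main hook from the block layer, summarized: a bio already marked
 * REQ_THROTTLED has been throttled once and is let through; a group with
 * no rules for the bio's direction only gets its stats updated under RCU;
 * otherwise, under the queue lock, the bio is either dispatched
 * immediately (charged and the slice trimmed) or queued on the group and
 * a dispatch is scheduled.  Returns true if the bio was throttled.
 */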
bool blk_throtl_bio(struct request_queue *q, struct bio *bio)
{
	struct throtl_data *td = q->td;
	struct throtl_grp *tg;
	bool rw = bio_data_dir(bio), update_disptime = true;
	struct blkcg *blkcg;
	bool throttled = false;

	if (bio->bi_rw & REQ_THROTTLED) {
		bio->bi_rw &= ~REQ_THROTTLED;
		goto out;
	}

	/*
	 * A throtl_grp pointer retrieved under rcu can be used to access
	 * basic fields like stats and io rates. If a group has no rules,
	 * just update the dispatch stats in lockless manner and return.
	 */
	rcu_read_lock();
	blkcg = bio_blkcg(bio);
	tg = throtl_lookup_tg(td, blkcg);
	if (tg) {
		if (tg_no_rule_group(tg, rw)) {
			throtl_update_dispatch_stats(tg_to_blkg(tg),
						     bio->bi_size, bio->bi_rw);
			goto out_unlock_rcu;
		}
	}

	/*
	 * Either the group has not been allocated yet or it is not an
	 * unlimited IO group.
	 */
	spin_lock_irq(q->queue_lock);
	tg = throtl_lookup_create_tg(td, blkcg);
	if (unlikely(!tg))
		goto out_unlock;

	if (tg->nr_queued[rw]) {
		/*
		 * There is already another bio queued in same dir. No
		 * need to update dispatch time.
		 */
		update_disptime = false;
		goto queue_bio;
	}

	/* Bio is within the rate limit of the group */
	if (tg_may_dispatch(td, tg, bio, NULL)) {
		throtl_charge_bio(tg, bio);

		/*
		 * We need to trim the slice even when bios are not being
		 * queued, otherwise it might happen that a bio is not
		 * queued for a long time and the slice keeps on extending
		 * and trim is not called for a long time. Now if limits
		 * are reduced suddenly, we take into account all the IO
		 * dispatched so far at the new low rate and newly queued
		 * IO gets a really long dispatch time.
		 *
		 * So keep on trimming the slice even if the bio is not
		 * queued.
		 */
		throtl_trim_slice(td, tg, rw);
		goto out_unlock;
	}

queue_bio:
	throtl_log_tg(td, tg, "[%c] bio. bdisp=%llu sz=%u bps=%llu"
			" iodisp=%u iops=%u queued=%d/%d",
			rw == READ ? 'R' : 'W',
			tg->bytes_disp[rw], bio->bi_size, tg->bps[rw],
			tg->io_disp[rw], tg->iops[rw],
			tg->nr_queued[READ], tg->nr_queued[WRITE]);

	bio_associate_current(bio);
	throtl_add_bio_tg(q->td, tg, bio);
	throttled = true;

	if (update_disptime) {
		tg_update_disptime(td, tg);
		throtl_schedule_next_dispatch(td);
	}

out_unlock:
	spin_unlock_irq(q->queue_lock);
out_unlock_rcu:
	rcu_read_unlock();
out:
	return throttled;
}

/**
 * blk_throtl_drain - drain throttled bios
 * @q: request_queue to drain throttled bios for
 *
 * Dispatch all currently throttled bios on @q through ->make_request_fn().
 */
void blk_throtl_drain(struct request_queue *q)
	__releases(q->queue_lock) __acquires(q->queue_lock)
{
	struct throtl_data *td = q->td;
	struct throtl_rb_root *st = &td->tg_service_tree;
	struct throtl_grp *tg;
	struct bio_list bl;
	struct bio *bio;

	queue_lockdep_assert_held(q);

	bio_list_init(&bl);

	while ((tg = throtl_rb_first(st))) {
		throtl_dequeue_tg(td, tg);

		while ((bio = bio_list_peek(&tg->bio_lists[READ])))
			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
		while ((bio = bio_list_peek(&tg->bio_lists[WRITE])))
			tg_dispatch_one_bio(td, tg, bio_data_dir(bio), &bl);
	}
	spin_unlock_irq(q->queue_lock);

	while ((bio = bio_list_pop(&bl)))
		generic_make_request(bio);

	spin_lock_irq(q->queue_lock);
}

int blk_throtl_init(struct request_queue *q)
{
	struct throtl_data *td;
	int ret;

	td = kzalloc_node(sizeof(*td), GFP_KERNEL, q->node);
	if (!td)
		return -ENOMEM;

	td->tg_service_tree = THROTL_RB_ROOT;
	td->limits_changed = false;
	INIT_DELAYED_WORK(&td->throtl_work, blk_throtl_work);

	q->td = td;
	td->queue = q;

	/* activate policy */
	ret = blkcg_activate_policy(q, &blkcg_policy_throtl);
	if (ret)
		kfree(td);
	return ret;
}

void blk_throtl_exit(struct request_queue *q)
{
	BUG_ON(!q->td);
	throtl_shutdown_wq(q);
	blkcg_deactivate_policy(q, &blkcg_policy_throtl);
	kfree(q->td);
}

static int __init throtl_init(void)
{
	kthrotld_workqueue = alloc_workqueue("kthrotld", WQ_MEM_RECLAIM, 0);
	if (!kthrotld_workqueue)
		panic("Failed to create kthrotld\n");

	return blkcg_policy_register(&blkcg_policy_throtl);
}

module_init(throtl_init);