/*
 * Block rq-qos base io controller
 *
 * This works similarly to wbt with a few exceptions:
 *
 * - It's bio based, so the latency covers the whole block layer in addition to
 *   the actual io.
 * - We will throttle all IO that comes in here if we need to.
 * - We use the mean latency over the 100ms window.  This is because writes can
 *   be particularly fast, which could give us a false sense of the impact of
 *   other workloads on our protected workload.
 * - By default there's no throttling, we set the queue_depth to UINT_MAX so
 *   that we can have as many outstanding bio's as we're allowed to.  Only at
 *   throttle time do we pay attention to the actual queue depth.
 *
 * The hierarchy works like the cpu controller does, we track the latency at
 * every configured node, and each configured node has its own independent
 * queue depth.  This means that we only care about our latency targets at the
 * peer level.  Some group at the bottom of the hierarchy isn't going to affect
 * a group at the end of some other path if we're only configured at leaf level.
 *
 * Consider the following
 *
 *                   root blkg
 *             /                     \
 *        fast (target=5ms)     slow (target=10ms)
 *         /     \                  /        \
 *       a        b          normal(15ms)   unloved
 *
 * "a" and "b" have no target, but their combined io under "fast" cannot exceed
 * an average latency of 5ms.  If it does then we will throttle the "slow"
 * group.  In the case of "normal", if it exceeds its 15ms target, we will
 * throttle "unloved", but nobody else.
 *
 * In this example "fast", "slow", and "normal" will be the only groups actually
 * accounting their io latencies.  We have to walk up the hierarchy to the root
 * on every submit and complete so we can do the appropriate stat recording and
 * adjust the queue depth of ourselves if needed.
 *
 * There are 2 ways we throttle IO.
 *
 * 1) Queue depth throttling.  As we throttle down we will adjust the maximum
 * number of IO's we're allowed to have in flight.  This starts at UINT_MAX
 * down to 1.  If the group is only ever submitting IO for itself then this is
 * the only way we throttle.
 *
 * 2) Induced delay throttling.  This is for the case that a group is
 * generating IO that has to be issued by the root cg to avoid priority
 * inversion.  So think REQ_META or REQ_SWAP.  If we are already at qd == 1 and
 * the root cg is doing a lot of work on our behalf and we're being asked to
 * scale down more, then we induce a latency at userspace return.  We
 * accumulate the total amount of time we need to be punished by doing
 *
 *   total_time += min_lat_nsec - actual_io_completion
 *
 * and then at throttle time will do
 *
 *   throttle_time = min(total_time, NSEC_PER_SEC)
 *
 * This induced delay will throttle back the activity that is generating the
 * root cg issued io's, whether that's some metadata intensive operation or the
 * group is using so much memory that it is pushing us into swap.
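 *
 * As a rough, purely illustrative example of the accounting above: with a
 * 5ms min_lat_nsec and root-issued io's on our behalf completing in 2ms, each
 * such completion adds 5ms - 2ms = 3ms to total_time.  After a few hundred of
 * them total_time exceeds one second, so the offending task sleeps for the
 * full NSEC_PER_SEC cap the next time it returns to userspace.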
 *
 * Copyright (C) 2018 Josef Bacik
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/memcontrol.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"

#define DEFAULT_SCALE_COOKIE 1000000U

static struct blkcg_policy blkcg_policy_iolatency;
struct iolatency_grp;

struct blk_iolatency {
	struct rq_qos rqos;
	struct timer_list timer;
	atomic_t enabled;
};

static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
{
	return container_of(rqos, struct blk_iolatency, rqos);
}

static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat)
{
	return atomic_read(&blkiolat->enabled) > 0;
}

struct child_latency_info {
	spinlock_t lock;

	/* Last time we adjusted the scale of everybody. */
	u64 last_scale_event;

	/* The latency that we missed. */
	u64 scale_lat;

	/* Total io's from all of our children for the last summation. */
	u64 nr_samples;

	/* The guy who actually changed the latency numbers. */
	struct iolatency_grp *scale_grp;

	/* Cookie to tell if we need to scale up or down. */
	atomic_t scale_cookie;
};

struct iolatency_grp {
	struct blkg_policy_data pd;
	struct blk_rq_stat __percpu *stats;
	struct blk_iolatency *blkiolat;
	struct rq_depth rq_depth;
	struct rq_wait rq_wait;
	atomic64_t window_start;
	atomic_t scale_cookie;
	u64 min_lat_nsec;
	u64 cur_win_nsec;

	/* total running average of our io latency. */
	u64 lat_avg;

	/* Our current number of IO's for the last summation. */
	u64 nr_samples;

	struct child_latency_info child_lat;
};

#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
/*
 * These are the constants used to fake the fixed-point moving average
 * calculation just like load average.  The call to CALC_LOAD folds
 * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg.  The sampling
 * window size is bucketed to try to approximately calculate average
 * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
 * elapse immediately.  Note, windows only elapse with IO activity.  Idle
 * periods extend the most recent window.
 */
#define BLKIOLATENCY_NR_EXP_FACTORS 5
#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
				      (BLKIOLATENCY_NR_EXP_FACTORS - 1))
static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
	2045, // exp(1/600) - 600 samples
	2039, // exp(1/240) - 240 samples
	2031, // exp(1/120) - 120 samples
	2023, // exp(1/80) - 80 samples
	2014, // exp(1/60) - 60 samples
};
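
/*
 * For example (illustrative arithmetic only): BLKIOLATENCY_EXP_BUCKET_SIZE
 * works out to 250ms, so the default 100ms window lands in bucket 0 (factor
 * 2045, the ~600 sample decay constant) while a full 1s window lands in
 * bucket 4 (factor 2014, ~60 samples), keeping the decay time near one minute
 * in both cases when windows elapse back to back.
 */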

static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
}

static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
{
	return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
}

static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
{
	return pd_to_blkg(&iolat->pd);
}

static inline bool iolatency_may_queue(struct iolatency_grp *iolat,
				       wait_queue_entry_t *wait,
				       bool first_block)
{
	struct rq_wait *rqw = &iolat->rq_wait;

	if (first_block && waitqueue_active(&rqw->wait) &&
	    rqw->wait.head.next != &wait->entry)
		return false;
	return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
}

static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
				       struct iolatency_grp *iolat,
				       spinlock_t *lock, bool issue_as_root,
				       bool use_memdelay)
	__releases(lock)
	__acquires(lock)
{
	struct rq_wait *rqw = &iolat->rq_wait;
	unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);
	DEFINE_WAIT(wait);
	bool first_block = true;

	if (use_delay)
		blkcg_schedule_throttle(rqos->q, use_memdelay);

	/*
	 * To avoid priority inversions we want to just take a slot if we are
	 * issuing as root.  If we're being killed off there's no point in
	 * delaying things, we may have been killed by OOM so throttling may
	 * make recovery take even longer, so just let the IO's through so the
	 * task can go away.
	 */
	if (issue_as_root || fatal_signal_pending(current)) {
		atomic_inc(&rqw->inflight);
		return;
	}

	if (iolatency_may_queue(iolat, &wait, first_block))
		return;

	do {
		prepare_to_wait_exclusive(&rqw->wait, &wait,
					  TASK_UNINTERRUPTIBLE);

		if (iolatency_may_queue(iolat, &wait, first_block))
			break;
		first_block = false;

		if (lock) {
			spin_unlock_irq(lock);
			io_schedule();
			spin_lock_irq(lock);
		} else {
			io_schedule();
		}
	} while (1);

	finish_wait(&rqw->wait, &wait);
}

#define SCALE_DOWN_FACTOR 2
#define SCALE_UP_FACTOR 4

static inline unsigned long scale_amount(unsigned long qd, bool up)
{
	return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL);
}
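
/*
 * As an illustrative example: with a device queue depth of 128, scaling up
 * moves things by 128 >> SCALE_UP_FACTOR = 8 per event, while scaling down
 * moves them by 128 >> SCALE_DOWN_FACTOR = 32, so we back off roughly four
 * times faster than we recover.
 */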

/*
 * We scale the qd down faster than we scale up, so we need to use this helper
 * to adjust the scale_cookie accordingly so we don't prematurely get
 * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
 *
 * Each group has its own local copy of the last scale cookie it saw, so if
 * the global scale cookie goes up or down it knows which way it needs to go
 * based on its last knowledge of it.
 */
static void scale_cookie_change(struct blk_iolatency *blkiolat,
				struct child_latency_info *lat_info,
				bool up)
{
	unsigned long qd = blk_queue_depth(blkiolat->rqos.q);
	unsigned long scale = scale_amount(qd, up);
	unsigned long old = atomic_read(&lat_info->scale_cookie);
	unsigned long max_scale = qd << 1;
	unsigned long diff = 0;

	if (old < DEFAULT_SCALE_COOKIE)
		diff = DEFAULT_SCALE_COOKIE - old;

	if (up) {
		if (scale + old > DEFAULT_SCALE_COOKIE)
			atomic_set(&lat_info->scale_cookie,
				   DEFAULT_SCALE_COOKIE);
		else if (diff > qd)
			atomic_inc(&lat_info->scale_cookie);
		else
			atomic_add(scale, &lat_info->scale_cookie);
	} else {
		/*
		 * We don't want to dig a hole so deep that it takes us hours
		 * to dig out of it.  Just enough that we don't
		 * throttle/unthrottle with jagged workloads but can still
		 * unthrottle once pressure has sufficiently dissipated.
		 */
		if (diff > qd) {
			if (diff < max_scale)
				atomic_dec(&lat_info->scale_cookie);
		} else {
			atomic_sub(scale, &lat_info->scale_cookie);
		}
	}
}
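
/*
 * Illustration with made-up numbers: with qd = 128 the cookie starts at
 * DEFAULT_SCALE_COOKIE (1000000).  Each scale-down event subtracts 32 until
 * the deficit passes qd, after which it only creeps down by 1 and stops
 * entirely once the deficit reaches 2 * qd.  Scale-up events add 8 (or just 1
 * while the deficit is still larger than qd), and once the cookie is within
 * one scale step of the default it snaps back to DEFAULT_SCALE_COOKIE.
 */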

/*
 * Change the queue depth of the iolatency_grp.  We add 1/16th of the queue
 * depth at a time when scaling up and halve the current max depth when
 * scaling down, so we don't get wild swings and hopefully dial in to a fairer
 * distribution of the overall queue depth.
 */
static void scale_change(struct iolatency_grp *iolat, bool up)
{
	unsigned long qd = blk_queue_depth(iolat->blkiolat->rqos.q);
	unsigned long scale = scale_amount(qd, up);
	unsigned long old = iolat->rq_depth.max_depth;

	if (old > qd)
		old = qd;

	if (up) {
		if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
			return;

		if (old < qd) {
			old += scale;
			old = min(old, qd);
			iolat->rq_depth.max_depth = old;
			wake_up_all(&iolat->rq_wait.wait);
		}
	} else if (old > 1) {
		old >>= 1;
		iolat->rq_depth.max_depth = max(old, 1UL);
	}
}

/* Check our parent and see if the scale cookie has changed. */
static void check_scale_change(struct iolatency_grp *iolat)
{
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	unsigned int cur_cookie;
	unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
	u64 scale_lat;
	unsigned int old;
	int direction = 0;

	if (lat_to_blkg(iolat)->parent == NULL)
		return;

	parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;
	cur_cookie = atomic_read(&lat_info->scale_cookie);
	scale_lat = READ_ONCE(lat_info->scale_lat);

	if (cur_cookie < our_cookie)
		direction = -1;
	else if (cur_cookie > our_cookie)
		direction = 1;
	else
		return;

	old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);

	/* Somebody beat us to the punch, just bail. */
	if (old != our_cookie)
		return;

	if (direction < 0 && iolat->min_lat_nsec) {
		u64 samples_thresh;

		if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
			return;

		/*
		 * Sometimes high priority groups are their own worst enemy, so
		 * instead of taking it out on some poor other group that did
		 * 5% or less of the IO's for the last summation just skip this
		 * scale down event.
		 */
		samples_thresh = lat_info->nr_samples * 5;
		samples_thresh = div64_u64(samples_thresh, 100);
		if (iolat->nr_samples <= samples_thresh)
			return;
	}

	/* We're as low as we can go. */
	if (iolat->rq_depth.max_depth == 1 && direction < 0) {
		blkcg_use_delay(lat_to_blkg(iolat));
		return;
	}

	/* We're back to the default cookie, unthrottle all the things. */
	if (cur_cookie == DEFAULT_SCALE_COOKIE) {
		blkcg_clear_delay(lat_to_blkg(iolat));
		iolat->rq_depth.max_depth = UINT_MAX;
		wake_up_all(&iolat->rq_wait.wait);
		return;
	}

	scale_change(iolat, direction > 0);
}

static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio,
				     spinlock_t *lock)
{
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	struct blkcg *blkcg;
	struct blkcg_gq *blkg;
	struct request_queue *q = rqos->q;
	bool issue_as_root = bio_issue_as_root_blkg(bio);

	if (!blk_iolatency_enabled(blkiolat))
		return;

	rcu_read_lock();
	blkcg = bio_blkcg(bio);
	bio_associate_blkcg(bio, &blkcg->css);
	blkg = blkg_lookup(blkcg, q);
	if (unlikely(!blkg)) {
		if (!lock)
			spin_lock_irq(q->queue_lock);
		blkg = blkg_lookup_create(blkcg, q);
		if (IS_ERR(blkg))
			blkg = NULL;
		if (!lock)
			spin_unlock_irq(q->queue_lock);
	}
	if (!blkg)
		goto out;

	bio_issue_init(&bio->bi_issue, bio_sectors(bio));
	bio_associate_blkg(bio, blkg);
out:
	rcu_read_unlock();
	while (blkg && blkg->parent) {
		struct iolatency_grp *iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}

		check_scale_change(iolat);
		__blkcg_iolatency_throttle(rqos, iolat, lock, issue_as_root,
					   (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
		blkg = blkg->parent;
	}
	if (!timer_pending(&blkiolat->timer))
		mod_timer(&blkiolat->timer, jiffies + HZ);
}

static void iolatency_record_time(struct iolatency_grp *iolat,
				  struct bio_issue *issue, u64 now,
				  bool issue_as_root)
{
	struct blk_rq_stat *rq_stat;
	u64 start = bio_issue_time(issue);
	u64 req_time;

	/*
	 * Truncate 'now' the same way the bio issue time was truncated so the
	 * two timestamps are directly comparable.
	 */
	now = __bio_issue_time(now);

	if (now <= start)
		return;

	req_time = now - start;

	/*
	 * We don't want to count issue_as_root bio's in the cgroups latency
	 * statistics as it could skew the numbers downwards.
	 */
	if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
		u64 sub = iolat->min_lat_nsec;
		if (req_time < sub)
			blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
		return;
	}

	rq_stat = get_cpu_ptr(iolat->stats);
	blk_rq_stat_add(rq_stat, req_time);
	put_cpu_ptr(rq_stat);
}
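
/*
 * Illustrative example of the issue_as_root accounting above: with a 5ms
 * target, a throttled group whose metadata IO is issued by the root cg and
 * completes in 2ms gets the remaining 3ms added to its induced delay via
 * blkcg_add_delay(), which is what eventually stalls it at userspace return.
 */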

#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5

static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
{
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	struct blk_rq_stat stat;
	unsigned long flags;
	int cpu, exp_idx;

	blk_rq_stat_init(&stat);
	preempt_disable();
	for_each_online_cpu(cpu) {
		struct blk_rq_stat *s;
		s = per_cpu_ptr(iolat->stats, cpu);
		blk_rq_stat_sum(&stat, s);
		blk_rq_stat_init(s);
	}
	preempt_enable();

	parent = blkg_to_lat(blkg->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;

	/*
	 * CALC_LOAD takes in a number stored in fixed point representation.
	 * Because we are using this for IO time in ns, the values stored
	 * are significantly larger than the FIXED_1 denominator (2048).
	 * Therefore, rounding errors in the calculation are negligible and
	 * can be ignored.
	 */
	exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
			div64_u64(iolat->cur_win_nsec,
				  BLKIOLATENCY_EXP_BUCKET_SIZE));
	CALC_LOAD(iolat->lat_avg, iolatency_exp_factors[exp_idx], stat.mean);

	/* Everything is ok and we don't need to adjust the scale. */
	if (stat.mean <= iolat->min_lat_nsec &&
	    atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
		return;

	spin_lock_irqsave(&lat_info->lock, flags);
	lat_info->nr_samples -= iolat->nr_samples;
	lat_info->nr_samples += stat.nr_samples;
	iolat->nr_samples = stat.nr_samples;

	/*
	 * Somebody else scaled things recently and our target isn't any
	 * tighter than the one that triggered it, so just update the sample
	 * counts and bail.
	 */
	if ((lat_info->last_scale_event >= now ||
	    now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME) &&
	    lat_info->scale_lat <= iolat->min_lat_nsec)
		goto out;

	if (stat.mean <= iolat->min_lat_nsec &&
	    stat.nr_samples >= BLKIOLATENCY_MIN_GOOD_SAMPLES) {
		if (lat_info->scale_grp == iolat) {
			lat_info->last_scale_event = now;
			scale_cookie_change(iolat->blkiolat, lat_info, true);
		}
	} else if (stat.mean > iolat->min_lat_nsec) {
		lat_info->last_scale_event = now;
		if (!lat_info->scale_grp ||
		    lat_info->scale_lat > iolat->min_lat_nsec) {
			WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
			lat_info->scale_grp = iolat;
		}
		scale_cookie_change(iolat->blkiolat, lat_info, false);
	}
out:
	spin_unlock_irqrestore(&lat_info->lock, flags);
}

static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
{
	struct blkcg_gq *blkg;
	struct rq_wait *rqw;
	struct iolatency_grp *iolat;
	u64 window_start;
	u64 now = ktime_to_ns(ktime_get());
	bool issue_as_root = bio_issue_as_root_blkg(bio);
	bool enabled = false;

	blkg = bio->bi_blkg;
	if (!blkg)
		return;

	iolat = blkg_to_lat(bio->bi_blkg);
	if (!iolat)
		return;

	enabled = blk_iolatency_enabled(iolat->blkiolat);
	while (blkg && blkg->parent) {
		iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}
		rqw = &iolat->rq_wait;

		atomic_dec(&rqw->inflight);
		if (!enabled || iolat->min_lat_nsec == 0)
			goto next;
		iolatency_record_time(iolat, &bio->bi_issue, now,
				      issue_as_root);
		window_start = atomic64_read(&iolat->window_start);
		if (now > window_start &&
		    (now - window_start) >= iolat->cur_win_nsec) {
			if (atomic64_cmpxchg(&iolat->window_start,
					window_start, now) == window_start)
				iolatency_check_latencies(iolat, now);
		}
next:
		wake_up(&rqw->wait);
		blkg = blkg->parent;
	}
}

static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio)
{
	struct blkcg_gq *blkg;

	blkg = bio->bi_blkg;
	while (blkg && blkg->parent) {
		struct rq_wait *rqw;
		struct iolatency_grp *iolat;

		iolat = blkg_to_lat(blkg);
		if (!iolat)
			goto next;

		rqw = &iolat->rq_wait;
		atomic_dec(&rqw->inflight);
		wake_up(&rqw->wait);
next:
		blkg = blkg->parent;
	}
}

static void blkcg_iolatency_exit(struct rq_qos *rqos)
{
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);

	del_timer_sync(&blkiolat->timer);
	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
	kfree(blkiolat);
}

static struct rq_qos_ops blkcg_iolatency_ops = {
	.throttle = blkcg_iolatency_throttle,
	.cleanup = blkcg_iolatency_cleanup,
	.done_bio = blkcg_iolatency_done_bio,
	.exit = blkcg_iolatency_exit,
};

static void blkiolatency_timer_fn(struct timer_list *t)
{
	struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
	struct blkcg_gq *blkg;
	struct cgroup_subsys_state *pos_css;
	u64 now = ktime_to_ns(ktime_get());

	rcu_read_lock();
	blkg_for_each_descendant_pre(blkg, pos_css,
				     blkiolat->rqos.q->root_blkg) {
		struct iolatency_grp *iolat;
		struct child_latency_info *lat_info;
		unsigned long flags;
		u64 cookie;

		/*
		 * We could be exiting, don't access the pd unless we have a
		 * ref on the blkg.
		 */
		if (!blkg_try_get(blkg))
			continue;

		iolat = blkg_to_lat(blkg);
		if (!iolat)
			goto next;

		lat_info = &iolat->child_lat;
		cookie = atomic_read(&lat_info->scale_cookie);

		if (cookie >= DEFAULT_SCALE_COOKIE)
			goto next;

		spin_lock_irqsave(&lat_info->lock, flags);
		if (lat_info->last_scale_event >= now)
			goto next_lock;

		/*
		 * We scaled down but don't have a scale_grp, scale up and
		 * carry on.
		 */
		if (lat_info->scale_grp == NULL) {
			scale_cookie_change(iolat->blkiolat, lat_info, true);
			goto next_lock;
		}

		/*
		 * It's been 5 seconds since our last scale event, clear the
		 * scale grp in case the group that needed the scale down
		 * isn't doing any IO currently.
		 */
		if (now - lat_info->last_scale_event >=
		    ((u64)NSEC_PER_SEC * 5))
			lat_info->scale_grp = NULL;
next_lock:
		spin_unlock_irqrestore(&lat_info->lock, flags);
next:
		blkg_put(blkg);
	}
	rcu_read_unlock();
}

int blk_iolatency_init(struct request_queue *q)
{
	struct blk_iolatency *blkiolat;
	struct rq_qos *rqos;
	int ret;

	blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
	if (!blkiolat)
		return -ENOMEM;

	rqos = &blkiolat->rqos;
	rqos->id = RQ_QOS_CGROUP;
	rqos->ops = &blkcg_iolatency_ops;
	rqos->q = q;

	rq_qos_add(q, rqos);

	ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
	if (ret) {
		rq_qos_del(q, rqos);
		kfree(blkiolat);
		return ret;
	}

	timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);

	return 0;
}

static void iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
{
	struct iolatency_grp *iolat = blkg_to_lat(blkg);
	struct blk_iolatency *blkiolat = iolat->blkiolat;
	u64 oldval = iolat->min_lat_nsec;

	iolat->min_lat_nsec = val;
	iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
	iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
				    BLKIOLATENCY_MAX_WIN_SIZE);

	if (!oldval && val)
		atomic_inc(&blkiolat->enabled);
	if (oldval && !val)
		atomic_dec(&blkiolat->enabled);
}

static void iolatency_clear_scaling(struct blkcg_gq *blkg)
{
	if (blkg->parent) {
		struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
		struct child_latency_info *lat_info;
		if (!iolat)
			return;

		lat_info = &iolat->child_lat;
		spin_lock(&lat_info->lock);
		atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
		lat_info->last_scale_event = 0;
		lat_info->scale_grp = NULL;
		lat_info->scale_lat = 0;
		spin_unlock(&lat_info->lock);
	}
}
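
/*
 * Illustrative usage of the interface parsed below: writing
 * "8:16 target=2000" to io.latency sets a 2ms target on that device (the
 * value is in usecs, "max" clears it), and the sampling window becomes
 * max(target << 4, 100ms) clamped to 1s, i.e. 100ms for a 2ms target.
 */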

static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
				   size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkcg_gq *blkg;
	struct blk_iolatency *blkiolat;
	struct blkg_conf_ctx ctx;
	struct iolatency_grp *iolat;
	char *p, *tok;
	u64 lat_val = 0;
	u64 oldval;
	int ret;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
	if (ret)
		return ret;

	iolat = blkg_to_lat(ctx.blkg);
	blkiolat = iolat->blkiolat;
	p = ctx.body;

	ret = -EINVAL;
	while ((tok = strsep(&p, " "))) {
		char key[16];
		char val[21];	/* 18446744073709551616 */

		if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
			goto out;

		if (!strcmp(key, "target")) {
			u64 v;

			if (!strcmp(val, "max"))
				lat_val = 0;
			else if (sscanf(val, "%llu", &v) == 1)
				lat_val = v * NSEC_PER_USEC;
			else
				goto out;
		} else {
			goto out;
		}
	}

	/* Update the target, and reset the scaling state if it changed. */
	blkg = ctx.blkg;
	oldval = iolat->min_lat_nsec;

	iolatency_set_min_lat_nsec(blkg, lat_val);
	if (oldval != iolat->min_lat_nsec)
		iolatency_clear_scaling(blkg);
	ret = 0;
out:
	blkg_conf_finish(&ctx);
	return ret ?: nbytes;
}

static u64 iolatency_prfill_limit(struct seq_file *sf,
				  struct blkg_policy_data *pd, int off)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname || !iolat->min_lat_nsec)
		return 0;
	seq_printf(sf, "%s target=%llu\n",
		   dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
	return 0;
}

static int iolatency_print_limit(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  iolatency_prfill_limit,
			  &blkcg_policy_iolatency, seq_cft(sf)->private, false);
	return 0;
}

static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
				size_t size)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	unsigned long long avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
	unsigned long long cur_win = div64_u64(iolat->cur_win_nsec,
					       NSEC_PER_MSEC);

	if (iolat->rq_depth.max_depth == UINT_MAX)
		return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
				 avg_lat, cur_win);

	return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
			 iolat->rq_depth.max_depth, avg_lat, cur_win);
}
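
/*
 * Example (made-up values) of what the stat callback above contributes to a
 * group's io.stat line: " depth=max avg_lat=743 win=100" while unthrottled,
 * or " depth=4 avg_lat=9120 win=160" once the queue depth has been scaled
 * down; avg_lat is reported in usecs and win in msecs.
 */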

static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
{
	struct iolatency_grp *iolat;

	iolat = kzalloc_node(sizeof(*iolat), gfp, node);
	if (!iolat)
		return NULL;
	iolat->stats = __alloc_percpu_gfp(sizeof(struct blk_rq_stat),
					  __alignof__(struct blk_rq_stat), gfp);
	if (!iolat->stats) {
		kfree(iolat);
		return NULL;
	}
	return &iolat->pd;
}

static void iolatency_pd_init(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	u64 now = ktime_to_ns(ktime_get());
	int cpu;

	for_each_possible_cpu(cpu) {
		struct blk_rq_stat *stat;
		stat = per_cpu_ptr(iolat->stats, cpu);
		blk_rq_stat_init(stat);
	}

	rq_wait_init(&iolat->rq_wait);
	spin_lock_init(&iolat->child_lat.lock);
	iolat->rq_depth.queue_depth = blk_queue_depth(blkg->q);
	iolat->rq_depth.max_depth = UINT_MAX;
	iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
	iolat->blkiolat = blkiolat;
	iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
	atomic64_set(&iolat->window_start, now);

	/*
	 * We init things in list order, so the pd for the parent may not be
	 * init'ed yet for whatever reason.
	 */
	if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
		struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
		atomic_set(&iolat->scale_cookie,
			   atomic_read(&parent->child_lat.scale_cookie));
	} else {
		atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
	}

	atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
}

static void iolatency_pd_offline(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);

	iolatency_set_min_lat_nsec(blkg, 0);
	iolatency_clear_scaling(blkg);
}

static void iolatency_pd_free(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	free_percpu(iolat->stats);
	kfree(iolat);
}

static struct cftype iolatency_files[] = {
	{
		.name = "latency",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = iolatency_print_limit,
		.write = iolatency_set_limit,
	},
	{}
};

static struct blkcg_policy blkcg_policy_iolatency = {
	.dfl_cftypes = iolatency_files,
	.pd_alloc_fn = iolatency_pd_alloc,
	.pd_init_fn = iolatency_pd_init,
	.pd_offline_fn = iolatency_pd_offline,
	.pd_free_fn = iolatency_pd_free,
	.pd_stat_fn = iolatency_pd_stat,
};

static int __init iolatency_init(void)
{
	return blkcg_policy_register(&blkcg_policy_iolatency);
}

static void __exit iolatency_exit(void)
{
	return blkcg_policy_unregister(&blkcg_policy_iolatency);
}

module_init(iolatency_init);
module_exit(iolatency_exit);