/*
 * Block rq-qos base io controller
 *
 * This works similarly to wbt with a few exceptions
 *
 * - It's bio based, so the latency covers the whole block layer in addition to
 *   the actual io.
 * - We will throttle all IO that comes in here if we need to.
 * - We use the mean latency over the 100ms window.  This is because writes can
 *   be particularly fast, which could give us a false sense of the impact of
 *   other workloads on our protected workload.
 * - By default there's no throttling, we set the queue_depth to UINT_MAX so
 *   that we can have as many outstanding bio's as we're allowed to.  Only at
 *   throttle time do we pay attention to the actual queue depth.
 *
 * The hierarchy works like the cpu controller does, we track the latency at
 * every configured node, and each configured node has its own independent
 * queue depth.  This means that we only care about our latency targets at the
 * peer level.  Some group at the bottom of the hierarchy isn't going to affect
 * a group at the end of some other path if we're only configured at the leaf
 * level.
 *
 * Consider the following
 *
 *                   root blkg
 *                  /         \
 *         fast (target=5ms)   slow (target=10ms)
 *           /     \                  /        \
 *         a        b          normal(15ms)   unloved
 *
 * "a" and "b" have no target, but their combined io under "fast" cannot exceed
 * an average latency of 5ms.  If it does then we will throttle the "slow"
 * group.  In the case of "normal", if it exceeds its 15ms target, we will
 * throttle "unloved", but nobody else.
 *
 * In this example "fast", "slow", and "normal" will be the only groups actually
 * accounting their io latencies.  We have to walk up the hierarchy to the root
 * on every submit and complete so we can do the appropriate stat recording and
 * adjust the queue depth of ourselves if needed.
 *
 * There are 2 ways we throttle IO.
 *
 * 1) Queue depth throttling.  As we throttle down we will adjust the maximum
 * number of IO's we're allowed to have in flight.  This starts at UINT_MAX and
 * goes down to 1.  If the group is only ever submitting IO for itself then
 * this is the only way we throttle.
 *
 * 2) Induced delay throttling.  This is for the case that a group is
 * generating IO that has to be issued by the root cg to avoid priority
 * inversion.  So think REQ_META or REQ_SWAP.  If we are already at qd == 1 and
 * we're getting a lot of work done for us on behalf of the root cg and are
 * being asked to scale down more, then we induce a latency at userspace
 * return.  We accumulate the total amount of time we need to be punished by
 * doing
 *
 * total_time += min_lat_nsec - actual_io_completion
 *
 * and then at throttle time will do
 *
 * throttle_time = min(total_time, NSEC_PER_SEC)
 *
 * This induced delay will throttle back the activity that is generating the
 * root cg issued io's, whether that's some metadata intensive operation or the
 * group is using so much memory that it is pushing us into swap.
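 *
 * As an illustration (the numbers here are made up, not measured): with
 * min_lat_nsec = 5ms, a root-issued IO done on our behalf that completes in
 * 2ms adds 3ms to total_time.  However much punishment accumulates, the min()
 * above caps what any single trip through the throttle path charges at one
 * second.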
 *
 * Copyright (C) 2018 Josef Bacik
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/backing-dev.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/memcontrol.h>
#include <linux/sched/loadavg.h>
#include <linux/sched/signal.h>
#include <trace/events/block.h>
#include <linux/blk-mq.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk.h"

#define DEFAULT_SCALE_COOKIE 1000000U

static struct blkcg_policy blkcg_policy_iolatency;
struct iolatency_grp;

struct blk_iolatency {
	struct rq_qos rqos;
	struct timer_list timer;
	atomic_t enabled;
};

static inline struct blk_iolatency *BLKIOLATENCY(struct rq_qos *rqos)
{
	return container_of(rqos, struct blk_iolatency, rqos);
}

static inline bool blk_iolatency_enabled(struct blk_iolatency *blkiolat)
{
	return atomic_read(&blkiolat->enabled) > 0;
}

struct child_latency_info {
	spinlock_t lock;

	/* Last time we adjusted the scale of everybody. */
	u64 last_scale_event;

	/* The latency that we missed. */
	u64 scale_lat;

	/* Total io's from all of our children for the last summation. */
	u64 nr_samples;

	/* The guy who actually changed the latency numbers. */
	struct iolatency_grp *scale_grp;

	/* Cookie to tell if we need to scale up or down. */
	atomic_t scale_cookie;
};

struct percentile_stats {
	u64 total;
	u64 missed;
};

struct latency_stat {
	union {
		struct percentile_stats ps;
		struct blk_rq_stat rqs;
	};
};

struct iolatency_grp {
	struct blkg_policy_data pd;
	struct latency_stat __percpu *stats;
	struct latency_stat cur_stat;
	struct blk_iolatency *blkiolat;
	struct rq_depth rq_depth;
	struct rq_wait rq_wait;
	atomic64_t window_start;
	atomic_t scale_cookie;
	u64 min_lat_nsec;
	u64 cur_win_nsec;

	/* total running average of our io latency. */
	u64 lat_avg;

	/* Our current number of IO's for the last summation. */
	u64 nr_samples;

	bool ssd;
	struct child_latency_info child_lat;
};

#define BLKIOLATENCY_MIN_WIN_SIZE (100 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MAX_WIN_SIZE NSEC_PER_SEC
/*
 * These are the constants used to fake the fixed-point moving average
 * calculation just like load average.  The call to calc_load() folds
 * (FIXED_1 (2048) - exp_factor) * new_sample into lat_avg.  The sampling
 * window size is bucketed to try to approximately calculate average
 * latency such that 1/exp (decay rate) is [1 min, 2.5 min) when windows
 * elapse immediately.  Note, windows only elapse with IO activity.  Idle
 * periods extend the most recent window.
 */
#define BLKIOLATENCY_NR_EXP_FACTORS 5
#define BLKIOLATENCY_EXP_BUCKET_SIZE (BLKIOLATENCY_MAX_WIN_SIZE / \
				      (BLKIOLATENCY_NR_EXP_FACTORS - 1))
static const u64 iolatency_exp_factors[BLKIOLATENCY_NR_EXP_FACTORS] = {
	2045, // exp(1/600) - 600 samples
	2039, // exp(1/240) - 240 samples
	2031, // exp(1/120) - 120 samples
	2023, // exp(1/80) - 80 samples
	2014, // exp(1/60) - 60 samples
};
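
/*
 * For example, a group at the minimum 100ms window falls in bucket
 * 100ms / 250ms == 0 and uses factor 2045 ~= FIXED_1 * exp(-1/600), so it
 * takes roughly 600 busy windows (about a minute of continuous IO) for an
 * old sample to decay out of lat_avg.  A group clamped to the 1s maximum
 * window uses bucket 4 (factor 2014, ~60 windows, again about a minute).
 */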

static inline struct iolatency_grp *pd_to_lat(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct iolatency_grp, pd) : NULL;
}

static inline struct iolatency_grp *blkg_to_lat(struct blkcg_gq *blkg)
{
	return pd_to_lat(blkg_to_pd(blkg, &blkcg_policy_iolatency));
}

static inline struct blkcg_gq *lat_to_blkg(struct iolatency_grp *iolat)
{
	return pd_to_blkg(&iolat->pd);
}

static inline void latency_stat_init(struct iolatency_grp *iolat,
				     struct latency_stat *stat)
{
	if (iolat->ssd) {
		stat->ps.total = 0;
		stat->ps.missed = 0;
	} else
		blk_rq_stat_init(&stat->rqs);
}

static inline void latency_stat_sum(struct iolatency_grp *iolat,
				    struct latency_stat *sum,
				    struct latency_stat *stat)
{
	if (iolat->ssd) {
		sum->ps.total += stat->ps.total;
		sum->ps.missed += stat->ps.missed;
	} else
		blk_rq_stat_sum(&sum->rqs, &stat->rqs);
}

static inline void latency_stat_record_time(struct iolatency_grp *iolat,
					    u64 req_time)
{
	struct latency_stat *stat = get_cpu_ptr(iolat->stats);
	if (iolat->ssd) {
		if (req_time >= iolat->min_lat_nsec)
			stat->ps.missed++;
		stat->ps.total++;
	} else
		blk_rq_stat_add(&stat->rqs, req_time);
	put_cpu_ptr(stat);
}

static inline bool latency_sum_ok(struct iolatency_grp *iolat,
				  struct latency_stat *stat)
{
	if (iolat->ssd) {
		u64 thresh = div64_u64(stat->ps.total, 10);
		thresh = max(thresh, 1ULL);
		return stat->ps.missed < thresh;
	}
	return stat->rqs.mean <= iolat->min_lat_nsec;
}

static inline u64 latency_stat_samples(struct iolatency_grp *iolat,
				       struct latency_stat *stat)
{
	if (iolat->ssd)
		return stat->ps.total;
	return stat->rqs.nr_samples;
}

static inline void iolat_update_total_lat_avg(struct iolatency_grp *iolat,
					      struct latency_stat *stat)
{
	int exp_idx;

	if (iolat->ssd)
		return;

	/*
	 * calc_load() takes in a number stored in fixed point representation.
	 * Because we are using this for IO time in ns, the values stored
	 * are significantly larger than the FIXED_1 denominator (2048).
	 * Therefore, rounding errors in the calculation are negligible and
	 * can be ignored.
	 */
	exp_idx = min_t(int, BLKIOLATENCY_NR_EXP_FACTORS - 1,
			div64_u64(iolat->cur_win_nsec,
				  BLKIOLATENCY_EXP_BUCKET_SIZE));
	iolat->lat_avg = calc_load(iolat->lat_avg,
				   iolatency_exp_factors[exp_idx],
				   stat->rqs.mean);
}

static void iolat_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
	atomic_dec(&rqw->inflight);
	wake_up(&rqw->wait);
}

static bool iolat_acquire_inflight(struct rq_wait *rqw, void *private_data)
{
	struct iolatency_grp *iolat = private_data;
	return rq_wait_inc_below(rqw, iolat->rq_depth.max_depth);
}

static void __blkcg_iolatency_throttle(struct rq_qos *rqos,
				       struct iolatency_grp *iolat,
				       bool issue_as_root,
				       bool use_memdelay)
{
	struct rq_wait *rqw = &iolat->rq_wait;
	unsigned use_delay = atomic_read(&lat_to_blkg(iolat)->use_delay);

	if (use_delay)
		blkcg_schedule_throttle(rqos->q, use_memdelay);

	/*
	 * To avoid priority inversions we want to just take a slot if we are
	 * issuing as root.  If we're being killed off there's no point in
	 * delaying things, we may have been killed by OOM so throttling may
	 * make recovery take even longer, so just let the IO's through so the
	 * task can go away.
	 */
	if (issue_as_root || fatal_signal_pending(current)) {
		atomic_inc(&rqw->inflight);
		return;
	}

	rq_qos_wait(rqw, iolat, iolat_acquire_inflight, iolat_cleanup_cb);
}
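
/*
 * The shifts below bias us toward backing off quickly and recovering slowly:
 * with nr_requests == 128, for example, a downward step from scale_amount()
 * is 32 (128 >> SCALE_DOWN_FACTOR) while an upward step is only
 * 8 (128 >> SCALE_UP_FACTOR).
 */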

#define SCALE_DOWN_FACTOR 2
#define SCALE_UP_FACTOR 4

static inline unsigned long scale_amount(unsigned long qd, bool up)
{
	return max(up ? qd >> SCALE_UP_FACTOR : qd >> SCALE_DOWN_FACTOR, 1UL);
}

/*
 * We scale the qd down faster than we scale up, so we need to use this helper
 * to adjust the scale_cookie accordingly so we don't prematurely get
 * scale_cookie at DEFAULT_SCALE_COOKIE and unthrottle too much.
 *
 * Each group has their own local copy of the last scale cookie they saw, so if
 * the global scale cookie goes up or down they know which way they need to go
 * based on their last knowledge of it.
 */
static void scale_cookie_change(struct blk_iolatency *blkiolat,
				struct child_latency_info *lat_info,
				bool up)
{
	unsigned long qd = blkiolat->rqos.q->nr_requests;
	unsigned long scale = scale_amount(qd, up);
	unsigned long old = atomic_read(&lat_info->scale_cookie);
	unsigned long max_scale = qd << 1;
	unsigned long diff = 0;

	if (old < DEFAULT_SCALE_COOKIE)
		diff = DEFAULT_SCALE_COOKIE - old;

	if (up) {
		if (scale + old > DEFAULT_SCALE_COOKIE)
			atomic_set(&lat_info->scale_cookie,
				   DEFAULT_SCALE_COOKIE);
		else if (diff > qd)
			atomic_inc(&lat_info->scale_cookie);
		else
			atomic_add(scale, &lat_info->scale_cookie);
	} else {
		/*
		 * We don't want to dig a hole so deep that it takes us hours to
		 * dig out of it.  Just enough that we don't throttle/unthrottle
		 * with jagged workloads but can still unthrottle once pressure
		 * has sufficiently dissipated.
		 */
		if (diff > qd) {
			if (diff < max_scale)
				atomic_dec(&lat_info->scale_cookie);
		} else {
			atomic_sub(scale, &lat_info->scale_cookie);
		}
	}
}

/*
 * Change the queue depth of the iolatency_grp.  We add/subtract 1/16th of the
 * queue depth at a time so we don't get wild swings and hopefully dial in to
 * fairer distribution of the overall queue depth.
 */
static void scale_change(struct iolatency_grp *iolat, bool up)
{
	unsigned long qd = iolat->blkiolat->rqos.q->nr_requests;
	unsigned long scale = scale_amount(qd, up);
	unsigned long old = iolat->rq_depth.max_depth;

	if (old > qd)
		old = qd;

	if (up) {
		if (old == 1 && blkcg_unuse_delay(lat_to_blkg(iolat)))
			return;

		if (old < qd) {
			old += scale;
			old = min(old, qd);
			iolat->rq_depth.max_depth = old;
			wake_up_all(&iolat->rq_wait.wait);
		}
	} else {
		old >>= 1;
		iolat->rq_depth.max_depth = max(old, 1UL);
	}
}

/* Check our parent and see if the scale cookie has changed. */
static void check_scale_change(struct iolatency_grp *iolat)
{
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	unsigned int cur_cookie;
	unsigned int our_cookie = atomic_read(&iolat->scale_cookie);
	u64 scale_lat;
	unsigned int old;
	int direction = 0;

	if (lat_to_blkg(iolat)->parent == NULL)
		return;

	parent = blkg_to_lat(lat_to_blkg(iolat)->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;
	cur_cookie = atomic_read(&lat_info->scale_cookie);
	scale_lat = READ_ONCE(lat_info->scale_lat);

	if (cur_cookie < our_cookie)
		direction = -1;
	else if (cur_cookie > our_cookie)
		direction = 1;
	else
		return;

	old = atomic_cmpxchg(&iolat->scale_cookie, our_cookie, cur_cookie);

	/* Somebody beat us to the punch, just bail. */
	if (old != our_cookie)
		return;

	if (direction < 0 && iolat->min_lat_nsec) {
		u64 samples_thresh;

		if (!scale_lat || iolat->min_lat_nsec <= scale_lat)
			return;

		/*
		 * Sometimes high priority groups are their own worst enemy, so
		 * instead of taking it out on some poor other group that did
		 * 5% or less of the IO's for the last summation just skip this
		 * scale down event.
		 */
		samples_thresh = lat_info->nr_samples * 5;
		samples_thresh = max(1ULL, div64_u64(samples_thresh, 100));
		if (iolat->nr_samples <= samples_thresh)
			return;
	}

	/* We're as low as we can go. */
	if (iolat->rq_depth.max_depth == 1 && direction < 0) {
		blkcg_use_delay(lat_to_blkg(iolat));
		return;
	}

	/* We're back to the default cookie, unthrottle all the things. */
	if (cur_cookie == DEFAULT_SCALE_COOKIE) {
		blkcg_clear_delay(lat_to_blkg(iolat));
		iolat->rq_depth.max_depth = UINT_MAX;
		wake_up_all(&iolat->rq_wait.wait);
		return;
	}

	scale_change(iolat, direction > 0);
}

static void blkcg_iolatency_throttle(struct rq_qos *rqos, struct bio *bio)
{
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	struct blkcg_gq *blkg = bio->bi_blkg;
	bool issue_as_root = bio_issue_as_root_blkg(bio);

	if (!blk_iolatency_enabled(blkiolat))
		return;

	while (blkg && blkg->parent) {
		struct iolatency_grp *iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}

		check_scale_change(iolat);
		__blkcg_iolatency_throttle(rqos, iolat, issue_as_root,
				(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
		blkg = blkg->parent;
	}
	if (!timer_pending(&blkiolat->timer))
		mod_timer(&blkiolat->timer, jiffies + HZ);
}

static void iolatency_record_time(struct iolatency_grp *iolat,
				  struct bio_issue *issue, u64 now,
				  bool issue_as_root)
{
	u64 start = bio_issue_time(issue);
	u64 req_time;

	/*
	 * Truncate 'now' the same way the issue time was truncated so the
	 * two timestamps are directly comparable.
	 */
	now = __bio_issue_time(now);

	if (now <= start)
		return;

	req_time = now - start;

	/*
	 * We don't want to count issue_as_root bio's in the cgroup's latency
	 * statistics as they could skew the numbers downwards.
	 */
	if (unlikely(issue_as_root && iolat->rq_depth.max_depth != UINT_MAX)) {
		u64 sub = iolat->min_lat_nsec;
		if (req_time < sub)
			blkcg_add_delay(lat_to_blkg(iolat), now, sub - req_time);
		return;
	}

	latency_stat_record_time(iolat, req_time);
}
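
/*
 * Rate limits on scaling decisions (see iolatency_check_latencies() below):
 * we leave at least half a second between scale events, and we don't scale
 * back up until a group that is meeting its target has accumulated at least
 * 5 samples, which helps avoid thrashing the scale cookie on jittery
 * workloads.
 */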

#define BLKIOLATENCY_MIN_ADJUST_TIME (500 * NSEC_PER_MSEC)
#define BLKIOLATENCY_MIN_GOOD_SAMPLES 5

static void iolatency_check_latencies(struct iolatency_grp *iolat, u64 now)
{
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct iolatency_grp *parent;
	struct child_latency_info *lat_info;
	struct latency_stat stat;
	unsigned long flags;
	int cpu;

	latency_stat_init(iolat, &stat);
	preempt_disable();
	for_each_online_cpu(cpu) {
		struct latency_stat *s;
		s = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_sum(iolat, &stat, s);
		latency_stat_init(iolat, s);
	}
	preempt_enable();

	parent = blkg_to_lat(blkg->parent);
	if (!parent)
		return;

	lat_info = &parent->child_lat;

	iolat_update_total_lat_avg(iolat, &stat);

	/* Everything is ok and we don't need to adjust the scale. */
	if (latency_sum_ok(iolat, &stat) &&
	    atomic_read(&lat_info->scale_cookie) == DEFAULT_SCALE_COOKIE)
		return;

	/* Somebody beat us to the punch, just bail. */
	spin_lock_irqsave(&lat_info->lock, flags);

	latency_stat_sum(iolat, &iolat->cur_stat, &stat);
	lat_info->nr_samples -= iolat->nr_samples;
	lat_info->nr_samples += latency_stat_samples(iolat, &iolat->cur_stat);
	iolat->nr_samples = latency_stat_samples(iolat, &iolat->cur_stat);

	if ((lat_info->last_scale_event >= now ||
	     now - lat_info->last_scale_event < BLKIOLATENCY_MIN_ADJUST_TIME))
		goto out;

	if (latency_sum_ok(iolat, &iolat->cur_stat) &&
	    latency_sum_ok(iolat, &stat)) {
		if (latency_stat_samples(iolat, &iolat->cur_stat) <
		    BLKIOLATENCY_MIN_GOOD_SAMPLES)
			goto out;
		if (lat_info->scale_grp == iolat) {
			lat_info->last_scale_event = now;
			scale_cookie_change(iolat->blkiolat, lat_info, true);
		}
	} else if (lat_info->scale_lat == 0 ||
		   lat_info->scale_lat >= iolat->min_lat_nsec) {
		lat_info->last_scale_event = now;
		if (!lat_info->scale_grp ||
		    lat_info->scale_lat > iolat->min_lat_nsec) {
			WRITE_ONCE(lat_info->scale_lat, iolat->min_lat_nsec);
			lat_info->scale_grp = iolat;
		}
		scale_cookie_change(iolat->blkiolat, lat_info, false);
	}
	latency_stat_init(iolat, &iolat->cur_stat);
out:
	spin_unlock_irqrestore(&lat_info->lock, flags);
}

static void blkcg_iolatency_done_bio(struct rq_qos *rqos, struct bio *bio)
{
	struct blkcg_gq *blkg;
	struct rq_wait *rqw;
	struct iolatency_grp *iolat;
	u64 window_start;
	u64 now = ktime_to_ns(ktime_get());
	bool issue_as_root = bio_issue_as_root_blkg(bio);
	bool enabled = false;
	int inflight = 0;

	blkg = bio->bi_blkg;
	if (!blkg || !bio_flagged(bio, BIO_TRACKED))
		return;

	iolat = blkg_to_lat(bio->bi_blkg);
	if (!iolat)
		return;

	enabled = blk_iolatency_enabled(iolat->blkiolat);
	if (!enabled)
		return;

	while (blkg && blkg->parent) {
		iolat = blkg_to_lat(blkg);
		if (!iolat) {
			blkg = blkg->parent;
			continue;
		}
		rqw = &iolat->rq_wait;

		inflight = atomic_dec_return(&rqw->inflight);
		WARN_ON_ONCE(inflight < 0);
		if (iolat->min_lat_nsec == 0)
			goto next;
		iolatency_record_time(iolat, &bio->bi_issue, now,
				      issue_as_root);
		window_start = atomic64_read(&iolat->window_start);
		if (now > window_start &&
		    (now - window_start) >= iolat->cur_win_nsec) {
			if (atomic64_cmpxchg(&iolat->window_start,
					window_start, now) == window_start)
				iolatency_check_latencies(iolat, now);
		}
next:
		wake_up(&rqw->wait);
		blkg = blkg->parent;
	}
}
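
/*
 * Undo the inflight accounting taken in blkcg_iolatency_throttle() for a bio
 * that is being cleaned up without reaching blkcg_iolatency_done_bio().
 */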
static void blkcg_iolatency_cleanup(struct rq_qos *rqos, struct bio *bio)
{
	struct blkcg_gq *blkg;

	blkg = bio->bi_blkg;
	while (blkg && blkg->parent) {
		struct rq_wait *rqw;
		struct iolatency_grp *iolat;

		iolat = blkg_to_lat(blkg);
		if (!iolat)
			goto next;

		rqw = &iolat->rq_wait;
		atomic_dec(&rqw->inflight);
		wake_up(&rqw->wait);
next:
		blkg = blkg->parent;
	}
}

static void blkcg_iolatency_exit(struct rq_qos *rqos)
{
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);

	del_timer_sync(&blkiolat->timer);
	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iolatency);
	kfree(blkiolat);
}

static struct rq_qos_ops blkcg_iolatency_ops = {
	.throttle = blkcg_iolatency_throttle,
	.cleanup = blkcg_iolatency_cleanup,
	.done_bio = blkcg_iolatency_done_bio,
	.exit = blkcg_iolatency_exit,
};

static void blkiolatency_timer_fn(struct timer_list *t)
{
	struct blk_iolatency *blkiolat = from_timer(blkiolat, t, timer);
	struct blkcg_gq *blkg;
	struct cgroup_subsys_state *pos_css;
	u64 now = ktime_to_ns(ktime_get());

	rcu_read_lock();
	blkg_for_each_descendant_pre(blkg, pos_css,
				     blkiolat->rqos.q->root_blkg) {
		struct iolatency_grp *iolat;
		struct child_latency_info *lat_info;
		unsigned long flags;
		u64 cookie;

		/*
		 * We could be exiting, don't access the pd unless we have a
		 * ref on the blkg.
		 */
		if (!blkg_tryget(blkg))
			continue;

		iolat = blkg_to_lat(blkg);
		if (!iolat)
			goto next;

		lat_info = &iolat->child_lat;
		cookie = atomic_read(&lat_info->scale_cookie);

		if (cookie >= DEFAULT_SCALE_COOKIE)
			goto next;

		spin_lock_irqsave(&lat_info->lock, flags);
		if (lat_info->last_scale_event >= now)
			goto next_lock;

		/*
		 * We scaled down but don't have a scale_grp, scale up and
		 * carry on.
		 */
		if (lat_info->scale_grp == NULL) {
			scale_cookie_change(iolat->blkiolat, lat_info, true);
			goto next_lock;
		}

		/*
		 * It's been 5 seconds since our last scale event, clear the
		 * scale grp in case the group that needed the scale down isn't
		 * doing any IO currently.
		 */
		if (now - lat_info->last_scale_event >=
		    ((u64)NSEC_PER_SEC * 5))
			lat_info->scale_grp = NULL;
next_lock:
		spin_unlock_irqrestore(&lat_info->lock, flags);
next:
		blkg_put(blkg);
	}
	rcu_read_unlock();
}
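
/*
 * Set up the iolatency rq-qos instance for @q.  The timer is armed from the
 * throttle path and runs blkiolatency_timer_fn() so that hierarchies which
 * scaled down keep recovering even if the group that triggered the scale
 * down has gone idle.
 */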
int blk_iolatency_init(struct request_queue *q)
{
	struct blk_iolatency *blkiolat;
	struct rq_qos *rqos;
	int ret;

	blkiolat = kzalloc(sizeof(*blkiolat), GFP_KERNEL);
	if (!blkiolat)
		return -ENOMEM;

	rqos = &blkiolat->rqos;
	rqos->id = RQ_QOS_CGROUP;
	rqos->ops = &blkcg_iolatency_ops;
	rqos->q = q;

	rq_qos_add(q, rqos);

	ret = blkcg_activate_policy(q, &blkcg_policy_iolatency);
	if (ret) {
		rq_qos_del(q, rqos);
		kfree(blkiolat);
		return ret;
	}

	timer_setup(&blkiolat->timer, blkiolatency_timer_fn, 0);

	return 0;
}

/*
 * Return 1 if this enables iolatency, -1 if it disables iolatency, and 0
 * otherwise.
 */
static int iolatency_set_min_lat_nsec(struct blkcg_gq *blkg, u64 val)
{
	struct iolatency_grp *iolat = blkg_to_lat(blkg);
	u64 oldval = iolat->min_lat_nsec;

	iolat->min_lat_nsec = val;
	iolat->cur_win_nsec = max_t(u64, val << 4, BLKIOLATENCY_MIN_WIN_SIZE);
	iolat->cur_win_nsec = min_t(u64, iolat->cur_win_nsec,
				    BLKIOLATENCY_MAX_WIN_SIZE);

	if (!oldval && val)
		return 1;
	if (oldval && !val)
		return -1;
	return 0;
}

static void iolatency_clear_scaling(struct blkcg_gq *blkg)
{
	if (blkg->parent) {
		struct iolatency_grp *iolat = blkg_to_lat(blkg->parent);
		struct child_latency_info *lat_info;
		if (!iolat)
			return;

		lat_info = &iolat->child_lat;
		spin_lock(&lat_info->lock);
		atomic_set(&lat_info->scale_cookie, DEFAULT_SCALE_COOKIE);
		lat_info->last_scale_event = 0;
		lat_info->scale_grp = NULL;
		lat_info->scale_lat = 0;
		spin_unlock(&lat_info->lock);
	}
}
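
/*
 * Handler for writes to io.latency.  For example (device numbers made up),
 * on the unified (cgroup v2) hierarchy:
 *
 *	echo "8:16 target=2000" > io.latency
 *
 * sets a 2ms (2000 usec) latency target for device 8:16 in this cgroup, and
 * "target=max" clears the target again.
 */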
static ssize_t iolatency_set_limit(struct kernfs_open_file *of, char *buf,
				   size_t nbytes, loff_t off)
{
	struct blkcg *blkcg = css_to_blkcg(of_css(of));
	struct blkcg_gq *blkg;
	struct blkg_conf_ctx ctx;
	struct iolatency_grp *iolat;
	char *p, *tok;
	u64 lat_val = 0;
	u64 oldval;
	int ret;
	int enable = 0;

	ret = blkg_conf_prep(blkcg, &blkcg_policy_iolatency, buf, &ctx);
	if (ret)
		return ret;

	iolat = blkg_to_lat(ctx.blkg);
	p = ctx.body;

	ret = -EINVAL;
	while ((tok = strsep(&p, " "))) {
		char key[16];
		char val[21];	/* 18446744073709551616 */

		if (sscanf(tok, "%15[^=]=%20s", key, val) != 2)
			goto out;

		if (!strcmp(key, "target")) {
			u64 v;

			if (!strcmp(val, "max"))
				lat_val = 0;
			else if (sscanf(val, "%llu", &v) == 1)
				lat_val = v * NSEC_PER_USEC;
			else
				goto out;
		} else {
			goto out;
		}
	}

	/* Walk up the tree to see if our new val is lower than it should be. */
	blkg = ctx.blkg;
	oldval = iolat->min_lat_nsec;

	enable = iolatency_set_min_lat_nsec(blkg, lat_val);
	if (enable) {
		WARN_ON_ONCE(!blk_get_queue(blkg->q));
		blkg_get(blkg);
	}

	if (oldval != iolat->min_lat_nsec) {
		iolatency_clear_scaling(blkg);
	}

	ret = 0;
out:
	blkg_conf_finish(&ctx);
	if (ret == 0 && enable) {
		struct iolatency_grp *tmp = blkg_to_lat(blkg);
		struct blk_iolatency *blkiolat = tmp->blkiolat;

		blk_mq_freeze_queue(blkg->q);

		if (enable == 1)
			atomic_inc(&blkiolat->enabled);
		else if (enable == -1)
			atomic_dec(&blkiolat->enabled);
		else
			WARN_ON_ONCE(1);

		blk_mq_unfreeze_queue(blkg->q);

		blkg_put(blkg);
		blk_put_queue(blkg->q);
	}
	return ret ?: nbytes;
}

static u64 iolatency_prfill_limit(struct seq_file *sf,
				  struct blkg_policy_data *pd, int off)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	const char *dname = blkg_dev_name(pd->blkg);

	if (!dname || !iolat->min_lat_nsec)
		return 0;
	seq_printf(sf, "%s target=%llu\n",
		   dname, div_u64(iolat->min_lat_nsec, NSEC_PER_USEC));
	return 0;
}

static int iolatency_print_limit(struct seq_file *sf, void *v)
{
	blkcg_print_blkgs(sf, css_to_blkcg(seq_css(sf)),
			  iolatency_prfill_limit,
			  &blkcg_policy_iolatency, seq_cft(sf)->private, false);
	return 0;
}

static size_t iolatency_ssd_stat(struct iolatency_grp *iolat, char *buf,
				 size_t size)
{
	struct latency_stat stat;
	int cpu;

	latency_stat_init(iolat, &stat);
	preempt_disable();
	for_each_online_cpu(cpu) {
		struct latency_stat *s;
		s = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_sum(iolat, &stat, s);
	}
	preempt_enable();

	if (iolat->rq_depth.max_depth == UINT_MAX)
		return scnprintf(buf, size, " missed=%llu total=%llu depth=max",
				 (unsigned long long)stat.ps.missed,
				 (unsigned long long)stat.ps.total);
	return scnprintf(buf, size, " missed=%llu total=%llu depth=%u",
			 (unsigned long long)stat.ps.missed,
			 (unsigned long long)stat.ps.total,
			 iolat->rq_depth.max_depth);
}

static size_t iolatency_pd_stat(struct blkg_policy_data *pd, char *buf,
				size_t size)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	unsigned long long avg_lat;
	unsigned long long cur_win;

	if (iolat->ssd)
		return iolatency_ssd_stat(iolat, buf, size);

	avg_lat = div64_u64(iolat->lat_avg, NSEC_PER_USEC);
	cur_win = div64_u64(iolat->cur_win_nsec, NSEC_PER_MSEC);
	if (iolat->rq_depth.max_depth == UINT_MAX)
		return scnprintf(buf, size, " depth=max avg_lat=%llu win=%llu",
				 avg_lat, cur_win);

	return scnprintf(buf, size, " depth=%u avg_lat=%llu win=%llu",
			 iolat->rq_depth.max_depth, avg_lat, cur_win);
}

static struct blkg_policy_data *iolatency_pd_alloc(gfp_t gfp, int node)
{
	struct iolatency_grp *iolat;

	iolat = kzalloc_node(sizeof(*iolat), gfp, node);
	if (!iolat)
		return NULL;
	iolat->stats = __alloc_percpu_gfp(sizeof(struct latency_stat),
					  __alignof__(struct latency_stat),
					  gfp);
	if (!iolat->stats) {
		kfree(iolat);
		return NULL;
	}
	return &iolat->pd;
}
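
/*
 * Per-cgroup init: a new group starts wide open (max_depth == UINT_MAX, no
 * latency target) and inherits its parent's scale cookie so it joins its
 * peers at whatever throttle level is currently in effect.
 */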
static void iolatency_pd_init(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct rq_qos *rqos = blkcg_rq_qos(blkg->q);
	struct blk_iolatency *blkiolat = BLKIOLATENCY(rqos);
	u64 now = ktime_to_ns(ktime_get());
	int cpu;

	if (blk_queue_nonrot(blkg->q))
		iolat->ssd = true;
	else
		iolat->ssd = false;

	for_each_possible_cpu(cpu) {
		struct latency_stat *stat;
		stat = per_cpu_ptr(iolat->stats, cpu);
		latency_stat_init(iolat, stat);
	}

	latency_stat_init(iolat, &iolat->cur_stat);
	rq_wait_init(&iolat->rq_wait);
	spin_lock_init(&iolat->child_lat.lock);
	iolat->rq_depth.queue_depth = blkg->q->nr_requests;
	iolat->rq_depth.max_depth = UINT_MAX;
	iolat->rq_depth.default_depth = iolat->rq_depth.queue_depth;
	iolat->blkiolat = blkiolat;
	iolat->cur_win_nsec = 100 * NSEC_PER_MSEC;
	atomic64_set(&iolat->window_start, now);

	/*
	 * We init things in list order, so the pd for the parent may not be
	 * init'ed yet for whatever reason.
	 */
	if (blkg->parent && blkg_to_pd(blkg->parent, &blkcg_policy_iolatency)) {
		struct iolatency_grp *parent = blkg_to_lat(blkg->parent);
		atomic_set(&iolat->scale_cookie,
			   atomic_read(&parent->child_lat.scale_cookie));
	} else {
		atomic_set(&iolat->scale_cookie, DEFAULT_SCALE_COOKIE);
	}

	atomic_set(&iolat->child_lat.scale_cookie, DEFAULT_SCALE_COOKIE);
}

static void iolatency_pd_offline(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	struct blkcg_gq *blkg = lat_to_blkg(iolat);
	struct blk_iolatency *blkiolat = iolat->blkiolat;
	int ret;

	ret = iolatency_set_min_lat_nsec(blkg, 0);
	if (ret == 1)
		atomic_inc(&blkiolat->enabled);
	if (ret == -1)
		atomic_dec(&blkiolat->enabled);
	iolatency_clear_scaling(blkg);
}

static void iolatency_pd_free(struct blkg_policy_data *pd)
{
	struct iolatency_grp *iolat = pd_to_lat(pd);
	free_percpu(iolat->stats);
	kfree(iolat);
}

static struct cftype iolatency_files[] = {
	{
		.name = "latency",
		.flags = CFTYPE_NOT_ON_ROOT,
		.seq_show = iolatency_print_limit,
		.write = iolatency_set_limit,
	},
	{}
};

static struct blkcg_policy blkcg_policy_iolatency = {
	.dfl_cftypes	= iolatency_files,
	.pd_alloc_fn	= iolatency_pd_alloc,
	.pd_init_fn	= iolatency_pd_init,
	.pd_offline_fn	= iolatency_pd_offline,
	.pd_free_fn	= iolatency_pd_free,
	.pd_stat_fn	= iolatency_pd_stat,
};

static int __init iolatency_init(void)
{
	return blkcg_policy_register(&blkcg_policy_iolatency);
}

static void __exit iolatency_exit(void)
{
	return blkcg_policy_unregister(&blkcg_policy_iolatency);
}

module_init(iolatency_init);
module_exit(iolatency_exit);