Lines Matching +full:cost +full:- +full:effective

1 /* SPDX-License-Identifier: GPL-2.0
3 * IO cost model based controller.
10 * observable cost metric. This is distinguished from CPU and memory where
17 * useless for the purpose of IO capacity distribution. While on-device
19 * non-queued rotational devices, this is no longer viable with modern
22 * While there is no cost metric we can trivially observe, it isn't a
23 * complete mystery. For example, on a rotational device, seek cost
27 * implement a reasonable work-conserving proportional IO resource
30 * 1. IO Cost Model
32 * IO cost model estimates the cost of an IO given its basic parameters and
33 * history (e.g. the end sector of the last IO). The cost is measured in
34 * device time. If a given IO is estimated to cost 10ms, the device should
37 * Currently, there's only one builtin cost model - linear. Each IO is
38 * classified as sequential or random and given a base cost accordingly.
39 * On top of that, a size cost proportional to the length of the IO is
44 * /sys/fs/cgroup/io.cost.model.
47 * device-specific coefficients.
54 * 2-1. Vtime Distribution
75 * against the device vtime - an IO which takes 10ms on the underlying
84 * 2-2. Vrate Adjustment
86 * It's unrealistic to expect the cost model to be perfect. There are too
97 * To slow down, we lower the vrate - the rate at which the device vtime
100 * 750ms worth of IOs per second, and vice-versa for speeding up.
102 * Device busyness is determined using two criteria - rq wait and
105 * When a device gets saturated, the on-device and then the request queues
121 * service. There is an inherent trade-off - the tighter the latency QoS,
123 * and can be set through /sys/fs/cgroup/io.cost.qos.
125 * 2-3. Work Conservation
130 * cost per second, i.e., 10% of the device capacity. The naive
133 * compared to free-for-all competition. This is too high a cost to pay
156 * controller uses a drgn based monitoring script -
165 * - per : Timer period
166 * - cur_per : Internal wall and device vtime clock
167 * - vrate : Device virtual time rate against wall clock
168 * - weight : Surplus-adjusted and configured weights
169 * - hweight : Surplus-adjusted and configured hierarchical weights
170 * - inflt : The percentage of in-flight IO cost at the end of last period
171 * - del_ms : Deferred issuer delay induction level and duration
172 * - usages : Usage history
183 #include "blk-rq-qos.h"
184 #include "blk-stat.h"
185 #include "blk-wbt.h"
186 #include "blk-cgroup.h"
190 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
200 cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup, \
220 * iocg->vtime is targeted at 50% behind the device vtime, which
239 * As vtime is used to calculate the cost of each IO, it needs to
241 * represent the cost of a single page worth of discard with
246 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
247 * granularity and days of wrap-around time even at extreme vrates.
273 * The effect of delay is indirect and non-linear and a huge amount of
288 * cache, the kernel doesn't have well-defined back-pressure propagation
306 * size-proportional components of cost calculation in closer
307 * numbers of digits to per-IO cost components.
311 IOC_SECT_TO_PAGE_SHIFT = IOC_PAGE_SHIFT - SECTOR_SHIFT,
323 /* io.cost.qos controls including per-dev enable of the whole controller */
330 /* io.cost.qos params */
341 /* io.cost.model controls */
348 /* builtin linear cost model coefficients */
461 /* per device-cgroup pair */
467 * An iocg can get its weight from two sources - an explicit
468 * per-device-cgroup configuration or the default weight of the
469 * cgroup. `cfg_weight` is the explicit per-device-cgroup
470 * configuration. `weight` is the effective considering both
501 * than issue. The delta behind `vtime` represents the cost of
502 * currently in-flight IOs.
648 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
672 struct gendisk *disk = ioc->rqos.disk; in ioc_name()
676 return disk->disk_name; in ioc_name()
691 return pd_to_blkg(&iocg->pd); in iocg_to_blkg()
712 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse) in cost_to_abs_cost() argument
714 return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE); in cost_to_abs_cost()
718 u64 abs_cost, u64 cost) in iocg_commit_bio() argument
722 bio->bi_iocost_cost = cost; in iocg_commit_bio()
723 atomic64_add(cost, &iocg->vtime); in iocg_commit_bio()
725 gcs = get_cpu_ptr(iocg->pcpu_stat); in iocg_commit_bio()
726 local64_add(abs_cost, &gcs->abs_vusage); in iocg_commit_bio()
733 spin_lock_irqsave(&iocg->ioc->lock, *flags); in iocg_lock()
734 spin_lock(&iocg->waitq.lock); in iocg_lock()
736 spin_lock_irqsave(&iocg->waitq.lock, *flags); in iocg_lock()
743 spin_unlock(&iocg->waitq.lock); in iocg_unlock()
744 spin_unlock_irqrestore(&iocg->ioc->lock, *flags); in iocg_unlock()
746 spin_unlock_irqrestore(&iocg->waitq.lock, *flags); in iocg_unlock()
755 struct ioc_margins *margins = &ioc->margins; in ioc_refresh_margins()
756 u32 period_us = ioc->period_us; in ioc_refresh_margins()
757 u64 vrate = ioc->vtime_base_rate; in ioc_refresh_margins()
759 margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate; in ioc_refresh_margins()
760 margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate; in ioc_refresh_margins()
761 margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate; in ioc_refresh_margins()
769 lockdep_assert_held(&ioc->lock); in ioc_refresh_period_us()
772 if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) { in ioc_refresh_period_us()
773 ppm = ioc->params.qos[QOS_RPPM]; in ioc_refresh_period_us()
774 lat = ioc->params.qos[QOS_RLAT]; in ioc_refresh_period_us()
776 ppm = ioc->params.qos[QOS_WPPM]; in ioc_refresh_period_us()
777 lat = ioc->params.qos[QOS_WLAT]; in ioc_refresh_period_us()
789 multi = max_t(u32, (MILLION - ppm) / 50000, 2); in ioc_refresh_period_us()
796 ioc->period_us = period_us; in ioc_refresh_period_us()
797 ioc->timer_slack_ns = div64_u64( in ioc_refresh_period_us()
804 * ioc->rqos.disk isn't initialized when this function is called from
809 int idx = ioc->autop_idx; in ioc_autop_idx()
815 if (!blk_queue_nonrot(disk->queue)) in ioc_autop_idx()
819 if (blk_queue_depth(disk->queue) == 1) in ioc_autop_idx()
827 if (ioc->user_qos_params || ioc->user_cost_model) in ioc_autop_idx()
831 vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC); in ioc_autop_idx()
834 if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) { in ioc_autop_idx()
835 if (!ioc->autop_too_fast_at) in ioc_autop_idx()
836 ioc->autop_too_fast_at = now_ns; in ioc_autop_idx()
837 if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC) in ioc_autop_idx()
840 ioc->autop_too_fast_at = 0; in ioc_autop_idx()
843 if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) { in ioc_autop_idx()
844 if (!ioc->autop_too_slow_at) in ioc_autop_idx()
845 ioc->autop_too_slow_at = now_ns; in ioc_autop_idx()
846 if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC) in ioc_autop_idx()
847 return idx - 1; in ioc_autop_idx()
849 ioc->autop_too_slow_at = 0; in ioc_autop_idx()
862 * and calculate the linear model cost coefficients.
864 * *@page per-page cost 1s / (@bps / 4096)
865 * *@seqio base cost of a seq IO max((1s / @seqiops) - *@page, 0)
866 * *@randio base cost of a rand IO max((1s / @randiops) - *@page, 0)
887 *seqio = v - *page; in calc_lcoefs()
893 *randio = v - *page; in calc_lcoefs()
899 u64 *u = ioc->params.i_lcoefs; in ioc_refresh_lcoefs()
900 u64 *c = ioc->params.lcoefs; in ioc_refresh_lcoefs()
909 * struct gendisk is required as an argument because ioc->rqos.disk
918 lockdep_assert_held(&ioc->lock); in ioc_refresh_params_disk()
923 if (idx == ioc->autop_idx && !force) in ioc_refresh_params_disk()
926 if (idx != ioc->autop_idx) { in ioc_refresh_params_disk()
927 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); in ioc_refresh_params_disk()
928 ioc->vtime_base_rate = VTIME_PER_USEC; in ioc_refresh_params_disk()
931 ioc->autop_idx = idx; in ioc_refresh_params_disk()
932 ioc->autop_too_fast_at = 0; in ioc_refresh_params_disk()
933 ioc->autop_too_slow_at = 0; in ioc_refresh_params_disk()
935 if (!ioc->user_qos_params) in ioc_refresh_params_disk()
936 memcpy(ioc->params.qos, p->qos, sizeof(p->qos)); in ioc_refresh_params_disk()
937 if (!ioc->user_cost_model) in ioc_refresh_params_disk()
938 memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs)); in ioc_refresh_params_disk()
943 ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] * in ioc_refresh_params_disk()
945 ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] * in ioc_refresh_params_disk()
953 return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk); in ioc_refresh_params()
965 s64 pleft = ioc->period_at + ioc->period_us - now->now; in ioc_refresh_vrate()
966 s64 vperiod = ioc->period_us * ioc->vtime_base_rate; in ioc_refresh_vrate()
969 lockdep_assert_held(&ioc->lock); in ioc_refresh_vrate()
980 vcomp = -div64_s64(ioc->vtime_err, pleft); in ioc_refresh_vrate()
981 vcomp_min = -(ioc->vtime_base_rate >> 1); in ioc_refresh_vrate()
982 vcomp_max = ioc->vtime_base_rate; in ioc_refresh_vrate()
985 ioc->vtime_err += vcomp * pleft; in ioc_refresh_vrate()
987 atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp); in ioc_refresh_vrate()
990 ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod); in ioc_refresh_vrate()
997 u64 vrate = ioc->vtime_base_rate; in ioc_adjust_base_vrate()
998 u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max; in ioc_adjust_base_vrate()
1000 if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) { in ioc_adjust_base_vrate()
1001 if (ioc->busy_level != prev_busy_level || nr_lagging) in ioc_adjust_base_vrate()
1018 vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100); in ioc_adjust_base_vrate()
1021 int idx = min_t(int, abs(ioc->busy_level), in ioc_adjust_base_vrate()
1022 ARRAY_SIZE(vrate_adj_pct) - 1); in ioc_adjust_base_vrate()
1025 if (ioc->busy_level > 0) in ioc_adjust_base_vrate()
1026 adj_pct = 100 - adj_pct; in ioc_adjust_base_vrate()
1037 ioc->vtime_base_rate = vrate; in ioc_adjust_base_vrate()
1047 now->now_ns = ktime_get(); in ioc_now()
1048 now->now = ktime_to_us(now->now_ns); in ioc_now()
1049 vrate = atomic64_read(&ioc->vtime_rate); in ioc_now()
1060 seq = read_seqcount_begin(&ioc->period_seqcount); in ioc_now()
1061 now->vnow = ioc->period_at_vtime + in ioc_now()
1062 (now->now - ioc->period_at) * vrate; in ioc_now()
1063 } while (read_seqcount_retry(&ioc->period_seqcount, seq)); in ioc_now()
1068 WARN_ON_ONCE(ioc->running != IOC_RUNNING); in ioc_start_period()
1070 write_seqcount_begin(&ioc->period_seqcount); in ioc_start_period()
1071 ioc->period_at = now->now; in ioc_start_period()
1072 ioc->period_at_vtime = now->vnow; in ioc_start_period()
1073 write_seqcount_end(&ioc->period_seqcount); in ioc_start_period()
1075 ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us); in ioc_start_period()
1076 add_timer(&ioc->timer); in ioc_start_period()
1082 * is saved to be used as reference for later inuse in-period adjustments.
1087 struct ioc *ioc = iocg->ioc; in __propagate_weights()
1090 lockdep_assert_held(&ioc->lock); in __propagate_weights()
1097 if (list_empty(&iocg->active_list) && iocg->child_active_sum) { in __propagate_weights()
1098 inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum, in __propagate_weights()
1099 iocg->child_active_sum); in __propagate_weights()
1111 iocg->last_inuse = iocg->inuse; in __propagate_weights()
1113 iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime); in __propagate_weights()
1115 if (active == iocg->active && inuse == iocg->inuse) in __propagate_weights()
1118 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in __propagate_weights()
1119 struct ioc_gq *parent = iocg->ancestors[lvl]; in __propagate_weights()
1120 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in __propagate_weights()
1124 parent->child_active_sum += (s32)(active - child->active); in __propagate_weights()
1125 parent->child_inuse_sum += (s32)(inuse - child->inuse); in __propagate_weights()
1127 child->active = active; in __propagate_weights()
1128 child->inuse = inuse; in __propagate_weights()
1135 if (parent->child_active_sum) { in __propagate_weights()
1136 parent_active = parent->weight; in __propagate_weights()
1138 parent_active * parent->child_inuse_sum, in __propagate_weights()
1139 parent->child_active_sum); in __propagate_weights()
1143 if (parent_active == parent->active && in __propagate_weights()
1144 parent_inuse == parent->inuse) in __propagate_weights()
1151 ioc->weights_updated = true; in __propagate_weights()
1156 lockdep_assert_held(&ioc->lock); in commit_weights()
1158 if (ioc->weights_updated) { in commit_weights()
1161 atomic_inc(&ioc->hweight_gen); in commit_weights()
1162 ioc->weights_updated = false; in commit_weights()
1170 commit_weights(iocg->ioc); in propagate_weights()
1175 struct ioc *ioc = iocg->ioc; in current_hweight()
1180 /* hot path - if uptodate, use cached */ in current_hweight()
1181 ioc_gen = atomic_read(&ioc->hweight_gen); in current_hweight()
1182 if (ioc_gen == iocg->hweight_gen) in current_hweight()
1198 for (lvl = 0; lvl <= iocg->level - 1; lvl++) { in current_hweight()
1199 struct ioc_gq *parent = iocg->ancestors[lvl]; in current_hweight()
1200 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in current_hweight()
1201 u64 active_sum = READ_ONCE(parent->child_active_sum); in current_hweight()
1202 u64 inuse_sum = READ_ONCE(parent->child_inuse_sum); in current_hweight()
1203 u32 active = READ_ONCE(child->active); in current_hweight()
1204 u32 inuse = READ_ONCE(child->inuse); in current_hweight()
1217 iocg->hweight_active = max_t(u32, hwa, 1); in current_hweight()
1218 iocg->hweight_inuse = max_t(u32, hwi, 1); in current_hweight()
1219 iocg->hweight_gen = ioc_gen; in current_hweight()
1222 *hw_activep = iocg->hweight_active; in current_hweight()
1224 *hw_inusep = iocg->hweight_inuse; in current_hweight()
1234 u32 inuse = iocg->active; in current_hweight_max()
1238 lockdep_assert_held(&iocg->ioc->lock); in current_hweight_max()
1240 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in current_hweight_max()
1241 struct ioc_gq *parent = iocg->ancestors[lvl]; in current_hweight_max()
1242 struct ioc_gq *child = iocg->ancestors[lvl + 1]; in current_hweight_max()
1244 child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse; in current_hweight_max()
1246 inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum, in current_hweight_max()
1247 parent->child_active_sum); in current_hweight_max()
1255 struct ioc *ioc = iocg->ioc; in weight_updated()
1257 struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg); in weight_updated()
1260 lockdep_assert_held(&ioc->lock); in weight_updated()
1262 weight = iocg->cfg_weight ?: iocc->dfl_weight; in weight_updated()
1263 if (weight != iocg->weight && iocg->active) in weight_updated()
1264 propagate_weights(iocg, weight, iocg->inuse, true, now); in weight_updated()
1265 iocg->weight = weight; in weight_updated()
1270 struct ioc *ioc = iocg->ioc; in iocg_activate()
1279 if (!list_empty(&iocg->active_list)) { in iocg_activate()
1281 cur_period = atomic64_read(&ioc->cur_period); in iocg_activate()
1282 if (atomic64_read(&iocg->active_period) != cur_period) in iocg_activate()
1283 atomic64_set(&iocg->active_period, cur_period); in iocg_activate()
1288 if (iocg->child_active_sum) in iocg_activate()
1291 spin_lock_irq(&ioc->lock); in iocg_activate()
1296 cur_period = atomic64_read(&ioc->cur_period); in iocg_activate()
1297 last_period = atomic64_read(&iocg->active_period); in iocg_activate()
1298 atomic64_set(&iocg->active_period, cur_period); in iocg_activate()
1300 /* already activated or breaking leaf-only constraint? */ in iocg_activate()
1301 if (!list_empty(&iocg->active_list)) in iocg_activate()
1303 for (i = iocg->level - 1; i > 0; i--) in iocg_activate()
1304 if (!list_empty(&iocg->ancestors[i]->active_list)) in iocg_activate()
1307 if (iocg->child_active_sum) in iocg_activate()
1314 vtarget = now->vnow - ioc->margins.target; in iocg_activate()
1315 vtime = atomic64_read(&iocg->vtime); in iocg_activate()
1317 atomic64_add(vtarget - vtime, &iocg->vtime); in iocg_activate()
1318 atomic64_add(vtarget - vtime, &iocg->done_vtime); in iocg_activate()
1326 iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1; in iocg_activate()
1327 list_add(&iocg->active_list, &ioc->active_iocgs); in iocg_activate()
1329 propagate_weights(iocg, iocg->weight, in iocg_activate()
1330 iocg->last_inuse ?: iocg->weight, true, now); in iocg_activate()
1335 iocg->activated_at = now->now; in iocg_activate()
1337 if (ioc->running == IOC_IDLE) { in iocg_activate()
1338 ioc->running = IOC_RUNNING; in iocg_activate()
1339 ioc->dfgv_period_at = now->now; in iocg_activate()
1340 ioc->dfgv_period_rem = 0; in iocg_activate()
1345 spin_unlock_irq(&ioc->lock); in iocg_activate()
1349 spin_unlock_irq(&ioc->lock); in iocg_activate()
1355 struct ioc *ioc = iocg->ioc; in iocg_kick_delay()
1361 lockdep_assert_held(&iocg->waitq.lock); in iocg_kick_delay()
1367 if (time_before64(now->now, iocg->delay_at)) in iocg_kick_delay()
1370 /* calculate the current delay in effect - 1/2 every second */ in iocg_kick_delay()
1371 tdelta = now->now - iocg->delay_at; in iocg_kick_delay()
1373 if (iocg->delay && shift < BITS_PER_LONG) in iocg_kick_delay()
1374 delay = iocg->delay >> shift; in iocg_kick_delay()
1380 vover = atomic64_read(&iocg->vtime) + in iocg_kick_delay()
1381 abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow; in iocg_kick_delay()
1383 ioc->period_us * ioc->vtime_base_rate); in iocg_kick_delay()
1391 div_u64((MAX_DELAY - MIN_DELAY) * in iocg_kick_delay()
1392 (vover_pct - MIN_DELAY_THR_PCT), in iocg_kick_delay()
1393 MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT); in iocg_kick_delay()
1397 iocg->delay = new_delay; in iocg_kick_delay()
1398 iocg->delay_at = now->now; in iocg_kick_delay()
1403 if (!iocg->indelay_since) in iocg_kick_delay()
1404 iocg->indelay_since = now->now; in iocg_kick_delay()
1408 if (iocg->indelay_since) { in iocg_kick_delay()
1409 iocg->stat.indelay_us += now->now - iocg->indelay_since; in iocg_kick_delay()
1410 iocg->indelay_since = 0; in iocg_kick_delay()
1412 iocg->delay = 0; in iocg_kick_delay()
1423 lockdep_assert_held(&iocg->ioc->lock); in iocg_incur_debt()
1424 lockdep_assert_held(&iocg->waitq.lock); in iocg_incur_debt()
1425 WARN_ON_ONCE(list_empty(&iocg->active_list)); in iocg_incur_debt()
1431 if (!iocg->abs_vdebt && abs_cost) { in iocg_incur_debt()
1432 iocg->indebt_since = now->now; in iocg_incur_debt()
1433 propagate_weights(iocg, iocg->active, 0, false, now); in iocg_incur_debt()
1436 iocg->abs_vdebt += abs_cost; in iocg_incur_debt()
1438 gcs = get_cpu_ptr(iocg->pcpu_stat); in iocg_incur_debt()
1439 local64_add(abs_cost, &gcs->abs_vusage); in iocg_incur_debt()
1446 lockdep_assert_held(&iocg->ioc->lock); in iocg_pay_debt()
1447 lockdep_assert_held(&iocg->waitq.lock); in iocg_pay_debt()
1450 * make sure that nobody messed with @iocg. Check iocg->pd.online in iocg_pay_debt()
1453 WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online); in iocg_pay_debt()
1454 WARN_ON_ONCE(iocg->inuse > 1); in iocg_pay_debt()
1456 iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt); in iocg_pay_debt()
1459 if (!iocg->abs_vdebt) { in iocg_pay_debt()
1460 iocg->stat.indebt_us += now->now - iocg->indebt_since; in iocg_pay_debt()
1461 iocg->indebt_since = 0; in iocg_pay_debt()
1463 propagate_weights(iocg, iocg->active, iocg->last_inuse, in iocg_pay_debt()
1473 u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse); in iocg_wake_fn() local
1475 ctx->vbudget -= cost; in iocg_wake_fn()
1477 if (ctx->vbudget < 0) in iocg_wake_fn()
1478 return -1; in iocg_wake_fn()
1480 iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost); in iocg_wake_fn()
1481 wait->committed = true; in iocg_wake_fn()
1491 list_del_init_careful(&wq_entry->entry); in iocg_wake_fn()
1497 * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1498 * addition to iocg->waitq.lock.
1503 struct ioc *ioc = iocg->ioc; in iocg_kick_waitq()
1509 lockdep_assert_held(&iocg->waitq.lock); in iocg_kick_waitq()
1512 vbudget = now->vnow - atomic64_read(&iocg->vtime); in iocg_kick_waitq()
1515 if (pay_debt && iocg->abs_vdebt && vbudget > 0) { in iocg_kick_waitq()
1517 u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt); in iocg_kick_waitq()
1520 lockdep_assert_held(&ioc->lock); in iocg_kick_waitq()
1522 atomic64_add(vpay, &iocg->vtime); in iocg_kick_waitq()
1523 atomic64_add(vpay, &iocg->done_vtime); in iocg_kick_waitq()
1525 vbudget -= vpay; in iocg_kick_waitq()
1528 if (iocg->abs_vdebt || iocg->delay) in iocg_kick_waitq()
1537 if (iocg->abs_vdebt) { in iocg_kick_waitq()
1538 s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa); in iocg_kick_waitq()
1539 vbudget = min_t(s64, 0, vbudget - vdebt); in iocg_kick_waitq()
1550 __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx); in iocg_kick_waitq()
1552 if (!waitqueue_active(&iocg->waitq)) { in iocg_kick_waitq()
1553 if (iocg->wait_since) { in iocg_kick_waitq()
1554 iocg->stat.wait_us += now->now - iocg->wait_since; in iocg_kick_waitq()
1555 iocg->wait_since = 0; in iocg_kick_waitq()
1560 if (!iocg->wait_since) in iocg_kick_waitq()
1561 iocg->wait_since = now->now; in iocg_kick_waitq()
1567 vshortage = -ctx.vbudget; in iocg_kick_waitq()
1568 expires = now->now_ns + in iocg_kick_waitq()
1569 DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) * in iocg_kick_waitq()
1571 expires += ioc->timer_slack_ns; in iocg_kick_waitq()
1574 oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer)); in iocg_kick_waitq()
1575 if (hrtimer_is_queued(&iocg->waitq_timer) && in iocg_kick_waitq()
1576 abs(oexpires - expires) <= ioc->timer_slack_ns) in iocg_kick_waitq()
1579 hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires), in iocg_kick_waitq()
1580 ioc->timer_slack_ns, HRTIMER_MODE_ABS); in iocg_kick_waitq()
1586 bool pay_debt = READ_ONCE(iocg->abs_vdebt); in iocg_waitq_timer_fn()
1590 ioc_now(iocg->ioc, &now); in iocg_waitq_timer_fn()
1607 struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu); in ioc_lat_stat()
1611 u32 this_met = local_read(&stat->missed[rw].nr_met); in ioc_lat_stat()
1612 u32 this_missed = local_read(&stat->missed[rw].nr_missed); in ioc_lat_stat()
1614 nr_met[rw] += this_met - stat->missed[rw].last_met; in ioc_lat_stat()
1615 nr_missed[rw] += this_missed - stat->missed[rw].last_missed; in ioc_lat_stat()
1616 stat->missed[rw].last_met = this_met; in ioc_lat_stat()
1617 stat->missed[rw].last_missed = this_missed; in ioc_lat_stat()
1620 this_rq_wait_ns = local64_read(&stat->rq_wait_ns); in ioc_lat_stat()
1621 rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns; in ioc_lat_stat()
1622 stat->last_rq_wait_ns = this_rq_wait_ns; in ioc_lat_stat()
1635 ioc->period_us * NSEC_PER_USEC); in ioc_lat_stat()
1641 struct ioc *ioc = iocg->ioc; in iocg_is_idle()
1644 if (atomic64_read(&iocg->active_period) == in iocg_is_idle()
1645 atomic64_read(&ioc->cur_period)) in iocg_is_idle()
1649 if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime)) in iocg_is_idle()
1656 * Call this function on the target leaf @iocg's to build pre-order traversal
1658 * ->walk_list and the caller is responsible for dissolving the list after use.
1665 WARN_ON_ONCE(!list_empty(&iocg->walk_list)); in iocg_build_inner_walk()
1668 for (lvl = iocg->level - 1; lvl >= 0; lvl--) { in iocg_build_inner_walk()
1669 if (!list_empty(&iocg->ancestors[lvl]->walk_list)) in iocg_build_inner_walk()
1673 /* walk down and visit the inner nodes to get pre-order traversal */ in iocg_build_inner_walk()
1674 while (++lvl <= iocg->level - 1) { in iocg_build_inner_walk()
1675 struct ioc_gq *inner = iocg->ancestors[lvl]; in iocg_build_inner_walk()
1678 list_add_tail(&inner->walk_list, inner_walk); in iocg_build_inner_walk()
1685 if (iocg->level > 0) { in iocg_flush_stat_upward()
1687 &iocg->ancestors[iocg->level - 1]->stat; in iocg_flush_stat_upward()
1689 parent_stat->usage_us += in iocg_flush_stat_upward()
1690 iocg->stat.usage_us - iocg->last_stat.usage_us; in iocg_flush_stat_upward()
1691 parent_stat->wait_us += in iocg_flush_stat_upward()
1692 iocg->stat.wait_us - iocg->last_stat.wait_us; in iocg_flush_stat_upward()
1693 parent_stat->indebt_us += in iocg_flush_stat_upward()
1694 iocg->stat.indebt_us - iocg->last_stat.indebt_us; in iocg_flush_stat_upward()
1695 parent_stat->indelay_us += in iocg_flush_stat_upward()
1696 iocg->stat.indelay_us - iocg->last_stat.indelay_us; in iocg_flush_stat_upward()
1699 iocg->last_stat = iocg->stat; in iocg_flush_stat_upward()
1702 /* collect per-cpu counters and propagate the deltas to the parent */
1705 struct ioc *ioc = iocg->ioc; in iocg_flush_stat_leaf()
1710 lockdep_assert_held(&iocg->ioc->lock); in iocg_flush_stat_leaf()
1712 /* collect per-cpu counters */ in iocg_flush_stat_leaf()
1715 per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu)); in iocg_flush_stat_leaf()
1717 vusage_delta = abs_vusage - iocg->last_stat_abs_vusage; in iocg_flush_stat_leaf()
1718 iocg->last_stat_abs_vusage = abs_vusage; in iocg_flush_stat_leaf()
1720 iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate); in iocg_flush_stat_leaf()
1721 iocg->stat.usage_us += iocg->usage_delta_us; in iocg_flush_stat_leaf()
1741 list_del_init(&iocg->walk_list); in iocg_flush_stat()
1753 struct ioc *ioc = iocg->ioc; in hweight_after_donation()
1754 u64 vtime = atomic64_read(&iocg->vtime); in hweight_after_donation()
1758 if (iocg->abs_vdebt) in hweight_after_donation()
1762 if (waitqueue_active(&iocg->waitq) || in hweight_after_donation()
1763 time_after64(vtime, now->vnow - ioc->margins.min)) in hweight_after_donation()
1767 excess = now->vnow - vtime - ioc->margins.target; in hweight_after_donation()
1769 atomic64_add(excess, &iocg->vtime); in hweight_after_donation()
1770 atomic64_add(excess, &iocg->done_vtime); in hweight_after_donation()
1772 ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE); in hweight_after_donation()
1782 * new budget (1 - MARGIN_TARGET) and the leftover from the last period in hweight_after_donation()
1785 * usage = (1 - MARGIN_TARGET + delta) * new_hwi in hweight_after_donation()
1789 * new_hwi = usage / (1 - MARGIN_TARGET + delta) in hweight_after_donation()
1791 delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime), in hweight_after_donation()
1792 now->vnow - ioc->period_at_vtime); in hweight_after_donation()
1794 new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta); in hweight_after_donation()
1800 * For work-conservation, an iocg which isn't using all of its share should
1801 * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1822 * Given the weights and target after-donation hweight_inuse values, Andy's
1824 * sibling level to maintain the relative relationship between all non-donating
1826 * non-donating parts, calculates global donation rate which is used to
1827 * determine the target hweight_inuse for each node, and then derives per-level
1834 * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1844 * f is the sum of the absolute budgets of non-donating nodes in the subtree.
1847 * w_f is the non-donating portion of w. w_f = w * f / b
1850 * s_f and s_t are the non-donating and donating portions of s.
1852 * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1876 after_sum += iocg->hweight_after_donation; in transfer_surpluses()
1878 if (iocg->hweight_after_donation > hwa) { in transfer_surpluses()
1879 over_sum += iocg->hweight_after_donation; in transfer_surpluses()
1880 list_add(&iocg->walk_list, &over_hwa); in transfer_surpluses()
1889 u32 over_delta = after_sum - (WEIGHT_ONE - 1); in transfer_surpluses()
1891 over_target = over_sum - over_delta; in transfer_surpluses()
1898 iocg->hweight_after_donation = in transfer_surpluses()
1899 div_u64((u64)iocg->hweight_after_donation * in transfer_surpluses()
1901 list_del_init(&iocg->walk_list); in transfer_surpluses()
1905 * Build pre-order inner node walk list and prepare for donation in transfer_surpluses()
1913 WARN_ON_ONCE(root_iocg->level > 0); in transfer_surpluses()
1916 iocg->child_adjusted_sum = 0; in transfer_surpluses()
1917 iocg->hweight_donating = 0; in transfer_surpluses()
1918 iocg->hweight_after_donation = 0; in transfer_surpluses()
1926 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1928 parent->hweight_donating += iocg->hweight_donating; in transfer_surpluses()
1929 parent->hweight_after_donation += iocg->hweight_after_donation; in transfer_surpluses()
1933 if (iocg->level > 0) { in transfer_surpluses()
1934 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1936 parent->hweight_donating += iocg->hweight_donating; in transfer_surpluses()
1937 parent->hweight_after_donation += iocg->hweight_after_donation; in transfer_surpluses()
1947 if (iocg->level) { in transfer_surpluses()
1948 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
1950 iocg->hweight_active = DIV64_U64_ROUND_UP( in transfer_surpluses()
1951 (u64)parent->hweight_active * iocg->active, in transfer_surpluses()
1952 parent->child_active_sum); in transfer_surpluses()
1956 iocg->hweight_donating = min(iocg->hweight_donating, in transfer_surpluses()
1957 iocg->hweight_active); in transfer_surpluses()
1958 iocg->hweight_after_donation = min(iocg->hweight_after_donation, in transfer_surpluses()
1959 iocg->hweight_donating - 1); in transfer_surpluses()
1960 if (WARN_ON_ONCE(iocg->hweight_active <= 1 || in transfer_surpluses()
1961 iocg->hweight_donating <= 1 || in transfer_surpluses()
1962 iocg->hweight_after_donation == 0)) { in transfer_surpluses()
1964 pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup); in transfer_surpluses()
1966 iocg->hweight_active, iocg->hweight_donating, in transfer_surpluses()
1967 iocg->hweight_after_donation); in transfer_surpluses()
1972 * Calculate the global donation rate (gamma) - the rate to adjust in transfer_surpluses()
1973 * non-donating budgets by. in transfer_surpluses()
1979 * hweights can't be whole; however, due to the round-ups during hweight in transfer_surpluses()
1980 * calculations, root_iocg->hweight_donating might still end up equal to in transfer_surpluses()
1983 * gamma = (1 - t_r') / (1 - t_r) in transfer_surpluses()
1986 (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE, in transfer_surpluses()
1987 WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1)); in transfer_surpluses()
1998 if (iocg->level == 0) { in transfer_surpluses()
2000 iocg->child_adjusted_sum = DIV64_U64_ROUND_UP( in transfer_surpluses()
2001 iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating), in transfer_surpluses()
2002 WEIGHT_ONE - iocg->hweight_after_donation); in transfer_surpluses()
2006 parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
2009 iocg->hweight_inuse = DIV64_U64_ROUND_UP( in transfer_surpluses()
2010 (u64)gamma * (iocg->hweight_active - iocg->hweight_donating), in transfer_surpluses()
2011 WEIGHT_ONE) + iocg->hweight_after_donation; in transfer_surpluses()
2015 (u64)parent->child_adjusted_sum * iocg->hweight_inuse, in transfer_surpluses()
2016 parent->hweight_inuse); in transfer_surpluses()
2020 iocg->child_active_sum * iocg->hweight_donating, in transfer_surpluses()
2021 iocg->hweight_active); in transfer_surpluses()
2022 sf = iocg->child_active_sum - st; in transfer_surpluses()
2024 (u64)iocg->active * iocg->hweight_donating, in transfer_surpluses()
2025 iocg->hweight_active); in transfer_surpluses()
2027 (u64)inuse * iocg->hweight_after_donation, in transfer_surpluses()
2028 iocg->hweight_inuse); in transfer_surpluses()
2030 iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt); in transfer_surpluses()
2034 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and in transfer_surpluses()
2038 struct ioc_gq *parent = iocg->ancestors[iocg->level - 1]; in transfer_surpluses()
2042 * In-debt iocgs participated in the donation calculation with in transfer_surpluses()
2045 * @iocg->inuse stay at the minimum and we don't wanna in transfer_surpluses()
2048 if (iocg->abs_vdebt) { in transfer_surpluses()
2049 WARN_ON_ONCE(iocg->inuse > 1); in transfer_surpluses()
2055 parent->child_adjusted_sum * iocg->hweight_after_donation, in transfer_surpluses()
2056 parent->hweight_inuse); in transfer_surpluses()
2059 iocg->inuse, inuse, in transfer_surpluses()
2060 iocg->hweight_inuse, in transfer_surpluses()
2061 iocg->hweight_after_donation); in transfer_surpluses()
2063 __propagate_weights(iocg, iocg->active, inuse, true, now); in transfer_surpluses()
2068 list_del_init(&iocg->walk_list); in transfer_surpluses()
2075 * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2090 ioc->dfgv_period_at = now->now; in ioc_forgive_debts()
2091 ioc->dfgv_period_rem = 0; in ioc_forgive_debts()
2092 ioc->dfgv_usage_us_sum = 0; in ioc_forgive_debts()
2102 if (ioc->busy_level > 0) in ioc_forgive_debts()
2103 usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us); in ioc_forgive_debts()
2105 ioc->dfgv_usage_us_sum += usage_us_sum; in ioc_forgive_debts()
2106 if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD)) in ioc_forgive_debts()
2113 dur = now->now - ioc->dfgv_period_at; in ioc_forgive_debts()
2114 usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur); in ioc_forgive_debts()
2116 ioc->dfgv_period_at = now->now; in ioc_forgive_debts()
2117 ioc->dfgv_usage_us_sum = 0; in ioc_forgive_debts()
2121 ioc->dfgv_period_rem = 0; in ioc_forgive_debts()
2130 * run and carrying over the left-over duration in @ioc->dfgv_period_rem in ioc_forgive_debts()
2131 * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive in ioc_forgive_debts()
2134 nr_cycles = dur + ioc->dfgv_period_rem; in ioc_forgive_debts()
2135 ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD); in ioc_forgive_debts()
2137 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { in ioc_forgive_debts()
2140 if (!iocg->abs_vdebt && !iocg->delay) in ioc_forgive_debts()
2143 spin_lock(&iocg->waitq.lock); in ioc_forgive_debts()
2145 old_debt = iocg->abs_vdebt; in ioc_forgive_debts()
2146 old_delay = iocg->delay; in ioc_forgive_debts()
2148 nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1); in ioc_forgive_debts()
2149 if (iocg->abs_vdebt) in ioc_forgive_debts()
2150 iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1; in ioc_forgive_debts()
2152 if (iocg->delay) in ioc_forgive_debts()
2153 iocg->delay = iocg->delay >> nr_cycles_shift ?: 1; in ioc_forgive_debts()
2158 old_debt, iocg->abs_vdebt, in ioc_forgive_debts()
2159 old_delay, iocg->delay); in ioc_forgive_debts()
2161 spin_unlock(&iocg->waitq.lock); in ioc_forgive_debts()
2180 list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) { in ioc_check_iocgs()
2181 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && in ioc_check_iocgs()
2182 !iocg->delay && !iocg_is_idle(iocg)) in ioc_check_iocgs()
2185 spin_lock(&iocg->waitq.lock); in ioc_check_iocgs()
2188 if (iocg->wait_since) { in ioc_check_iocgs()
2189 iocg->stat.wait_us += now->now - iocg->wait_since; in ioc_check_iocgs()
2190 iocg->wait_since = now->now; in ioc_check_iocgs()
2192 if (iocg->indebt_since) { in ioc_check_iocgs()
2193 iocg->stat.indebt_us += in ioc_check_iocgs()
2194 now->now - iocg->indebt_since; in ioc_check_iocgs()
2195 iocg->indebt_since = now->now; in ioc_check_iocgs()
2197 if (iocg->indelay_since) { in ioc_check_iocgs()
2198 iocg->stat.indelay_us += in ioc_check_iocgs()
2199 now->now - iocg->indelay_since; in ioc_check_iocgs()
2200 iocg->indelay_since = now->now; in ioc_check_iocgs()
2203 if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt || in ioc_check_iocgs()
2204 iocg->delay) { in ioc_check_iocgs()
2207 if (iocg->abs_vdebt || iocg->delay) in ioc_check_iocgs()
2211 u64 vtime = atomic64_read(&iocg->vtime); in ioc_check_iocgs()
2220 excess = now->vnow - vtime - ioc->margins.target; in ioc_check_iocgs()
2225 ioc->vtime_err -= div64_u64(excess * old_hwi, in ioc_check_iocgs()
2230 atomic64_read(&iocg->active_period), in ioc_check_iocgs()
2231 atomic64_read(&ioc->cur_period), vtime); in ioc_check_iocgs()
2233 list_del_init(&iocg->active_list); in ioc_check_iocgs()
2236 spin_unlock(&iocg->waitq.lock); in ioc_check_iocgs()
2261 spin_lock_irq(&ioc->lock); in ioc_timer_fn()
2263 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM]; in ioc_timer_fn()
2264 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM]; in ioc_timer_fn()
2267 period_vtime = now.vnow - ioc->period_at_vtime; in ioc_timer_fn()
2269 spin_unlock_irq(&ioc->lock); in ioc_timer_fn()
2277 * below needs updated usage stat. Let's bring stat up-to-date. in ioc_timer_fn()
2279 iocg_flush_stat(&ioc->active_iocgs, &now); in ioc_timer_fn()
2282 list_for_each_entry(iocg, &ioc->active_iocgs, active_list) { in ioc_timer_fn()
2290 vdone = atomic64_read(&iocg->done_vtime); in ioc_timer_fn()
2291 vtime = atomic64_read(&iocg->vtime); in ioc_timer_fn()
2296 * in-flight for longer than a period. Detect them by in ioc_timer_fn()
2301 !atomic_read(&iocg_to_blkg(iocg)->use_delay) && in ioc_timer_fn()
2303 time_after64(vtime, now.vnow - in ioc_timer_fn()
2305 time_before64(vdone, now.vnow - period_vtime)) in ioc_timer_fn()
2309 * Determine absolute usage factoring in in-flight IOs to avoid in ioc_timer_fn()
2310 * high-latency completions appearing as idle. in ioc_timer_fn()
2312 usage_us = iocg->usage_delta_us; in ioc_timer_fn()
2316 WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); in ioc_timer_fn()
2318 (!waitqueue_active(&iocg->waitq) && in ioc_timer_fn()
2319 time_before64(vtime, now.vnow - ioc->margins.low))) { in ioc_timer_fn()
2325 cost_to_abs_cost(vtime - vdone, hw_inuse), in ioc_timer_fn()
2326 ioc->vtime_base_rate); in ioc_timer_fn()
2332 if (time_after64(iocg->activated_at, ioc->period_at)) in ioc_timer_fn()
2333 usage_dur = max_t(u64, now.now - iocg->activated_at, 1); in ioc_timer_fn()
2335 usage_dur = max_t(u64, now.now - ioc->period_at, 1); in ioc_timer_fn()
2358 iocg->hweight_donating = hwa; in ioc_timer_fn()
2359 iocg->hweight_after_donation = new_hwi; in ioc_timer_fn()
2360 list_add(&iocg->surplus_list, &surpluses); in ioc_timer_fn()
2361 } else if (!iocg->abs_vdebt) { in ioc_timer_fn()
2373 iocg->inuse, iocg->active, in ioc_timer_fn()
2374 iocg->hweight_inuse, new_hwi); in ioc_timer_fn()
2376 __propagate_weights(iocg, iocg->active, in ioc_timer_fn()
2377 iocg->active, true, &now); in ioc_timer_fn()
2393 list_del_init(&iocg->surplus_list); in ioc_timer_fn()
2401 prev_busy_level = ioc->busy_level; in ioc_timer_fn()
2406 ioc->busy_level = max(ioc->busy_level, 0); in ioc_timer_fn()
2407 ioc->busy_level++; in ioc_timer_fn()
2417 ioc->busy_level = min(ioc->busy_level, 0); in ioc_timer_fn()
2424 ioc->busy_level--; in ioc_timer_fn()
2432 ioc->busy_level = 0; in ioc_timer_fn()
2436 ioc->busy_level = 0; in ioc_timer_fn()
2439 ioc->busy_level = clamp(ioc->busy_level, -1000, 1000); in ioc_timer_fn()
2452 atomic64_inc(&ioc->cur_period); in ioc_timer_fn()
2454 if (ioc->running != IOC_STOP) { in ioc_timer_fn()
2455 if (!list_empty(&ioc->active_iocgs)) { in ioc_timer_fn()
2458 ioc->busy_level = 0; in ioc_timer_fn()
2459 ioc->vtime_err = 0; in ioc_timer_fn()
2460 ioc->running = IOC_IDLE; in ioc_timer_fn()
2466 spin_unlock_irq(&ioc->lock); in ioc_timer_fn()
2472 struct ioc *ioc = iocg->ioc; in adjust_inuse_and_calc_cost()
2473 struct ioc_margins *margins = &ioc->margins; in adjust_inuse_and_calc_cost()
2474 u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi; in adjust_inuse_and_calc_cost()
2477 u64 cost, new_inuse; in adjust_inuse_and_calc_cost() local
2482 cost = abs_cost_to_cost(abs_cost, hwi); in adjust_inuse_and_calc_cost()
2483 margin = now->vnow - vtime - cost; in adjust_inuse_and_calc_cost()
2486 if (iocg->abs_vdebt) in adjust_inuse_and_calc_cost()
2487 return cost; in adjust_inuse_and_calc_cost()
2493 if (margin >= iocg->saved_margin || margin >= margins->low || in adjust_inuse_and_calc_cost()
2494 iocg->inuse == iocg->active) in adjust_inuse_and_calc_cost()
2495 return cost; in adjust_inuse_and_calc_cost()
2497 spin_lock_irqsave(&ioc->lock, flags); in adjust_inuse_and_calc_cost()
2500 if (iocg->abs_vdebt || list_empty(&iocg->active_list)) { in adjust_inuse_and_calc_cost()
2501 spin_unlock_irqrestore(&ioc->lock, flags); in adjust_inuse_and_calc_cost()
2502 return cost; in adjust_inuse_and_calc_cost()
2507 * adj_step must be determined after acquiring ioc->lock - we might in adjust_inuse_and_calc_cost()
2509 * be reading 0 iocg->active before ioc->lock which will lead to in adjust_inuse_and_calc_cost()
2512 new_inuse = iocg->inuse; in adjust_inuse_and_calc_cost()
2513 adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100); in adjust_inuse_and_calc_cost()
2516 propagate_weights(iocg, iocg->active, new_inuse, true, now); in adjust_inuse_and_calc_cost()
2518 cost = abs_cost_to_cost(abs_cost, hwi); in adjust_inuse_and_calc_cost()
2519 } while (time_after64(vtime + cost, now->vnow) && in adjust_inuse_and_calc_cost()
2520 iocg->inuse != iocg->active); in adjust_inuse_and_calc_cost()
2522 spin_unlock_irqrestore(&ioc->lock, flags); in adjust_inuse_and_calc_cost()
2525 old_inuse, iocg->inuse, old_hwi, hwi); in adjust_inuse_and_calc_cost()
2527 return cost; in adjust_inuse_and_calc_cost()
2533 struct ioc *ioc = iocg->ioc; in calc_vtime_cost_builtin()
2537 u64 cost = 0; in calc_vtime_cost_builtin() local
2539 /* Can't calculate cost for empty bio */ in calc_vtime_cost_builtin()
2540 if (!bio->bi_iter.bi_size) in calc_vtime_cost_builtin()
2545 coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO]; in calc_vtime_cost_builtin()
2546 coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO]; in calc_vtime_cost_builtin()
2547 coef_page = ioc->params.lcoefs[LCOEF_RPAGE]; in calc_vtime_cost_builtin()
2550 coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO]; in calc_vtime_cost_builtin()
2551 coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO]; in calc_vtime_cost_builtin()
2552 coef_page = ioc->params.lcoefs[LCOEF_WPAGE]; in calc_vtime_cost_builtin()
2558 if (iocg->cursor) { in calc_vtime_cost_builtin()
2559 seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor); in calc_vtime_cost_builtin()
2565 cost += coef_randio; in calc_vtime_cost_builtin()
2567 cost += coef_seqio; in calc_vtime_cost_builtin()
2570 cost += pages * coef_page; in calc_vtime_cost_builtin()
2572 *costp = cost; in calc_vtime_cost_builtin()
2577 u64 cost; in calc_vtime_cost() local
2579 calc_vtime_cost_builtin(bio, iocg, is_merge, &cost); in calc_vtime_cost()
2580 return cost; in calc_vtime_cost()
2590 *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE]; in calc_size_vtime_cost_builtin()
2593 *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE]; in calc_size_vtime_cost_builtin()
2602 u64 cost; in calc_size_vtime_cost() local
2604 calc_size_vtime_cost_builtin(rq, ioc, &cost); in calc_size_vtime_cost()
2605 return cost; in calc_size_vtime_cost()
2610 struct blkcg_gq *blkg = bio->bi_blkg; in ioc_rqos_throttle()
2615 u64 abs_cost, cost, vtime; in ioc_rqos_throttle() local
2620 if (!ioc->enabled || !iocg || !iocg->level) in ioc_rqos_throttle()
2623 /* calculate the absolute vtime cost */ in ioc_rqos_throttle()
2631 iocg->cursor = bio_end_sector(bio); in ioc_rqos_throttle()
2632 vtime = atomic64_read(&iocg->vtime); in ioc_rqos_throttle()
2633 cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); in ioc_rqos_throttle()
2637 * tests are racy but the races aren't systemic - we only miss once in ioc_rqos_throttle()
2640 if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt && in ioc_rqos_throttle()
2641 time_before_eq64(vtime + cost, now.vnow)) { in ioc_rqos_throttle()
2642 iocg_commit_bio(iocg, bio, abs_cost, cost); in ioc_rqos_throttle()
2648 * cause priority inversions are punted to @ioc->aux_iocg and charged as in ioc_rqos_throttle()
2649 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling in ioc_rqos_throttle()
2650 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine in ioc_rqos_throttle()
2654 ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt); in ioc_rqos_throttle()
2660 * is synchronized against both ioc->lock and waitq.lock and we won't in ioc_rqos_throttle()
2665 if (unlikely(list_empty(&iocg->active_list))) { in ioc_rqos_throttle()
2667 iocg_commit_bio(iocg, bio, abs_cost, cost); in ioc_rqos_throttle()
2691 blkcg_schedule_throttle(rqos->disk, in ioc_rqos_throttle()
2692 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); in ioc_rqos_throttle()
2698 if (!iocg->abs_vdebt && iocg->inuse != iocg->active) { in ioc_rqos_throttle()
2704 propagate_weights(iocg, iocg->active, iocg->active, true, in ioc_rqos_throttle()
2712 * or too long. Each wait entry records the absolute cost it's in ioc_rqos_throttle()
2713 * waiting for to allow re-evaluation using a custom wait entry. in ioc_rqos_throttle()
2718 * All waiters are on iocg->waitq and the wait states are in ioc_rqos_throttle()
2727 __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait); in ioc_rqos_throttle()
2740 finish_wait(&iocg->waitq, &wait.wait); in ioc_rqos_throttle()
2746 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); in ioc_rqos_merge()
2750 u64 vtime, abs_cost, cost; in ioc_rqos_merge() local
2754 if (!ioc->enabled || !iocg || !iocg->level) in ioc_rqos_merge()
2763 vtime = atomic64_read(&iocg->vtime); in ioc_rqos_merge()
2764 cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now); in ioc_rqos_merge()
2768 blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor) in ioc_rqos_merge()
2769 iocg->cursor = bio_end; in ioc_rqos_merge()
2773 * cost assigned. in ioc_rqos_merge()
2775 if (rq->bio && rq->bio->bi_iocost_cost && in ioc_rqos_merge()
2776 time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) { in ioc_rqos_merge()
2777 iocg_commit_bio(iocg, bio, abs_cost, cost); in ioc_rqos_merge()
2786 spin_lock_irqsave(&ioc->lock, flags); in ioc_rqos_merge()
2787 spin_lock(&iocg->waitq.lock); in ioc_rqos_merge()
2789 if (likely(!list_empty(&iocg->active_list))) { in ioc_rqos_merge()
2792 blkcg_schedule_throttle(rqos->disk, in ioc_rqos_merge()
2793 (bio->bi_opf & REQ_SWAP) == REQ_SWAP); in ioc_rqos_merge()
2795 iocg_commit_bio(iocg, bio, abs_cost, cost); in ioc_rqos_merge()
2798 spin_unlock(&iocg->waitq.lock); in ioc_rqos_merge()
2799 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_rqos_merge()
2804 struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg); in ioc_rqos_done_bio()
2806 if (iocg && bio->bi_iocost_cost) in ioc_rqos_done_bio()
2807 atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime); in ioc_rqos_done_bio()
2817 if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns) in ioc_rqos_done()
2833 on_q_ns = ktime_get_ns() - rq->alloc_time_ns; in ioc_rqos_done()
2834 rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns; in ioc_rqos_done()
2837 ccs = get_cpu_ptr(ioc->pcpu_stat); in ioc_rqos_done()
2840 on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC) in ioc_rqos_done()
2841 local_inc(&ccs->missed[rw].nr_met); in ioc_rqos_done()
2843 local_inc(&ccs->missed[rw].nr_missed); in ioc_rqos_done()
2845 local64_add(rq_wait_ns, &ccs->rq_wait_ns); in ioc_rqos_done()
2854 spin_lock_irq(&ioc->lock); in ioc_rqos_queue_depth_changed()
2856 spin_unlock_irq(&ioc->lock); in ioc_rqos_queue_depth_changed()
2863 blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost); in ioc_rqos_exit()
2865 spin_lock_irq(&ioc->lock); in ioc_rqos_exit()
2866 ioc->running = IOC_STOP; in ioc_rqos_exit()
2867 spin_unlock_irq(&ioc->lock); in ioc_rqos_exit()
2869 timer_shutdown_sync(&ioc->timer); in ioc_rqos_exit()
2870 free_percpu(ioc->pcpu_stat); in ioc_rqos_exit()
2890 return -ENOMEM; in blk_iocost_init()
2892 ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat); in blk_iocost_init()
2893 if (!ioc->pcpu_stat) { in blk_iocost_init()
2895 return -ENOMEM; in blk_iocost_init()
2899 struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu); in blk_iocost_init()
2901 for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) { in blk_iocost_init()
2902 local_set(&ccs->missed[i].nr_met, 0); in blk_iocost_init()
2903 local_set(&ccs->missed[i].nr_missed, 0); in blk_iocost_init()
2905 local64_set(&ccs->rq_wait_ns, 0); in blk_iocost_init()
2908 spin_lock_init(&ioc->lock); in blk_iocost_init()
2909 timer_setup(&ioc->timer, ioc_timer_fn, 0); in blk_iocost_init()
2910 INIT_LIST_HEAD(&ioc->active_iocgs); in blk_iocost_init()
2912 ioc->running = IOC_IDLE; in blk_iocost_init()
2913 ioc->vtime_base_rate = VTIME_PER_USEC; in blk_iocost_init()
2914 atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC); in blk_iocost_init()
2915 seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock); in blk_iocost_init()
2916 ioc->period_at = ktime_to_us(ktime_get()); in blk_iocost_init()
2917 atomic64_set(&ioc->cur_period, 0); in blk_iocost_init()
2918 atomic_set(&ioc->hweight_gen, 0); in blk_iocost_init()
2920 spin_lock_irq(&ioc->lock); in blk_iocost_init()
2921 ioc->autop_idx = AUTOP_INVALID; in blk_iocost_init()
2923 spin_unlock_irq(&ioc->lock); in blk_iocost_init()
2931 ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops); in blk_iocost_init()
2941 rq_qos_del(&ioc->rqos); in blk_iocost_init()
2943 free_percpu(ioc->pcpu_stat); in blk_iocost_init()
2956 iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE; in ioc_cpd_alloc()
2957 return &iocc->cpd; in ioc_cpd_alloc()
2968 int levels = blkcg->css.cgroup->level + 1; in ioc_pd_alloc()
2972 disk->node_id); in ioc_pd_alloc()
2976 iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp); in ioc_pd_alloc()
2977 if (!iocg->pcpu_stat) { in ioc_pd_alloc()
2982 return &iocg->pd; in ioc_pd_alloc()
2988 struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd); in ioc_pd_init()
2989 struct ioc *ioc = q_to_ioc(blkg->q); in ioc_pd_init()
2996 iocg->ioc = ioc; in ioc_pd_init()
2997 atomic64_set(&iocg->vtime, now.vnow); in ioc_pd_init()
2998 atomic64_set(&iocg->done_vtime, now.vnow); in ioc_pd_init()
2999 atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period)); in ioc_pd_init()
3000 INIT_LIST_HEAD(&iocg->active_list); in ioc_pd_init()
3001 INIT_LIST_HEAD(&iocg->walk_list); in ioc_pd_init()
3002 INIT_LIST_HEAD(&iocg->surplus_list); in ioc_pd_init()
3003 iocg->hweight_active = WEIGHT_ONE; in ioc_pd_init()
3004 iocg->hweight_inuse = WEIGHT_ONE; in ioc_pd_init()
3006 init_waitqueue_head(&iocg->waitq); in ioc_pd_init()
3007 hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS); in ioc_pd_init()
3008 iocg->waitq_timer.function = iocg_waitq_timer_fn; in ioc_pd_init()
3010 iocg->level = blkg->blkcg->css.cgroup->level; in ioc_pd_init()
3012 for (tblkg = blkg; tblkg; tblkg = tblkg->parent) { in ioc_pd_init()
3014 iocg->ancestors[tiocg->level] = tiocg; in ioc_pd_init()
3017 spin_lock_irqsave(&ioc->lock, flags); in ioc_pd_init()
3019 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_pd_init()
3025 struct ioc *ioc = iocg->ioc; in ioc_pd_free()
3029 spin_lock_irqsave(&ioc->lock, flags); in ioc_pd_free()
3031 if (!list_empty(&iocg->active_list)) { in ioc_pd_free()
3036 list_del_init(&iocg->active_list); in ioc_pd_free()
3039 WARN_ON_ONCE(!list_empty(&iocg->walk_list)); in ioc_pd_free()
3040 WARN_ON_ONCE(!list_empty(&iocg->surplus_list)); in ioc_pd_free()
3042 spin_unlock_irqrestore(&ioc->lock, flags); in ioc_pd_free()
3044 hrtimer_cancel(&iocg->waitq_timer); in ioc_pd_free()
3046 free_percpu(iocg->pcpu_stat); in ioc_pd_free()
3053 struct ioc *ioc = iocg->ioc; in ioc_pd_stat()
3055 if (!ioc->enabled) in ioc_pd_stat()
3058 if (iocg->level == 0) { in ioc_pd_stat()
3060 ioc->vtime_base_rate * 10000, in ioc_pd_stat()
3062 seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100); in ioc_pd_stat()
3065 seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us); in ioc_pd_stat()
3068 seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu", in ioc_pd_stat()
3069 iocg->last_stat.wait_us, in ioc_pd_stat()
3070 iocg->last_stat.indebt_us, in ioc_pd_stat()
3071 iocg->last_stat.indelay_us); in ioc_pd_stat()
3077 const char *dname = blkg_dev_name(pd->blkg); in ioc_weight_prfill()
3080 if (dname && iocg->cfg_weight) in ioc_weight_prfill()
3081 seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE); in ioc_weight_prfill()
3091 seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE); in ioc_weight_show()
3093 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_weight_show()
3112 return -EINVAL; in ioc_weight_write()
3115 return -EINVAL; in ioc_weight_write()
3117 spin_lock_irq(&blkcg->lock); in ioc_weight_write()
3118 iocc->dfl_weight = v * WEIGHT_ONE; in ioc_weight_write()
3119 hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) { in ioc_weight_write()
3123 spin_lock(&iocg->ioc->lock); in ioc_weight_write()
3124 ioc_now(iocg->ioc, &now); in ioc_weight_write()
3126 spin_unlock(&iocg->ioc->lock); in ioc_weight_write()
3129 spin_unlock_irq(&blkcg->lock); in ioc_weight_write()
3151 spin_lock(&iocg->ioc->lock); in ioc_weight_write()
3152 iocg->cfg_weight = v * WEIGHT_ONE; in ioc_weight_write()
3153 ioc_now(iocg->ioc, &now); in ioc_weight_write()
3155 spin_unlock(&iocg->ioc->lock); in ioc_weight_write()
3161 ret = -EINVAL; in ioc_weight_write()
3170 const char *dname = blkg_dev_name(pd->blkg); in ioc_qos_prfill()
3171 struct ioc *ioc = pd_to_iocg(pd)->ioc; in ioc_qos_prfill()
3176 spin_lock_irq(&ioc->lock); in ioc_qos_prfill()
3178 dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto", in ioc_qos_prfill()
3179 ioc->params.qos[QOS_RPPM] / 10000, in ioc_qos_prfill()
3180 ioc->params.qos[QOS_RPPM] % 10000 / 100, in ioc_qos_prfill()
3181 ioc->params.qos[QOS_RLAT], in ioc_qos_prfill()
3182 ioc->params.qos[QOS_WPPM] / 10000, in ioc_qos_prfill()
3183 ioc->params.qos[QOS_WPPM] % 10000 / 100, in ioc_qos_prfill()
3184 ioc->params.qos[QOS_WLAT], in ioc_qos_prfill()
3185 ioc->params.qos[QOS_MIN] / 10000, in ioc_qos_prfill()
3186 ioc->params.qos[QOS_MIN] % 10000 / 100, in ioc_qos_prfill()
3187 ioc->params.qos[QOS_MAX] / 10000, in ioc_qos_prfill()
3188 ioc->params.qos[QOS_MAX] % 10000 / 100); in ioc_qos_prfill()
3189 spin_unlock_irq(&ioc->lock); in ioc_qos_prfill()
3198 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_qos_show()
3236 disk = ctx.bdev->bd_disk; in ioc_qos_write()
3237 if (!queue_is_mq(disk->queue)) { in ioc_qos_write()
3238 ret = -EOPNOTSUPP; in ioc_qos_write()
3242 ioc = q_to_ioc(disk->queue); in ioc_qos_write()
3247 ioc = q_to_ioc(disk->queue); in ioc_qos_write()
3250 blk_mq_freeze_queue(disk->queue); in ioc_qos_write()
3251 blk_mq_quiesce_queue(disk->queue); in ioc_qos_write()
3253 spin_lock_irq(&ioc->lock); in ioc_qos_write()
3254 memcpy(qos, ioc->params.qos, sizeof(qos)); in ioc_qos_write()
3255 enable = ioc->enabled; in ioc_qos_write()
3256 user = ioc->user_qos_params; in ioc_qos_write()
3324 if (enable && !ioc->enabled) { in ioc_qos_write()
3325 blk_stat_enable_accounting(disk->queue); in ioc_qos_write()
3326 blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); in ioc_qos_write()
3327 ioc->enabled = true; in ioc_qos_write()
3328 } else if (!enable && ioc->enabled) { in ioc_qos_write()
3329 blk_stat_disable_accounting(disk->queue); in ioc_qos_write()
3330 blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue); in ioc_qos_write()
3331 ioc->enabled = false; in ioc_qos_write()
3335 memcpy(ioc->params.qos, qos, sizeof(qos)); in ioc_qos_write()
3336 ioc->user_qos_params = true; in ioc_qos_write()
3338 ioc->user_qos_params = false; in ioc_qos_write()
3342 spin_unlock_irq(&ioc->lock); in ioc_qos_write()
3349 blk_mq_unquiesce_queue(disk->queue); in ioc_qos_write()
3350 blk_mq_unfreeze_queue(disk->queue); in ioc_qos_write()
3355 spin_unlock_irq(&ioc->lock); in ioc_qos_write()
3357 blk_mq_unquiesce_queue(disk->queue); in ioc_qos_write()
3358 blk_mq_unfreeze_queue(disk->queue); in ioc_qos_write()
3360 ret = -EINVAL; in ioc_qos_write()
3369 const char *dname = blkg_dev_name(pd->blkg); in ioc_cost_model_prfill()
3370 struct ioc *ioc = pd_to_iocg(pd)->ioc; in ioc_cost_model_prfill()
3371 u64 *u = ioc->params.i_lcoefs; in ioc_cost_model_prfill()
3376 spin_lock_irq(&ioc->lock); in ioc_cost_model_prfill()
3380 dname, ioc->user_cost_model ? "user" : "auto", in ioc_cost_model_prfill()
3383 spin_unlock_irq(&ioc->lock); in ioc_cost_model_prfill()
3392 &blkcg_policy_iocost, seq_cft(sf)->private, false); in ioc_cost_model_show()
3432 ret = -EOPNOTSUPP; in ioc_cost_model_write()
3438 ret = blk_iocost_init(ctx.bdev->bd_disk); in ioc_cost_model_write()
3447 spin_lock_irq(&ioc->lock); in ioc_cost_model_write()
3448 memcpy(u, ioc->params.i_lcoefs, sizeof(u)); in ioc_cost_model_write()
3449 user = ioc->user_cost_model; in ioc_cost_model_write()
3487 memcpy(ioc->params.i_lcoefs, u, sizeof(u)); in ioc_cost_model_write()
3488 ioc->user_cost_model = true; in ioc_cost_model_write()
3490 ioc->user_cost_model = false; in ioc_cost_model_write()
3493 spin_unlock_irq(&ioc->lock); in ioc_cost_model_write()
3502 spin_unlock_irq(&ioc->lock); in ioc_cost_model_write()
3507 ret = -EINVAL; in ioc_cost_model_write()
3521 .name = "cost.qos",
3527 .name = "cost.model",