/* SPDX-License-Identifier: GPL-2.0
 *
 * IO cost model based controller.
 *
 * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
 * Copyright (C) 2019 Andy Newell <newella@fb.com>
 * Copyright (C) 2019 Facebook
 *
 * One challenge of controlling IO resources is the lack of a trivially
 * observable cost metric. This is distinguished from CPU and memory where
 * wallclock time and the number of bytes can serve as accurate enough
 * approximations.
 *
 * Bandwidth and iops are the most commonly used metrics for IO devices but
 * depending on the type and specifics of the device, different IO patterns
 * easily lead to multiple orders of magnitude variations rendering them
 * useless for the purpose of IO capacity distribution. While on-device
 * time, with a lot of crutches, could serve as a useful approximation for
 * non-queued rotational devices, this is no longer viable with modern
 * devices, even the rotational ones.
 *
 * While there is no cost metric we can trivially observe, it isn't a
 * complete mystery. For example, on a rotational device, seek cost
 * dominates while a contiguous transfer contributes a smaller amount
 * proportional to the size. If we can characterize at least the relative
 * costs of these different types of IOs, it should be possible to
 * implement a reasonable work-conserving proportional IO resource
 * distribution.
 *
 * 1. IO Cost Model
 *
 * The IO cost model estimates the cost of an IO given its basic parameters
 * and history (e.g. the end sector of the last IO). The cost is measured
 * in device time. If a given IO is estimated to cost 10ms, the device
 * should be able to process ~100 of those IOs in a second.
 *
 * Currently, there's only one builtin cost model - linear. Each IO is
 * classified as sequential or random and given a base cost accordingly.
 * On top of that, a size cost proportional to the length of the IO is
 * added. While simple, this model captures the operational
 * characteristics of a wide variety of devices well enough. Default
 * parameters for several different classes of devices are provided and the
 * parameters can be configured from userspace via
 * /sys/fs/cgroup/io.cost.model.
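 *
 * For example, a custom linear model for a device at MAJ:MIN 8:16 could be
 * installed by writing a line like the following to io.cost.model (key
 * names per Documentation/admin-guide/cgroup-v2.rst; the coefficients are
 * the builtin AUTOP_SSD_DFL ones from this file, reused purely for
 * illustration):
 *
 *	8:16 ctrl=user model=linear rbps=488636629 rseqiops=8932
 *	     rrandiops=8518 wbps=427891549 wseqiops=28755 wrandiops=21940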
 *
 * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
 * device-specific coefficients.
 *
 * 2. Control Strategy
 *
 * The device virtual time (vtime) is used as the primary control metric.
 * The control strategy is composed of the following three parts.
 *
 * 2-1. Vtime Distribution
 *
 * When a cgroup becomes active in terms of IOs, its hierarchical share is
 * calculated. Please consider the following hierarchy where the numbers
 * inside parentheses denote the configured weights.
 *
 *                root
 *              /       \
 *           A (w:100)  B (w:300)
 *          /       \
 *  A0 (w:100)  A1 (w:100)
 *
 * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
 * of equal weight, each gets 50% share. If then B starts issuing IOs, B
 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
 * 12.5% each. The distribution mechanism only cares about these flattened
 * shares. They're called hweights (hierarchical weights) and always add
 * up to 1 (WEIGHT_ONE).
 *
 * A given cgroup's vtime runs slower in inverse proportion to its hweight.
 * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
 * against the device vtime - an IO which takes 10ms on the underlying
 * device is considered to take 80ms on A0.
 *
 * This constitutes the basis of IO capacity distribution. Each cgroup's
 * vtime is running at a rate determined by its hweight. A cgroup tracks
 * the vtime consumed by past IOs and can issue a new IO if doing so
 * wouldn't outrun the current device vtime. Otherwise, the IO is
 * suspended until the vtime has progressed enough to cover it.
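 *
 * As a minimal sketch of the flattening - the in-kernel version with
 * caching and fixed-point rounding lives in current_hweight() below, and
 * the struct and field names here are purely illustrative - a node's
 * hweight is the product of the inuse ratios on the path to the root,
 * expressed in WEIGHT_ONE based fixed point:
 *
 *	u64 hweight(struct node *n)
 *	{
 *		u64 hw = WEIGHT_ONE;
 *
 *		for (; n->parent; n = n->parent)
 *			hw = hw * n->inuse / n->parent->child_inuse_sum;
 *		return hw;
 *	}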
 *
 * 2-2. Vrate Adjustment
 *
 * It's unrealistic to expect the cost model to be perfect. There are too
 * many devices and even on the same device the overall performance
 * fluctuates depending on numerous factors such as IO mixture and device
 * internal garbage collection. The controller needs to adapt dynamically.
 *
 * This is achieved by adjusting the overall IO rate according to how busy
 * the device is. If the device becomes overloaded, we're sending down too
 * many IOs and should generally slow down. If there are waiting issuers
 * but the device isn't saturated, we're issuing too few and should
 * generally speed up.
 *
 * To slow down, we lower the vrate - the rate at which the device vtime
 * passes compared to the wall clock. For example, if the vtime is running
 * at the vrate of 75%, all cgroups added up would only be able to issue
 * 750ms worth of IOs per second, and vice-versa for speeding up.
 *
 * Device busyness is determined using two criteria - rq wait and
 * completion latencies.
 *
 * When a device gets saturated, the on-device and then the request queues
 * fill up and a bio which is ready to be issued has to wait for a request
 * to become available. When this delay becomes noticeable, it's a clear
 * indication that the device is saturated and we lower the vrate. This
 * saturation signal is fairly conservative as it only triggers when both
 * hardware and software queues are filled up, and is used as the default
 * busy signal.
 *
 * As devices can have deep queues and be unfair in how the queued commands
 * are executed, solely depending on rq wait may not result in satisfactory
 * control quality. For a better control quality, completion latency QoS
 * parameters can be configured so that the device is considered saturated
 * if the N'th percentile completion latency rises above the set point.
 *
 * The completion latency requirements are a function of both the
 * underlying device characteristics and the desired IO latency quality of
 * service. There is an inherent trade-off - the tighter the latency QoS,
 * the higher the bandwidth loss. Latency QoS is disabled by default
 * and can be set through /sys/fs/cgroup/io.cost.qos.
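 *
 * For example, a line like the following written to io.cost.qos (key names
 * per Documentation/admin-guide/cgroup-v2.rst; the numbers are
 * illustrative) enables the controller on 8:16 and declares the device
 * saturated when the 95th percentile read completion latency exceeds 10ms
 * or the write latency exceeds 20ms, while clamping the vrate between 50%
 * and 150%:
 *
 *	8:16 enable=1 ctrl=user rpct=95.00 rlat=10000 wpct=95.00
 *	     wlat=20000 min=50.00 max=150.00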
 *
 * 2-3. Work Conservation
 *
 * Imagine two cgroups A and B with equal weights. A is issuing a small IO
 * periodically while B is sending out enough parallel IOs to saturate the
 * device on its own. Let's say A's usage amounts to 100ms worth of IO
 * cost per second, i.e., 10% of the device capacity. The naive
 * distribution of half and half would lead to 60% utilization of the
 * device, a significant reduction in the total amount of work done
 * compared to free-for-all competition. This is too high a cost to pay
 * for IO control.
 *
 * To conserve the total amount of work done, we keep track of how much
 * each active cgroup is actually using and yield part of its weight if
 * there are other cgroups which can make use of it. In the above case,
 * A's weight will be lowered so that it hovers above the actual usage and
 * B would be able to use the rest.
 *
 * As we don't want to penalize a cgroup for donating its weight, the
 * surplus weight adjustment factors in a margin and has an immediate
 * snapback mechanism in case the cgroup needs more IO vtime for itself.
 *
 * Note that adjusting down surplus weights has the same effects as
 * accelerating vtime for other cgroups and work conservation can also be
 * implemented by adjusting vrate dynamically. However, squaring away who
 * can donate and who should take back how much requires hweight
 * propagation anyway, making it easier to implement and understand as a
 * separate mechanism.
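 *
 * Continuing the example, A's inuse would be lowered until its
 * hweight_inuse hovers a margin above its ~10% usage, letting B's
 * hweight_inuse grow toward the remaining ~90% and recovering most of the
 * utilization lost to the naive 50/50 split. The margins involved are
 * governed by the MARGIN_*_PCT parameters defined below.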
 *
 * 3. Monitoring
 *
 * Instead of debugfs or other clumsy monitoring mechanisms, this
 * controller uses a drgn based monitoring script -
 * tools/cgroup/iocost_monitor.py. For details on drgn, please see
 * https://github.com/osandov/drgn. The output looks like the following.
 *
 *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
 *                 active      weight      hweight% inflt% dbt  delay usages%
 *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
 *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
 *
 * - per		: Timer period
 * - cur_per		: Internal wall and device vtime clock
 * - vrate		: Device virtual time rate against wall clock
 * - weight		: Surplus-adjusted and configured weights
 * - hweight		: Surplus-adjusted and configured hierarchical weights
 * - inflt		: The percentage of in-flight IO cost at the end of last period
 * - dbt, delay		: Debt level, and deferred issuer delay induction level and duration
 * - usages		: Usage history
 */

#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/timer.h>
#include <linux/time64.h>
#include <linux/parser.h>
#include <linux/sched/signal.h>
#include <asm/local.h>
#include <asm/local64.h>
#include "blk-rq-qos.h"
#include "blk-stat.h"
#include "blk-wbt.h"
#include "blk-cgroup.h"

#ifdef CONFIG_TRACEPOINTS

/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
#define TRACE_IOCG_PATH_LEN 1024
static DEFINE_SPINLOCK(trace_iocg_path_lock);
static char trace_iocg_path[TRACE_IOCG_PATH_LEN];

#define TRACE_IOCG_PATH(type, iocg, ...)					\
	do {									\
		unsigned long flags;						\
		if (trace_iocost_##type##_enabled()) {				\
			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
			trace_iocost_##type(iocg, trace_iocg_path,		\
					    ##__VA_ARGS__);			\
			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
		}								\
	} while (0)

#else	/* CONFIG_TRACEPOINTS */
#define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
#endif	/* CONFIG_TRACEPOINTS */
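
/*
 * Example usage, mirroring a call made later in this file when a cgroup
 * is activated:
 *
 *	TRACE_IOCG_PATH(iocg_activate, iocg, &now,
 *			last_period, cur_period, vtime);
 */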

enum {
	MILLION			= 1000000,

	/* timer period is calculated from latency requirements, bound it */
	MIN_PERIOD		= USEC_PER_MSEC,
	MAX_PERIOD		= USEC_PER_SEC,

	/*
	 * iocg->vtime is targeted at 50% behind the device vtime, which
	 * serves as its IO credit buffer. Surplus weight adjustment is
	 * immediately canceled if the vtime margin runs below 10%.
	 */
	MARGIN_MIN_PCT		= 10,
	MARGIN_LOW_PCT		= 20,
	MARGIN_TARGET_PCT	= 50,

	INUSE_ADJ_STEP_PCT	= 25,

	/* Have some play in timer operations */
	TIMER_SLACK_PCT		= 1,

	/* 1/64k is granular enough and can easily be handled w/ u32 */
	WEIGHT_ONE		= 1 << 16,
};

enum {
	/*
	 * As vtime is used to calculate the cost of each IO, it needs to
	 * be fairly high precision. For example, it should be able to
	 * represent the cost of a single page worth of discard with
	 * sufficient accuracy. At the same time, it should be able to
	 * represent reasonably long enough durations to be useful and
	 * convenient during operation.
	 *
	 * 1s worth of vtime is 2^37. This gives us both sub-nanosecond
	 * granularity and days of wrap-around time even at extreme vrates.
	 */
	VTIME_PER_SEC_SHIFT	= 37,
	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
	VTIME_PER_NSEC		= VTIME_PER_SEC / NSEC_PER_SEC,
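
	/*
	 * For example, VTIME_PER_USEC is 2^37 / 10^6 =~ 137438, so an IO
	 * which the model costs at 10ms of device time is charged ~1.37 *
	 * 10^9 vtime units.
	 */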

	/* bound vrate adjustments within two orders of magnitude */
	VRATE_MIN_PPM		= 10000,	/* 1% */
	VRATE_MAX_PPM		= 100000000,	/* 10000% */

	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
	VRATE_CLAMP_ADJ_PCT	= 4,

	/* switch iff the conditions are met for longer than this */
	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
};

enum {
	/* if IOs end up waiting for requests, issue less */
	RQ_WAIT_BUSY_PCT	= 5,

	/* unbusy hysteresis */
	UNBUSY_THR_PCT		= 75,

	/*
	 * The effect of delay is indirect and non-linear and a huge amount of
	 * future debt can accumulate abruptly while unthrottled. Linearly scale
	 * up delay as debt is going up and then let it decay exponentially.
	 * This gives us quick ramp ups while delay is accumulating and long
	 * tails which can help reducing the frequency of debt explosions on
	 * unthrottle. The parameters are experimentally determined.
	 *
	 * The delay mechanism provides adequate protection and behavior in many
	 * cases. However, this is far from ideal and falls short on both
	 * fronts. The debtors are often throttled too harshly, costing a
	 * significant level of fairness and possibly total work, while the
	 * protection against their impacts on the system can be choppy and
	 * unreliable.
	 *
	 * The shortcoming primarily stems from the fact that, unlike for page
	 * cache, the kernel doesn't have a well-defined back-pressure
	 * propagation mechanism and policies for anonymous memory. Fully
	 * addressing this issue will likely require substantial improvements
	 * in the area.
	 */
	MIN_DELAY_THR_PCT	= 500,
	MAX_DELAY_THR_PCT	= 25000,
	MIN_DELAY		= 250,
	MAX_DELAY		= 250 * USEC_PER_MSEC,

	/* halve debts if avg usage over 100ms is under 50% */
	DFGV_USAGE_PCT		= 50,
	DFGV_PERIOD		= 100 * USEC_PER_MSEC,

	/* don't let cmds which take a very long time pin lagging for too long */
	MAX_LAGGING_PERIODS	= 10,

	/*
	 * Count IO size in 4k pages. The 12-bit shift helps keeping
	 * size-proportional components of cost calculation in closer
	 * numbers of digits to per-IO cost components.
	 */
	IOC_PAGE_SHIFT		= 12,
	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,

	/* if further apart than 16M, consider randio for linear model */
	LCOEF_RANDIO_PAGES	= 4096,
};

enum ioc_running {
	IOC_IDLE,
	IOC_RUNNING,
	IOC_STOP,
};

/* io.cost.qos controls including per-dev enable of the whole controller */
enum {
	QOS_ENABLE,
	QOS_CTRL,
	NR_QOS_CTRL_PARAMS,
};

/* io.cost.qos params */
enum {
	QOS_RPPM,
	QOS_RLAT,
	QOS_WPPM,
	QOS_WLAT,
	QOS_MIN,
	QOS_MAX,
	NR_QOS_PARAMS,
};

/* io.cost.model controls */
enum {
	COST_CTRL,
	COST_MODEL,
	NR_COST_CTRL_PARAMS,
};

/* builtin linear cost model coefficients */
enum {
	I_LCOEF_RBPS,
	I_LCOEF_RSEQIOPS,
	I_LCOEF_RRANDIOPS,
	I_LCOEF_WBPS,
	I_LCOEF_WSEQIOPS,
	I_LCOEF_WRANDIOPS,
	NR_I_LCOEFS,
};

enum {
	LCOEF_RPAGE,
	LCOEF_RSEQIO,
	LCOEF_RRANDIO,
	LCOEF_WPAGE,
	LCOEF_WSEQIO,
	LCOEF_WRANDIO,
	NR_LCOEFS,
};

enum {
	AUTOP_INVALID,
	AUTOP_HDD,
	AUTOP_SSD_QD1,
	AUTOP_SSD_DFL,
	AUTOP_SSD_FAST,
};

struct ioc_params {
	u32				qos[NR_QOS_PARAMS];
	u64				i_lcoefs[NR_I_LCOEFS];
	u64				lcoefs[NR_LCOEFS];
	u32				too_fast_vrate_pct;
	u32				too_slow_vrate_pct;
};

struct ioc_margins {
	s64				min;
	s64				low;
	s64				target;
};

struct ioc_missed {
	local_t				nr_met;
	local_t				nr_missed;
	u32				last_met;
	u32				last_missed;
};

struct ioc_pcpu_stat {
	struct ioc_missed		missed[2];

	local64_t			rq_wait_ns;
	u64				last_rq_wait_ns;
};

/* per device */
struct ioc {
	struct rq_qos			rqos;

	bool				enabled;

	struct ioc_params		params;
	struct ioc_margins		margins;
	u32				period_us;
	u32				timer_slack_ns;
	u64				vrate_min;
	u64				vrate_max;

	spinlock_t			lock;
	struct timer_list		timer;
	struct list_head		active_iocgs;	/* active cgroups */
	struct ioc_pcpu_stat __percpu	*pcpu_stat;

	enum ioc_running		running;
	atomic64_t			vtime_rate;
	u64				vtime_base_rate;
	s64				vtime_err;

	seqcount_spinlock_t		period_seqcount;
	u64				period_at;	/* wallclock starttime */
	u64				period_at_vtime; /* vtime starttime */

	atomic64_t			cur_period;	/* inc'd each period */
	int				busy_level;	/* saturation history */

	bool				weights_updated;
	atomic_t			hweight_gen;	/* for lazy hweights */

	/* debt forgiveness */
	u64				dfgv_period_at;
	u64				dfgv_period_rem;
	u64				dfgv_usage_us_sum;

	u64				autop_too_fast_at;
	u64				autop_too_slow_at;
	int				autop_idx;
	bool				user_qos_params:1;
	bool				user_cost_model:1;
};

struct iocg_pcpu_stat {
	local64_t			abs_vusage;
};

struct iocg_stat {
	u64				usage_us;
	u64				wait_us;
	u64				indebt_us;
	u64				indelay_us;
};

/* per device-cgroup pair */
struct ioc_gq {
	struct blkg_policy_data		pd;
	struct ioc			*ioc;

	/*
	 * An iocg can get its weight from two sources - an explicit
	 * per-device-cgroup configuration or the default weight of the
	 * cgroup. `cfg_weight` is the explicit per-device-cgroup
	 * configuration. `weight` is the effective weight considering both
	 * sources.
	 *
	 * When an idle cgroup becomes active its `active` goes from 0 to
	 * `weight`. `inuse` is the surplus adjusted active weight.
	 * `active` and `inuse` are used to calculate `hweight_active` and
	 * `hweight_inuse`.
	 *
	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
	 * surplus adjustments.
	 *
	 * `inuse` may be adjusted dynamically during the period. `saved_*`
	 * are used to determine and track adjustments.
	 */
	u32				cfg_weight;
	u32				weight;
	u32				active;
	u32				inuse;

	u32				last_inuse;
	s64				saved_margin;

	sector_t			cursor;		/* to detect randio */

	/*
	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
	 * issued. If lagging behind device vtime, the delta represents
	 * the currently available IO budget. If running ahead, the
	 * overage.
	 *
	 * `vtime_done` is the same but progressed on completion rather
	 * than issue. The delta behind `vtime` represents the cost of
	 * currently in-flight IOs.
	 */
	atomic64_t			vtime;
	atomic64_t			done_vtime;
	u64				abs_vdebt;

	/* current delay in effect and when it started */
	u64				delay;
	u64				delay_at;

	/*
	 * The period this iocg was last active in. Used for deactivation
	 * and invalidating `vtime`.
	 */
	atomic64_t			active_period;
	struct list_head		active_list;

	/* see __propagate_weights() and current_hweight() for details */
	u64				child_active_sum;
	u64				child_inuse_sum;
	u64				child_adjusted_sum;
	int				hweight_gen;
	u32				hweight_active;
	u32				hweight_inuse;
	u32				hweight_donating;
	u32				hweight_after_donation;

	struct list_head		walk_list;
	struct list_head		surplus_list;

	struct wait_queue_head		waitq;
	struct hrtimer			waitq_timer;

	/* timestamp at the latest activation */
	u64				activated_at;

	/* statistics */
	struct iocg_pcpu_stat __percpu	*pcpu_stat;
	struct iocg_stat		stat;
	struct iocg_stat		last_stat;
	u64				last_stat_abs_vusage;
	u64				usage_delta_us;
	u64				wait_since;
	u64				indebt_since;
	u64				indelay_since;

	/* this iocg's depth in the hierarchy and ancestors including self */
	int				level;
	struct ioc_gq			*ancestors[];
};

/* per cgroup */
struct ioc_cgrp {
	struct blkcg_policy_data	cpd;
	unsigned int			dfl_weight;
};

struct ioc_now {
	u64				now_ns;
	u64				now;
	u64				vnow;
};

struct iocg_wait {
	struct wait_queue_entry		wait;
	struct bio			*bio;
	u64				abs_cost;
	bool				committed;
};

struct iocg_wake_ctx {
	struct ioc_gq			*iocg;
	u32				hw_inuse;
	s64				vbudget;
};

static const struct ioc_params autop[] = {
	[AUTOP_HDD] = {
		.qos				= {
			[QOS_RLAT]		= 250000, /* 250ms */
			[QOS_WLAT]		= 250000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		= 174019176,
			[I_LCOEF_RSEQIOPS]	= 41708,
			[I_LCOEF_RRANDIOPS]	= 370,
			[I_LCOEF_WBPS]		= 178075866,
			[I_LCOEF_WSEQIOPS]	= 42705,
			[I_LCOEF_WRANDIOPS]	= 378,
		},
	},
	[AUTOP_SSD_QD1] = {
		.qos				= {
			[QOS_RLAT]		= 25000, /* 25ms */
			[QOS_WLAT]		= 25000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		= 245855193,
			[I_LCOEF_RSEQIOPS]	= 61575,
			[I_LCOEF_RRANDIOPS]	= 6946,
			[I_LCOEF_WBPS]		= 141365009,
			[I_LCOEF_WSEQIOPS]	= 33716,
			[I_LCOEF_WRANDIOPS]	= 26796,
		},
	},
	[AUTOP_SSD_DFL] = {
		.qos				= {
			[QOS_RLAT]		= 25000, /* 25ms */
			[QOS_WLAT]		= 25000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		= 488636629,
			[I_LCOEF_RSEQIOPS]	= 8932,
			[I_LCOEF_RRANDIOPS]	= 8518,
			[I_LCOEF_WBPS]		= 427891549,
			[I_LCOEF_WSEQIOPS]	= 28755,
			[I_LCOEF_WRANDIOPS]	= 21940,
		},
		.too_fast_vrate_pct		= 500,
	},
	[AUTOP_SSD_FAST] = {
		.qos				= {
			[QOS_RLAT]		= 5000, /* 5ms */
			[QOS_WLAT]		= 5000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		= 3102524156LLU,
			[I_LCOEF_RSEQIOPS]	= 724816,
			[I_LCOEF_RRANDIOPS]	= 778122,
			[I_LCOEF_WBPS]		= 1742780862LLU,
			[I_LCOEF_WSEQIOPS]	= 425702,
			[I_LCOEF_WRANDIOPS]	= 443193,
		},
		.too_slow_vrate_pct		= 10,
	},
};

/*
 * vrate adjust percentages indexed by ioc->busy_level. We adjust up on
 * vtime credit shortage and down on device saturation.
 */
static u32 vrate_adj_pct[] =
	{ 0, 0, 0, 0,
	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
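
/*
 * For example, a busy_level of -4 (four consecutive periods of vtime
 * credit shortage) indexes vrate_adj_pct[4] == 1 and speeds the vrate up
 * by 1%, while a saturated busy_level of +44 indexes vrate_adj_pct[44] == 8
 * and slows it down by 8%. See ioc_adjust_base_vrate() below.
 */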

static struct blkcg_policy blkcg_policy_iocost;

/* accessors and helpers */
static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
{
	return container_of(rqos, struct ioc, rqos);
}

static struct ioc *q_to_ioc(struct request_queue *q)
{
	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
}

static const char __maybe_unused *ioc_name(struct ioc *ioc)
{
	struct gendisk *disk = ioc->rqos.disk;

	if (!disk)
		return "<unknown>";
	return disk->disk_name;
}

static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
{
	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
}

static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
{
	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
}

static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
{
	return pd_to_blkg(&iocg->pd);
}

static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
{
	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
			    struct ioc_cgrp, cpd);
}

/*
 * Scale @abs_cost to the inverse of @hw_inuse. The lower the hierarchical
 * weight, the more expensive each IO. Must round up.
 */
static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
{
	return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
}

/*
 * The inverse of abs_cost_to_cost(). Must round up.
 */
static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
{
	return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
}
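
/*
 * For example, at a hw_inuse of WEIGHT_ONE / 2 (a 50% hierarchical share),
 * abs_cost_to_cost() doubles the cost: an IO with an absolute cost of 1000
 * charges 2000 against the cgroup's vtime.
 */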

static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
			    u64 abs_cost, u64 cost)
{
	struct iocg_pcpu_stat *gcs;

	bio->bi_iocost_cost = cost;
	atomic64_add(cost, &iocg->vtime);

	gcs = get_cpu_ptr(iocg->pcpu_stat);
	local64_add(abs_cost, &gcs->abs_vusage);
	put_cpu_ptr(gcs);
}

static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
{
	if (lock_ioc) {
		spin_lock_irqsave(&iocg->ioc->lock, *flags);
		spin_lock(&iocg->waitq.lock);
	} else {
		spin_lock_irqsave(&iocg->waitq.lock, *flags);
	}
}

static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
{
	if (unlock_ioc) {
		spin_unlock(&iocg->waitq.lock);
		spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
	} else {
		spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
	}
}

#define CREATE_TRACE_POINTS
#include <trace/events/iocost.h>

static void ioc_refresh_margins(struct ioc *ioc)
{
	struct ioc_margins *margins = &ioc->margins;
	u32 period_us = ioc->period_us;
	u64 vrate = ioc->vtime_base_rate;

	margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
	margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
	margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
}

/* latency QoS params changed, update period_us and all the dependent params */
static void ioc_refresh_period_us(struct ioc *ioc)
{
	u32 ppm, lat, multi, period_us;

	lockdep_assert_held(&ioc->lock);

	/* pick the higher latency target */
	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
		ppm = ioc->params.qos[QOS_RPPM];
		lat = ioc->params.qos[QOS_RLAT];
	} else {
		ppm = ioc->params.qos[QOS_WPPM];
		lat = ioc->params.qos[QOS_WLAT];
	}

	/*
	 * We want the period to be long enough to contain a healthy number
	 * of IOs while short enough for granular control. Define it as a
	 * multiple of the latency target. Ideally, the multiplier should
	 * be scaled according to the percentile so that it would nominally
	 * contain a certain number of requests. Let's be simpler and
	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
	 */
	if (ppm)
		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
	else
		multi = 2;
	period_us = multi * lat;
	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);

	/* calculate dependent params */
	ioc->period_us = period_us;
	ioc->timer_slack_ns = div64_u64(
		(u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
		100);
	ioc_refresh_margins(ioc);
}
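
/*
 * For example, with the builtin SSD QoS of rlat = wlat = 25000 and no
 * latency percentile configured (ppm == 0), multi is 2 and period_us
 * becomes 50000 (50ms), well within the [MIN_PERIOD, MAX_PERIOD] bounds,
 * with a timer slack of TIMER_SLACK_PCT == 1%, i.e. 500us.
 */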

/*
 * ioc->rqos.disk isn't initialized when this function is called from
 * the init path.
 */
static int ioc_autop_idx(struct ioc *ioc, struct gendisk *disk)
{
	int idx = ioc->autop_idx;
	const struct ioc_params *p = &autop[idx];
	u32 vrate_pct;
	u64 now_ns;

	/* rotational? */
	if (!blk_queue_nonrot(disk->queue))
		return AUTOP_HDD;

	/* handle SATA SSDs w/ broken NCQ */
	if (blk_queue_depth(disk->queue) == 1)
		return AUTOP_SSD_QD1;

	/* use one of the normal ssd sets */
	if (idx < AUTOP_SSD_DFL)
		return AUTOP_SSD_DFL;

	/* if user is overriding anything, maintain what was there */
	if (ioc->user_qos_params || ioc->user_cost_model)
		return idx;

	/* step up/down based on the vrate */
	vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
	now_ns = ktime_get_ns();

	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
		if (!ioc->autop_too_fast_at)
			ioc->autop_too_fast_at = now_ns;
		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
			return idx + 1;
	} else {
		ioc->autop_too_fast_at = 0;
	}

	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
		if (!ioc->autop_too_slow_at)
			ioc->autop_too_slow_at = now_ns;
		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
			return idx - 1;
	} else {
		ioc->autop_too_slow_at = 0;
	}

	return idx;
}

/*
 * Take the following as input
 *
 *  @bps	maximum sequential throughput
 *  @seqiops	maximum sequential 4k iops
 *  @randiops	maximum random 4k iops
 *
 * and calculate the linear model cost coefficients.
 *
 *  *@page	per-page cost		1s / (@bps / 4096)
 *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
 *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
 */
static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
			u64 *page, u64 *seqio, u64 *randio)
{
	u64 v;

	*page = *seqio = *randio = 0;

	if (bps) {
		u64 bps_pages = DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE);

		if (bps_pages)
			*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC, bps_pages);
		else
			*page = 1;
	}

	if (seqiops) {
		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
		if (v > *page)
			*seqio = v - *page;
	}

	if (randiops) {
		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
		if (v > *page)
			*randio = v - *page;
	}
}
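
/*
 * For example, running the AUTOP_SSD_DFL read parameters above through
 * calc_lcoefs(): rbps = 488636629 is ~119297 4k pages/sec, so the per-page
 * cost is 2^37 / 119297 =~ 1.15e6 vtime. rrandiops = 8518 yields
 * 2^37 / 8518 =~ 1.61e7 vtime per IO, from which the per-page cost is
 * subtracted to arrive at the random IO base cost of ~1.50e7.
 */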
8967caa4715STejun Heo
ioc_refresh_lcoefs(struct ioc * ioc)8977caa4715STejun Heo static void ioc_refresh_lcoefs(struct ioc *ioc)
8987caa4715STejun Heo {
8997caa4715STejun Heo u64 *u = ioc->params.i_lcoefs;
9007caa4715STejun Heo u64 *c = ioc->params.lcoefs;
9017caa4715STejun Heo
9027caa4715STejun Heo calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
9037caa4715STejun Heo &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
9047caa4715STejun Heo calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
9057caa4715STejun Heo &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
9067caa4715STejun Heo }
9077caa4715STejun Heo
908e33b9365SBreno Leitao /*
909e33b9365SBreno Leitao * struct gendisk is required as an argument because ioc->rqos.disk
910e33b9365SBreno Leitao * is not properly initialized when called from the init path.
911e33b9365SBreno Leitao */
ioc_refresh_params_disk(struct ioc * ioc,bool force,struct gendisk * disk)912e33b9365SBreno Leitao static bool ioc_refresh_params_disk(struct ioc *ioc, bool force,
913e33b9365SBreno Leitao struct gendisk *disk)
9147caa4715STejun Heo {
9157caa4715STejun Heo const struct ioc_params *p;
9167caa4715STejun Heo int idx;
9177caa4715STejun Heo
9187caa4715STejun Heo lockdep_assert_held(&ioc->lock);
9197caa4715STejun Heo
920e33b9365SBreno Leitao idx = ioc_autop_idx(ioc, disk);
9217caa4715STejun Heo p = &autop[idx];
9227caa4715STejun Heo
9237caa4715STejun Heo if (idx == ioc->autop_idx && !force)
9247caa4715STejun Heo return false;
9257caa4715STejun Heo
926c6d2efddSKemeng Shi if (idx != ioc->autop_idx) {
9277caa4715STejun Heo atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
928c6d2efddSKemeng Shi ioc->vtime_base_rate = VTIME_PER_USEC;
929c6d2efddSKemeng Shi }
9307caa4715STejun Heo
9317caa4715STejun Heo ioc->autop_idx = idx;
9327caa4715STejun Heo ioc->autop_too_fast_at = 0;
9337caa4715STejun Heo ioc->autop_too_slow_at = 0;
9347caa4715STejun Heo
9357caa4715STejun Heo if (!ioc->user_qos_params)
9367caa4715STejun Heo memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
9377caa4715STejun Heo if (!ioc->user_cost_model)
9387caa4715STejun Heo memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
9397caa4715STejun Heo
9407caa4715STejun Heo ioc_refresh_period_us(ioc);
9417caa4715STejun Heo ioc_refresh_lcoefs(ioc);
9427caa4715STejun Heo
9437caa4715STejun Heo ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
9447caa4715STejun Heo VTIME_PER_USEC, MILLION);
945b3260329SLi Nan ioc->vrate_max = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MAX] *
9467caa4715STejun Heo VTIME_PER_USEC, MILLION);
9477caa4715STejun Heo
9487caa4715STejun Heo return true;
9497caa4715STejun Heo }
9507caa4715STejun Heo
ioc_refresh_params(struct ioc * ioc,bool force)951e33b9365SBreno Leitao static bool ioc_refresh_params(struct ioc *ioc, bool force)
952e33b9365SBreno Leitao {
953e33b9365SBreno Leitao return ioc_refresh_params_disk(ioc, force, ioc->rqos.disk);
954e33b9365SBreno Leitao }
955e33b9365SBreno Leitao
956ac33e91eSTejun Heo /*
957ac33e91eSTejun Heo * When an iocg accumulates too much vtime or gets deactivated, we throw away
958ac33e91eSTejun Heo * some vtime, which lowers the overall device utilization. As the exact amount
959ac33e91eSTejun Heo * which is being thrown away is known, we can compensate by accelerating the
960ac33e91eSTejun Heo * vrate accordingly so that the extra vtime generated in the current period
961ac33e91eSTejun Heo * matches what got lost.
962ac33e91eSTejun Heo */
ioc_refresh_vrate(struct ioc * ioc,struct ioc_now * now)963ac33e91eSTejun Heo static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
964ac33e91eSTejun Heo {
965ac33e91eSTejun Heo s64 pleft = ioc->period_at + ioc->period_us - now->now;
966ac33e91eSTejun Heo s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
967ac33e91eSTejun Heo s64 vcomp, vcomp_min, vcomp_max;
968ac33e91eSTejun Heo
969ac33e91eSTejun Heo lockdep_assert_held(&ioc->lock);
970ac33e91eSTejun Heo
971ac33e91eSTejun Heo /* we need some time left in this period */
972ac33e91eSTejun Heo if (pleft <= 0)
973ac33e91eSTejun Heo goto done;
974ac33e91eSTejun Heo
975ac33e91eSTejun Heo /*
976ac33e91eSTejun Heo * Calculate how much vrate should be adjusted to offset the error.
977ac33e91eSTejun Heo * Limit the amount of adjustment and deduct the adjusted amount from
978ac33e91eSTejun Heo * the error.
979ac33e91eSTejun Heo */
980ac33e91eSTejun Heo vcomp = -div64_s64(ioc->vtime_err, pleft);
981ac33e91eSTejun Heo vcomp_min = -(ioc->vtime_base_rate >> 1);
982ac33e91eSTejun Heo vcomp_max = ioc->vtime_base_rate;
983ac33e91eSTejun Heo vcomp = clamp(vcomp, vcomp_min, vcomp_max);
984ac33e91eSTejun Heo
985ac33e91eSTejun Heo ioc->vtime_err += vcomp * pleft;
986ac33e91eSTejun Heo
987ac33e91eSTejun Heo atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
988ac33e91eSTejun Heo done:
989ac33e91eSTejun Heo /* bound how much error can accumulate */
990ac33e91eSTejun Heo ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
991ac33e91eSTejun Heo }
992ac33e91eSTejun Heo
ioc_adjust_base_vrate(struct ioc * ioc,u32 rq_wait_pct,int nr_lagging,int nr_shortages,int prev_busy_level,u32 * missed_ppm)993926f75f6SBaolin Wang static void ioc_adjust_base_vrate(struct ioc *ioc, u32 rq_wait_pct,
994926f75f6SBaolin Wang int nr_lagging, int nr_shortages,
995926f75f6SBaolin Wang int prev_busy_level, u32 *missed_ppm)
996926f75f6SBaolin Wang {
997926f75f6SBaolin Wang u64 vrate = ioc->vtime_base_rate;
998926f75f6SBaolin Wang u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
999926f75f6SBaolin Wang
1000926f75f6SBaolin Wang if (!ioc->busy_level || (ioc->busy_level < 0 && nr_lagging)) {
1001926f75f6SBaolin Wang if (ioc->busy_level != prev_busy_level || nr_lagging)
100263c9eac4SKemeng Shi trace_iocost_ioc_vrate_adj(ioc, vrate,
1003926f75f6SBaolin Wang missed_ppm, rq_wait_pct,
1004926f75f6SBaolin Wang nr_lagging, nr_shortages);
1005926f75f6SBaolin Wang
1006926f75f6SBaolin Wang return;
1007926f75f6SBaolin Wang }
1008926f75f6SBaolin Wang
1009926f75f6SBaolin Wang /*
1010926f75f6SBaolin Wang * If vrate is out of bounds, apply clamp gradually as the
1011926f75f6SBaolin Wang * bounds can change abruptly. Otherwise, apply busy_level
1012926f75f6SBaolin Wang * based adjustment.
1013926f75f6SBaolin Wang */
1014926f75f6SBaolin Wang if (vrate < vrate_min) {
1015926f75f6SBaolin Wang vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT), 100);
1016926f75f6SBaolin Wang vrate = min(vrate, vrate_min);
1017926f75f6SBaolin Wang } else if (vrate > vrate_max) {
1018926f75f6SBaolin Wang vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT), 100);
1019926f75f6SBaolin Wang vrate = max(vrate, vrate_max);
1020926f75f6SBaolin Wang } else {
1021926f75f6SBaolin Wang int idx = min_t(int, abs(ioc->busy_level),
1022926f75f6SBaolin Wang ARRAY_SIZE(vrate_adj_pct) - 1);
1023926f75f6SBaolin Wang u32 adj_pct = vrate_adj_pct[idx];
1024926f75f6SBaolin Wang
1025926f75f6SBaolin Wang if (ioc->busy_level > 0)
1026926f75f6SBaolin Wang adj_pct = 100 - adj_pct;
1027926f75f6SBaolin Wang else
1028926f75f6SBaolin Wang adj_pct = 100 + adj_pct;
1029926f75f6SBaolin Wang
1030926f75f6SBaolin Wang vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1031926f75f6SBaolin Wang vrate_min, vrate_max);
1032926f75f6SBaolin Wang }
1033926f75f6SBaolin Wang
1034926f75f6SBaolin Wang trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1035926f75f6SBaolin Wang nr_lagging, nr_shortages);
1036926f75f6SBaolin Wang
1037926f75f6SBaolin Wang ioc->vtime_base_rate = vrate;
1038926f75f6SBaolin Wang ioc_refresh_margins(ioc);
1039926f75f6SBaolin Wang }
1040926f75f6SBaolin Wang
10417caa4715STejun Heo /* take a snapshot of the current [v]time and vrate */
ioc_now(struct ioc * ioc,struct ioc_now * now)10427caa4715STejun Heo static void ioc_now(struct ioc *ioc, struct ioc_now *now)
10437caa4715STejun Heo {
10447caa4715STejun Heo unsigned seq;
10456c31be32SKemeng Shi u64 vrate;
10467caa4715STejun Heo
10477caa4715STejun Heo now->now_ns = ktime_get();
10487caa4715STejun Heo now->now = ktime_to_us(now->now_ns);
10496c31be32SKemeng Shi vrate = atomic64_read(&ioc->vtime_rate);
10507caa4715STejun Heo
10517caa4715STejun Heo /*
10527caa4715STejun Heo * The current vtime is
10537caa4715STejun Heo *
10547caa4715STejun Heo * vtime at period start + (wallclock time since the start) * vrate
10557caa4715STejun Heo *
10567caa4715STejun Heo * As a consistent snapshot of `period_at_vtime` and `period_at` is
10577caa4715STejun Heo * needed, they're seqcount protected.
10587caa4715STejun Heo */
10597caa4715STejun Heo do {
10607caa4715STejun Heo seq = read_seqcount_begin(&ioc->period_seqcount);
10617caa4715STejun Heo now->vnow = ioc->period_at_vtime +
10626c31be32SKemeng Shi (now->now - ioc->period_at) * vrate;
10637caa4715STejun Heo } while (read_seqcount_retry(&ioc->period_seqcount, seq));
10647caa4715STejun Heo }
10657caa4715STejun Heo
ioc_start_period(struct ioc * ioc,struct ioc_now * now)10667caa4715STejun Heo static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
10677caa4715STejun Heo {
10687caa4715STejun Heo WARN_ON_ONCE(ioc->running != IOC_RUNNING);
10697caa4715STejun Heo
10707caa4715STejun Heo write_seqcount_begin(&ioc->period_seqcount);
10717caa4715STejun Heo ioc->period_at = now->now;
10727caa4715STejun Heo ioc->period_at_vtime = now->vnow;
10737caa4715STejun Heo write_seqcount_end(&ioc->period_seqcount);
10747caa4715STejun Heo
10757caa4715STejun Heo ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
10767caa4715STejun Heo add_timer(&ioc->timer);
10777caa4715STejun Heo }
10787caa4715STejun Heo
10797caa4715STejun Heo /*
10807caa4715STejun Heo * Update @iocg's `active` and `inuse` to @active and @inuse, update level
1081b0853ab4STejun Heo * weight sums and propagate upwards accordingly. If @save, the current margin
1082b0853ab4STejun Heo * is saved to be used as reference for later inuse in-period adjustments.
10837caa4715STejun Heo */
__propagate_weights(struct ioc_gq * iocg,u32 active,u32 inuse,bool save,struct ioc_now * now)1084b0853ab4STejun Heo static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1085b0853ab4STejun Heo bool save, struct ioc_now *now)
10867caa4715STejun Heo {
10877caa4715STejun Heo struct ioc *ioc = iocg->ioc;
10887caa4715STejun Heo int lvl;
10897caa4715STejun Heo
10907caa4715STejun Heo lockdep_assert_held(&ioc->lock);
10917caa4715STejun Heo
1092e9f4eee9STejun Heo /*
1093e9f4eee9STejun Heo * For an active leaf node, its inuse shouldn't be zero or exceed
1094e9f4eee9STejun Heo * @active. An active internal node's inuse is solely determined by the
1095e9f4eee9STejun Heo * inuse to active ratio of its children regardless of @inuse.
1096e9f4eee9STejun Heo */
1097e9f4eee9STejun Heo if (list_empty(&iocg->active_list) && iocg->child_active_sum) {
1098e9f4eee9STejun Heo inuse = DIV64_U64_ROUND_UP(active * iocg->child_inuse_sum,
1099e9f4eee9STejun Heo iocg->child_active_sum);
1100e9f4eee9STejun Heo } else {
1101*4a542118SNathan Chancellor /*
1102*4a542118SNathan Chancellor * It may be tempting to turn this into a clamp expression with
1103*4a542118SNathan Chancellor * a lower limit of 1 but active may be 0, which cannot be used
1104*4a542118SNathan Chancellor * as an upper limit in that situation. This expression allows
1105*4a542118SNathan Chancellor * active to clamp inuse unless it is 0, in which case inuse
1106*4a542118SNathan Chancellor * becomes 1.
1107*4a542118SNathan Chancellor */
1108*4a542118SNathan Chancellor inuse = min(inuse, active) ?: 1;
1109e9f4eee9STejun Heo }
1110db84a72aSTejun Heo
1111b0853ab4STejun Heo iocg->last_inuse = iocg->inuse;
1112b0853ab4STejun Heo if (save)
1113b0853ab4STejun Heo iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
1114b0853ab4STejun Heo
1115db84a72aSTejun Heo if (active == iocg->active && inuse == iocg->inuse)
1116db84a72aSTejun Heo return;
11177caa4715STejun Heo
11187caa4715STejun Heo for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
11197caa4715STejun Heo struct ioc_gq *parent = iocg->ancestors[lvl];
11207caa4715STejun Heo struct ioc_gq *child = iocg->ancestors[lvl + 1];
11217caa4715STejun Heo u32 parent_active = 0, parent_inuse = 0;
11227caa4715STejun Heo
11237caa4715STejun Heo /* update the level sums */
11247caa4715STejun Heo parent->child_active_sum += (s32)(active - child->active);
11257caa4715STejun Heo parent->child_inuse_sum += (s32)(inuse - child->inuse);
1126e9f4eee9STejun Heo /* apply the updates */
11277caa4715STejun Heo child->active = active;
11287caa4715STejun Heo child->inuse = inuse;
11297caa4715STejun Heo
11307caa4715STejun Heo /*
11317caa4715STejun Heo * The delta between the inuse and active sums indicates how
11325ba1add2SBaolin Wang * much of the weight is being given away. The parent's inuse
11337caa4715STejun Heo * and active should reflect the ratio.
11347caa4715STejun Heo */
11357caa4715STejun Heo if (parent->child_active_sum) {
11367caa4715STejun Heo parent_active = parent->weight;
11377caa4715STejun Heo parent_inuse = DIV64_U64_ROUND_UP(
11387caa4715STejun Heo parent_active * parent->child_inuse_sum,
11397caa4715STejun Heo parent->child_active_sum);
11407caa4715STejun Heo }
11417caa4715STejun Heo
11427caa4715STejun Heo /* do we need to keep walking up? */
11437caa4715STejun Heo if (parent_active == parent->active &&
11447caa4715STejun Heo parent_inuse == parent->inuse)
11457caa4715STejun Heo break;
11467caa4715STejun Heo
11477caa4715STejun Heo active = parent_active;
11487caa4715STejun Heo inuse = parent_inuse;
11497caa4715STejun Heo }
11507caa4715STejun Heo
11517caa4715STejun Heo ioc->weights_updated = true;
11527caa4715STejun Heo }
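/*
 * Worked example for the propagation above (hypothetical numbers): a
 * parent with weight 100 whose children sum to child_active_sum = 200
 * and child_inuse_sum = 50 gets parent_active = 100 and
 * parent_inuse = DIV64_U64_ROUND_UP(100 * 50, 200) = 25, i.e. the
 * parent donates in the same 4:1 active:inuse ratio as its children
 * combined.
 */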
11537caa4715STejun Heo
115400410f1bSTejun Heo static void commit_weights(struct ioc *ioc)
11557caa4715STejun Heo {
11567caa4715STejun Heo lockdep_assert_held(&ioc->lock);
11577caa4715STejun Heo
11587caa4715STejun Heo if (ioc->weights_updated) {
11597caa4715STejun Heo /* paired with rmb in current_hweight(), see there */
11607caa4715STejun Heo smp_wmb();
11617caa4715STejun Heo atomic_inc(&ioc->hweight_gen);
11627caa4715STejun Heo ioc->weights_updated = false;
11637caa4715STejun Heo }
11647caa4715STejun Heo }
11657caa4715STejun Heo
1166b0853ab4STejun Heo static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1167b0853ab4STejun Heo bool save, struct ioc_now *now)
11687caa4715STejun Heo {
1169b0853ab4STejun Heo __propagate_weights(iocg, active, inuse, save, now);
117000410f1bSTejun Heo commit_weights(iocg->ioc);
11717caa4715STejun Heo }
11727caa4715STejun Heo
11737caa4715STejun Heo static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
11747caa4715STejun Heo {
11757caa4715STejun Heo struct ioc *ioc = iocg->ioc;
11767caa4715STejun Heo int lvl;
11777caa4715STejun Heo u32 hwa, hwi;
11787caa4715STejun Heo int ioc_gen;
11797caa4715STejun Heo
11807caa4715STejun Heo /* hot path - if uptodate, use cached */
11817caa4715STejun Heo ioc_gen = atomic_read(&ioc->hweight_gen);
11827caa4715STejun Heo if (ioc_gen == iocg->hweight_gen)
11837caa4715STejun Heo goto out;
11847caa4715STejun Heo
11857caa4715STejun Heo /*
118600410f1bSTejun Heo * Paired with wmb in commit_weights(). If we saw the updated
118700410f1bSTejun Heo * hweight_gen, all the weight updates from __propagate_weights() are
118800410f1bSTejun Heo * visible too.
11897caa4715STejun Heo *
11907caa4715STejun Heo * We can race with weight updates during calculation and get it
11917caa4715STejun Heo * wrong. However, hweight_gen would have changed and a future
11927caa4715STejun Heo * reader will recalculate and we're guaranteed to discard the
11937caa4715STejun Heo * wrong result soon.
11947caa4715STejun Heo */
11957caa4715STejun Heo smp_rmb();
11967caa4715STejun Heo
1197fe20cdb5STejun Heo hwa = hwi = WEIGHT_ONE;
11987caa4715STejun Heo for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
11997caa4715STejun Heo struct ioc_gq *parent = iocg->ancestors[lvl];
12007caa4715STejun Heo struct ioc_gq *child = iocg->ancestors[lvl + 1];
1201bd0adb91STejun Heo u64 active_sum = READ_ONCE(parent->child_active_sum);
1202bd0adb91STejun Heo u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
12037caa4715STejun Heo u32 active = READ_ONCE(child->active);
12047caa4715STejun Heo u32 inuse = READ_ONCE(child->inuse);
12057caa4715STejun Heo
12067caa4715STejun Heo /* we can race with deactivations and either may read as zero */
12077caa4715STejun Heo if (!active_sum || !inuse_sum)
12087caa4715STejun Heo continue;
12097caa4715STejun Heo
1210bd0adb91STejun Heo active_sum = max_t(u64, active, active_sum);
1211bd0adb91STejun Heo hwa = div64_u64((u64)hwa * active, active_sum);
12127caa4715STejun Heo
1213bd0adb91STejun Heo inuse_sum = max_t(u64, inuse, inuse_sum);
1214bd0adb91STejun Heo hwi = div64_u64((u64)hwi * inuse, inuse_sum);
12157caa4715STejun Heo }
12167caa4715STejun Heo
12177caa4715STejun Heo iocg->hweight_active = max_t(u32, hwa, 1);
12187caa4715STejun Heo iocg->hweight_inuse = max_t(u32, hwi, 1);
12197caa4715STejun Heo iocg->hweight_gen = ioc_gen;
12207caa4715STejun Heo out:
12217caa4715STejun Heo if (hw_activep)
12227caa4715STejun Heo *hw_activep = iocg->hweight_active;
12237caa4715STejun Heo if (hw_inusep)
12247caa4715STejun Heo *hw_inusep = iocg->hweight_inuse;
12257caa4715STejun Heo }
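/*
 * Illustration (hypothetical numbers): with WEIGHT_ONE = 1 << 16, an
 * iocg whose active weight is half of its siblings' active sum at each
 * of two levels ends up with hwa = WEIGHT_ONE / 4 - each level scales
 * the hierarchical weight by active / child_active_sum, and hwi is
 * derived the same way from the inuse weights.
 */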
12267caa4715STejun Heo
122793f7d2dbSTejun Heo /*
122893f7d2dbSTejun Heo * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
122993f7d2dbSTejun Heo * other weights stay unchanged.
123093f7d2dbSTejun Heo */
123193f7d2dbSTejun Heo static u32 current_hweight_max(struct ioc_gq *iocg)
123293f7d2dbSTejun Heo {
123393f7d2dbSTejun Heo u32 hwm = WEIGHT_ONE;
123493f7d2dbSTejun Heo u32 inuse = iocg->active;
123593f7d2dbSTejun Heo u64 child_inuse_sum;
123693f7d2dbSTejun Heo int lvl;
123793f7d2dbSTejun Heo
123893f7d2dbSTejun Heo lockdep_assert_held(&iocg->ioc->lock);
123993f7d2dbSTejun Heo
124093f7d2dbSTejun Heo for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
124193f7d2dbSTejun Heo struct ioc_gq *parent = iocg->ancestors[lvl];
124293f7d2dbSTejun Heo struct ioc_gq *child = iocg->ancestors[lvl + 1];
124393f7d2dbSTejun Heo
124493f7d2dbSTejun Heo child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
124593f7d2dbSTejun Heo hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
124693f7d2dbSTejun Heo inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
124793f7d2dbSTejun Heo parent->child_active_sum);
124893f7d2dbSTejun Heo }
124993f7d2dbSTejun Heo
125093f7d2dbSTejun Heo return max_t(u32, hwm, 1);
125193f7d2dbSTejun Heo }
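/*
 * For example (hypothetical numbers), a level-1 leaf raised to its full
 * active weight w while its only sibling keeps inuse at w/2 sees
 * child_inuse_sum = 1.5w, so hwm = WEIGHT_ONE * w / 1.5w, i.e. two
 * thirds of the device at that level with all other weights unchanged.
 */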
125293f7d2dbSTejun Heo
1253b0853ab4STejun Heo static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
12547caa4715STejun Heo {
12557caa4715STejun Heo struct ioc *ioc = iocg->ioc;
12567caa4715STejun Heo struct blkcg_gq *blkg = iocg_to_blkg(iocg);
12577caa4715STejun Heo struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
12587caa4715STejun Heo u32 weight;
12597caa4715STejun Heo
12607caa4715STejun Heo lockdep_assert_held(&ioc->lock);
12617caa4715STejun Heo
12627caa4715STejun Heo weight = iocg->cfg_weight ?: iocc->dfl_weight;
12637caa4715STejun Heo if (weight != iocg->weight && iocg->active)
1264b0853ab4STejun Heo propagate_weights(iocg, weight, iocg->inuse, true, now);
12657caa4715STejun Heo iocg->weight = weight;
12667caa4715STejun Heo }
12677caa4715STejun Heo
12687caa4715STejun Heo static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
12697caa4715STejun Heo {
12707caa4715STejun Heo struct ioc *ioc = iocg->ioc;
1271ac33e91eSTejun Heo u64 last_period, cur_period;
1272ac33e91eSTejun Heo u64 vtime, vtarget;
12737caa4715STejun Heo int i;
12747caa4715STejun Heo
12757caa4715STejun Heo /*
12767caa4715STejun Heo * If we seem to be already active, just update the stamp to tell the
12777caa4715STejun Heo * timer that we're still active. We don't mind occasional races.
12787caa4715STejun Heo */
12797caa4715STejun Heo if (!list_empty(&iocg->active_list)) {
12807caa4715STejun Heo ioc_now(ioc, now);
12817caa4715STejun Heo cur_period = atomic64_read(&ioc->cur_period);
12827caa4715STejun Heo if (atomic64_read(&iocg->active_period) != cur_period)
12837caa4715STejun Heo atomic64_set(&iocg->active_period, cur_period);
12847caa4715STejun Heo return true;
12857caa4715STejun Heo }
12867caa4715STejun Heo
12877caa4715STejun Heo /* racy check on internal node IOs, treat as root level IOs */
12887caa4715STejun Heo if (iocg->child_active_sum)
12897caa4715STejun Heo return false;
12907caa4715STejun Heo
12917caa4715STejun Heo spin_lock_irq(&ioc->lock);
12927caa4715STejun Heo
12937caa4715STejun Heo ioc_now(ioc, now);
12947caa4715STejun Heo
12957caa4715STejun Heo /* update period */
12967caa4715STejun Heo cur_period = atomic64_read(&ioc->cur_period);
12977caa4715STejun Heo last_period = atomic64_read(&iocg->active_period);
12987caa4715STejun Heo atomic64_set(&iocg->active_period, cur_period);
12997caa4715STejun Heo
13007caa4715STejun Heo /* already activated or breaking leaf-only constraint? */
13017caa4715STejun Heo if (!list_empty(&iocg->active_list))
13028b37bc27SJiufei Xue goto succeed_unlock;
13038b37bc27SJiufei Xue for (i = iocg->level - 1; i > 0; i--)
13048b37bc27SJiufei Xue if (!list_empty(&iocg->ancestors[i]->active_list))
13057caa4715STejun Heo goto fail_unlock;
13068b37bc27SJiufei Xue
13077caa4715STejun Heo if (iocg->child_active_sum)
13087caa4715STejun Heo goto fail_unlock;
13097caa4715STejun Heo
13107caa4715STejun Heo /*
1311ac33e91eSTejun Heo * Always start with the target budget. On deactivation, we throw away
1312ac33e91eSTejun Heo * anything above it.
13137caa4715STejun Heo */
1314ac33e91eSTejun Heo vtarget = now->vnow - ioc->margins.target;
13157caa4715STejun Heo vtime = atomic64_read(&iocg->vtime);
13167caa4715STejun Heo
1317ac33e91eSTejun Heo atomic64_add(vtarget - vtime, &iocg->vtime);
1318ac33e91eSTejun Heo atomic64_add(vtarget - vtime, &iocg->done_vtime);
1319ac33e91eSTejun Heo vtime = vtarget;
13207caa4715STejun Heo
13217caa4715STejun Heo /*
13227caa4715STejun Heo * Activate, propagate weight and start period timer if not
13237caa4715STejun Heo * running. Reset hweight_gen to avoid accidental match from
13247caa4715STejun Heo * wrapping.
13257caa4715STejun Heo */
13267caa4715STejun Heo iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
13277caa4715STejun Heo list_add(&iocg->active_list, &ioc->active_iocgs);
1328b0853ab4STejun Heo
132900410f1bSTejun Heo propagate_weights(iocg, iocg->weight,
1330b0853ab4STejun Heo iocg->last_inuse ?: iocg->weight, true, now);
13317caa4715STejun Heo
13327caa4715STejun Heo TRACE_IOCG_PATH(iocg_activate, iocg, now,
13337caa4715STejun Heo last_period, cur_period, vtime);
13347caa4715STejun Heo
13351aa50d02STejun Heo iocg->activated_at = now->now;
13367caa4715STejun Heo
13377caa4715STejun Heo if (ioc->running == IOC_IDLE) {
13387caa4715STejun Heo ioc->running = IOC_RUNNING;
1339c7af2a00STejun Heo ioc->dfgv_period_at = now->now;
1340c7af2a00STejun Heo ioc->dfgv_period_rem = 0;
13417caa4715STejun Heo ioc_start_period(ioc, now);
13427caa4715STejun Heo }
13437caa4715STejun Heo
13448b37bc27SJiufei Xue succeed_unlock:
13457caa4715STejun Heo spin_unlock_irq(&ioc->lock);
13467caa4715STejun Heo return true;
13477caa4715STejun Heo
13487caa4715STejun Heo fail_unlock:
13497caa4715STejun Heo spin_unlock_irq(&ioc->lock);
13507caa4715STejun Heo return false;
13517caa4715STejun Heo }
13527caa4715STejun Heo
13536ef20f78STejun Heo static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
13546ef20f78STejun Heo {
13556ef20f78STejun Heo struct ioc *ioc = iocg->ioc;
13566ef20f78STejun Heo struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1357ce0e99caSRik van Riel u64 tdelta, delay, new_delay, shift;
13585160a5a5STejun Heo s64 vover, vover_pct;
1359c421a3ebSTejun Heo u32 hwa;
13606ef20f78STejun Heo
13616ef20f78STejun Heo lockdep_assert_held(&iocg->waitq.lock);
13626ef20f78STejun Heo
136327b21613STejun Heo /*
136427b21613STejun Heo * If the delay is set by another CPU, we may be in the past. No need to
136527b21613STejun Heo * change anything if so. This avoids decay calculation underflow.
136627b21613STejun Heo */
136727b21613STejun Heo if (time_before64(now->now, iocg->delay_at))
136827b21613STejun Heo return false;
136927b21613STejun Heo
13705160a5a5STejun Heo /* calculate the current delay in effect - 1/2 every second */
13715160a5a5STejun Heo tdelta = now->now - iocg->delay_at;
1372ce0e99caSRik van Riel shift = div64_u64(tdelta, USEC_PER_SEC);
1373ce0e99caSRik van Riel if (iocg->delay && shift < BITS_PER_LONG)
1374ce0e99caSRik van Riel delay = iocg->delay >> shift;
13755160a5a5STejun Heo else
13765160a5a5STejun Heo delay = 0;
13776ef20f78STejun Heo
13785160a5a5STejun Heo /* calculate the new delay from the debt amount */
13795160a5a5STejun Heo current_hweight(iocg, &hwa, NULL);
13805160a5a5STejun Heo vover = atomic64_read(&iocg->vtime) +
13815160a5a5STejun Heo abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
1382ac33e91eSTejun Heo vover_pct = div64_s64(100 * vover,
1383ac33e91eSTejun Heo ioc->period_us * ioc->vtime_base_rate);
13845160a5a5STejun Heo
13855160a5a5STejun Heo if (vover_pct <= MIN_DELAY_THR_PCT)
13865160a5a5STejun Heo new_delay = 0;
13875160a5a5STejun Heo else if (vover_pct >= MAX_DELAY_THR_PCT)
13885160a5a5STejun Heo new_delay = MAX_DELAY;
13895160a5a5STejun Heo else
13905160a5a5STejun Heo new_delay = MIN_DELAY +
13915160a5a5STejun Heo div_u64((MAX_DELAY - MIN_DELAY) *
13925160a5a5STejun Heo (vover_pct - MIN_DELAY_THR_PCT),
13935160a5a5STejun Heo MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
13945160a5a5STejun Heo
13955160a5a5STejun Heo /* pick the higher one and apply */
13965160a5a5STejun Heo if (new_delay > delay) {
13975160a5a5STejun Heo iocg->delay = new_delay;
13985160a5a5STejun Heo iocg->delay_at = now->now;
13995160a5a5STejun Heo delay = new_delay;
14005160a5a5STejun Heo }
14015160a5a5STejun Heo
14025160a5a5STejun Heo if (delay >= MIN_DELAY) {
1403f0bf84a5STejun Heo if (!iocg->indelay_since)
1404f0bf84a5STejun Heo iocg->indelay_since = now->now;
14055160a5a5STejun Heo blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
14065160a5a5STejun Heo return true;
14075160a5a5STejun Heo } else {
1408f0bf84a5STejun Heo if (iocg->indelay_since) {
14092a371f7dSChengming Zhou iocg->stat.indelay_us += now->now - iocg->indelay_since;
1410f0bf84a5STejun Heo iocg->indelay_since = 0;
1411f0bf84a5STejun Heo }
14125160a5a5STejun Heo iocg->delay = 0;
14136ef20f78STejun Heo blkcg_clear_delay(blkg);
14146ef20f78STejun Heo return false;
14156ef20f78STejun Heo }
14166ef20f78STejun Heo }
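/*
 * Decay and interpolation example (hypothetical numbers): a delay set
 * 2 seconds ago is shifted right by 2, i.e. quartered, before being
 * compared against the new debt-based value. If the debt overage lands
 * halfway between MIN_DELAY_THR_PCT and MAX_DELAY_THR_PCT, the new
 * delay lands halfway between MIN_DELAY and MAX_DELAY, and the larger
 * of the decayed and new values takes effect.
 */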
14176ef20f78STejun Heo
1418c421a3ebSTejun Heo static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
1419c421a3ebSTejun Heo struct ioc_now *now)
1420c421a3ebSTejun Heo {
1421c421a3ebSTejun Heo struct iocg_pcpu_stat *gcs;
1422c421a3ebSTejun Heo
1423c421a3ebSTejun Heo lockdep_assert_held(&iocg->ioc->lock);
1424c421a3ebSTejun Heo lockdep_assert_held(&iocg->waitq.lock);
1425c421a3ebSTejun Heo WARN_ON_ONCE(list_empty(&iocg->active_list));
1426c421a3ebSTejun Heo
1427c421a3ebSTejun Heo /*
1428c421a3ebSTejun Heo * Once in debt, debt handling owns inuse. @iocg stays at the minimum
1429c421a3ebSTejun Heo * inuse donating all of its share to others until its debt is paid off.
1430c421a3ebSTejun Heo */
1431f0bf84a5STejun Heo if (!iocg->abs_vdebt && abs_cost) {
1432f0bf84a5STejun Heo iocg->indebt_since = now->now;
1433c421a3ebSTejun Heo propagate_weights(iocg, iocg->active, 0, false, now);
1434f0bf84a5STejun Heo }
1435c421a3ebSTejun Heo
1436c421a3ebSTejun Heo iocg->abs_vdebt += abs_cost;
1437c421a3ebSTejun Heo
1438c421a3ebSTejun Heo gcs = get_cpu_ptr(iocg->pcpu_stat);
1439c421a3ebSTejun Heo local64_add(abs_cost, &gcs->abs_vusage);
1440c421a3ebSTejun Heo put_cpu_ptr(gcs);
1441c421a3ebSTejun Heo }
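/*
 * Note the interplay with __propagate_weights(): passing inuse = 0
 * above relies on the min()-then-?: clamp there to land inuse at the
 * floor of 1, which is exactly what the WARN_ON_ONCE(iocg->inuse > 1)
 * in iocg_pay_debt() checks for.
 */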
1442c421a3ebSTejun Heo
1443c421a3ebSTejun Heo static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
1444c421a3ebSTejun Heo struct ioc_now *now)
1445c421a3ebSTejun Heo {
1446c421a3ebSTejun Heo lockdep_assert_held(&iocg->ioc->lock);
1447c421a3ebSTejun Heo lockdep_assert_held(&iocg->waitq.lock);
1448c421a3ebSTejun Heo
14491c172ac7SLi Nan /*
14501c172ac7SLi Nan * Make sure that nobody messed with @iocg. Check iocg->pd.online
14511c172ac7SLi Nan * to avoid a warning when removing a blkcg or disk.
14521c172ac7SLi Nan */
14531c172ac7SLi Nan WARN_ON_ONCE(list_empty(&iocg->active_list) && iocg->pd.online);
1454c421a3ebSTejun Heo WARN_ON_ONCE(iocg->inuse > 1);
1455c421a3ebSTejun Heo
1456c421a3ebSTejun Heo iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
1457c421a3ebSTejun Heo
1458c421a3ebSTejun Heo /* if debt is paid in full, restore inuse */
1459f0bf84a5STejun Heo if (!iocg->abs_vdebt) {
14602a371f7dSChengming Zhou iocg->stat.indebt_us += now->now - iocg->indebt_since;
1461f0bf84a5STejun Heo iocg->indebt_since = 0;
1462f0bf84a5STejun Heo
1463c421a3ebSTejun Heo propagate_weights(iocg, iocg->active, iocg->last_inuse,
1464c421a3ebSTejun Heo false, now);
1465c421a3ebSTejun Heo }
1466f0bf84a5STejun Heo }
1467c421a3ebSTejun Heo
14687caa4715STejun Heo static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
14697caa4715STejun Heo int flags, void *key)
14707caa4715STejun Heo {
14717caa4715STejun Heo struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1472a7609c68SLi zeming struct iocg_wake_ctx *ctx = key;
14737caa4715STejun Heo u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
14747caa4715STejun Heo
14757caa4715STejun Heo ctx->vbudget -= cost;
14767caa4715STejun Heo
14777caa4715STejun Heo if (ctx->vbudget < 0)
14787caa4715STejun Heo return -1;
14797caa4715STejun Heo
148097eb1975STejun Heo iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
14815ab189cfSTejun Heo wait->committed = true;
14827caa4715STejun Heo
14837caa4715STejun Heo /*
14847caa4715STejun Heo * autoremove_wake_function() removes the wait entry only when it
14855ab189cfSTejun Heo * actually changed the task state. We want the wait always removed.
14865ab189cfSTejun Heo * Remove explicitly and use default_wake_function(). Note that the
14875ab189cfSTejun Heo * order of operations is important as finish_wait() tests whether
14885ab189cfSTejun Heo * @wq_entry is removed without grabbing the lock.
14897caa4715STejun Heo */
14907caa4715STejun Heo default_wake_function(wq_entry, mode, flags, key);
14915ab189cfSTejun Heo list_del_init_careful(&wq_entry->entry);
14927caa4715STejun Heo return 0;
14937caa4715STejun Heo }
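/*
 * For instance (hypothetical numbers), with ctx->vbudget = 1000 and
 * three waiters costing 400 each, the first two are committed and woken
 * (vbudget drops to 600, then 200), while the third drives vbudget to
 * -200 and terminates the wakeup walk; the remaining shortage sets up
 * the next wakeup timer in iocg_kick_waitq().
 */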
14947caa4715STejun Heo
1495da437b95STejun Heo /*
1496da437b95STejun Heo * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1497da437b95STejun Heo * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1498da437b95STejun Heo * addition to iocg->waitq.lock.
1499da437b95STejun Heo */
1500da437b95STejun Heo static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1501da437b95STejun Heo struct ioc_now *now)
15027caa4715STejun Heo {
15037caa4715STejun Heo struct ioc *ioc = iocg->ioc;
15047caa4715STejun Heo struct iocg_wake_ctx ctx = { .iocg = iocg };
1505da437b95STejun Heo u64 vshortage, expires, oexpires;
150636a52481STejun Heo s64 vbudget;
1507c421a3ebSTejun Heo u32 hwa;
15087caa4715STejun Heo
15097caa4715STejun Heo lockdep_assert_held(&iocg->waitq.lock);
15107caa4715STejun Heo
1511c421a3ebSTejun Heo current_hweight(iocg, &hwa, NULL);
151236a52481STejun Heo vbudget = now->vnow - atomic64_read(&iocg->vtime);
151336a52481STejun Heo
151436a52481STejun Heo /* pay off debt */
1515da437b95STejun Heo if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1516c421a3ebSTejun Heo u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
1517c421a3ebSTejun Heo u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
1518c421a3ebSTejun Heo u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
151936a52481STejun Heo
1520da437b95STejun Heo lockdep_assert_held(&ioc->lock);
1521da437b95STejun Heo
1522c421a3ebSTejun Heo atomic64_add(vpay, &iocg->vtime);
1523c421a3ebSTejun Heo atomic64_add(vpay, &iocg->done_vtime);
1524c421a3ebSTejun Heo iocg_pay_debt(iocg, abs_vpay, now);
1525c421a3ebSTejun Heo vbudget -= vpay;
152636a52481STejun Heo }
152736a52481STejun Heo
15285160a5a5STejun Heo if (iocg->abs_vdebt || iocg->delay)
15295160a5a5STejun Heo iocg_kick_delay(iocg, now);
15305160a5a5STejun Heo
15317caa4715STejun Heo /*
1532da437b95STejun Heo * Debt can still be outstanding if we haven't paid all yet or the
1533da437b95STejun Heo * caller raced and called without @pay_debt. Shouldn't wake up waiters
1534da437b95STejun Heo * under debt. Make sure @vbudget reflects the outstanding amount and is
1535da437b95STejun Heo * not positive.
1536da437b95STejun Heo */
1537da437b95STejun Heo if (iocg->abs_vdebt) {
1538c421a3ebSTejun Heo s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
1539da437b95STejun Heo vbudget = min_t(s64, 0, vbudget - vdebt);
1540da437b95STejun Heo }
1541da437b95STejun Heo
1542da437b95STejun Heo /*
1543c421a3ebSTejun Heo * Wake up the ones which are due and see how much vtime we'll need for
1544c421a3ebSTejun Heo * the next one. As paying off debt restores hw_inuse, it must be read
1545c421a3ebSTejun Heo * after the above debt payment.
15467caa4715STejun Heo */
1547da437b95STejun Heo ctx.vbudget = vbudget;
1548c421a3ebSTejun Heo current_hweight(iocg, NULL, &ctx.hw_inuse);
1549c421a3ebSTejun Heo
15507caa4715STejun Heo __wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1551c421a3ebSTejun Heo
1552f0bf84a5STejun Heo if (!waitqueue_active(&iocg->waitq)) {
1553f0bf84a5STejun Heo if (iocg->wait_since) {
15542a371f7dSChengming Zhou iocg->stat.wait_us += now->now - iocg->wait_since;
1555f0bf84a5STejun Heo iocg->wait_since = 0;
1556f0bf84a5STejun Heo }
15577caa4715STejun Heo return;
1558f0bf84a5STejun Heo }
1559f0bf84a5STejun Heo
1560f0bf84a5STejun Heo if (!iocg->wait_since)
1561f0bf84a5STejun Heo iocg->wait_since = now->now;
1562f0bf84a5STejun Heo
15637caa4715STejun Heo if (WARN_ON_ONCE(ctx.vbudget >= 0))
15647caa4715STejun Heo return;
15657caa4715STejun Heo
15667ca5b2e6STejun Heo /* determine next wakeup, add a timer margin to guarantee chunking */
15677caa4715STejun Heo vshortage = -ctx.vbudget;
15687caa4715STejun Heo expires = now->now_ns +
1569ac33e91eSTejun Heo DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
1570ac33e91eSTejun Heo NSEC_PER_USEC;
15717ca5b2e6STejun Heo expires += ioc->timer_slack_ns;
15727caa4715STejun Heo
15737caa4715STejun Heo /* if already active and close enough, don't bother */
15747caa4715STejun Heo oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
15757caa4715STejun Heo if (hrtimer_is_queued(&iocg->waitq_timer) &&
15767ca5b2e6STejun Heo abs(oexpires - expires) <= ioc->timer_slack_ns)
15777caa4715STejun Heo return;
15787caa4715STejun Heo
15797caa4715STejun Heo hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
15807ca5b2e6STejun Heo ioc->timer_slack_ns, HRTIMER_MODE_ABS);
15817caa4715STejun Heo }
15827caa4715STejun Heo
15837caa4715STejun Heo static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
15847caa4715STejun Heo {
15857caa4715STejun Heo struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1586da437b95STejun Heo bool pay_debt = READ_ONCE(iocg->abs_vdebt);
15877caa4715STejun Heo struct ioc_now now;
15887caa4715STejun Heo unsigned long flags;
15897caa4715STejun Heo
15907caa4715STejun Heo ioc_now(iocg->ioc, &now);
15917caa4715STejun Heo
1592da437b95STejun Heo iocg_lock(iocg, pay_debt, &flags);
1593da437b95STejun Heo iocg_kick_waitq(iocg, pay_debt, &now);
1594da437b95STejun Heo iocg_unlock(iocg, pay_debt, &flags);
15957caa4715STejun Heo
15967caa4715STejun Heo return HRTIMER_NORESTART;
15977caa4715STejun Heo }
15987caa4715STejun Heo
15997caa4715STejun Heo static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
16007caa4715STejun Heo {
16017caa4715STejun Heo u32 nr_met[2] = { };
16027caa4715STejun Heo u32 nr_missed[2] = { };
16037caa4715STejun Heo u64 rq_wait_ns = 0;
16047caa4715STejun Heo int cpu, rw;
16057caa4715STejun Heo
16067caa4715STejun Heo for_each_online_cpu(cpu) {
16077caa4715STejun Heo struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
16087caa4715STejun Heo u64 this_rq_wait_ns;
16097caa4715STejun Heo
16107caa4715STejun Heo for (rw = READ; rw <= WRITE; rw++) {
16115e124f74STejun Heo u32 this_met = local_read(&stat->missed[rw].nr_met);
16125e124f74STejun Heo u32 this_missed = local_read(&stat->missed[rw].nr_missed);
16137caa4715STejun Heo
16147caa4715STejun Heo nr_met[rw] += this_met - stat->missed[rw].last_met;
16157caa4715STejun Heo nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
16167caa4715STejun Heo stat->missed[rw].last_met = this_met;
16177caa4715STejun Heo stat->missed[rw].last_missed = this_missed;
16187caa4715STejun Heo }
16197caa4715STejun Heo
16205e124f74STejun Heo this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
16217caa4715STejun Heo rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
16227caa4715STejun Heo stat->last_rq_wait_ns = this_rq_wait_ns;
16237caa4715STejun Heo }
16247caa4715STejun Heo
16257caa4715STejun Heo for (rw = READ; rw <= WRITE; rw++) {
16267caa4715STejun Heo if (nr_met[rw] + nr_missed[rw])
16277caa4715STejun Heo missed_ppm_ar[rw] =
16287caa4715STejun Heo DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
16297caa4715STejun Heo nr_met[rw] + nr_missed[rw]);
16307caa4715STejun Heo else
16317caa4715STejun Heo missed_ppm_ar[rw] = 0;
16327caa4715STejun Heo }
16337caa4715STejun Heo
16347caa4715STejun Heo *rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
16357caa4715STejun Heo ioc->period_us * NSEC_PER_USEC);
16367caa4715STejun Heo }
16377caa4715STejun Heo
16387caa4715STejun Heo /* was iocg idle this period? */
16397caa4715STejun Heo static bool iocg_is_idle(struct ioc_gq *iocg)
16407caa4715STejun Heo {
16417caa4715STejun Heo struct ioc *ioc = iocg->ioc;
16427caa4715STejun Heo
16437caa4715STejun Heo /* did something get issued this period? */
16447caa4715STejun Heo if (atomic64_read(&iocg->active_period) ==
16457caa4715STejun Heo atomic64_read(&ioc->cur_period))
16467caa4715STejun Heo return false;
16477caa4715STejun Heo
16487caa4715STejun Heo /* is something in flight? */
1649dcd6589bSTejun Heo if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
16507caa4715STejun Heo return false;
16517caa4715STejun Heo
16527caa4715STejun Heo return true;
16537caa4715STejun Heo }
16547caa4715STejun Heo
165597eb1975STejun Heo /*
165697eb1975STejun Heo * Call this function on the target leaf @iocg's to build pre-order traversal
165797eb1975STejun Heo * list of all the ancestors in @inner_walk. The inner nodes are linked through
165897eb1975STejun Heo * ->walk_list and the caller is responsible for dissolving the list after use.
165997eb1975STejun Heo */
166097eb1975STejun Heo static void iocg_build_inner_walk(struct ioc_gq *iocg,
166197eb1975STejun Heo struct list_head *inner_walk)
166297eb1975STejun Heo {
166397eb1975STejun Heo int lvl;
166497eb1975STejun Heo
166597eb1975STejun Heo WARN_ON_ONCE(!list_empty(&iocg->walk_list));
166697eb1975STejun Heo
166797eb1975STejun Heo /* find the first ancestor which hasn't been visited yet */
166897eb1975STejun Heo for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
166997eb1975STejun Heo if (!list_empty(&iocg->ancestors[lvl]->walk_list))
167097eb1975STejun Heo break;
167197eb1975STejun Heo }
167297eb1975STejun Heo
167397eb1975STejun Heo /* walk down and visit the inner nodes to get pre-order traversal */
167497eb1975STejun Heo while (++lvl <= iocg->level - 1) {
167597eb1975STejun Heo struct ioc_gq *inner = iocg->ancestors[lvl];
167697eb1975STejun Heo
167797eb1975STejun Heo /* record traversal order */
167897eb1975STejun Heo list_add_tail(&inner->walk_list, inner_walk);
167997eb1975STejun Heo }
168097eb1975STejun Heo }
168197eb1975STejun Heo
16822a371f7dSChengming Zhou /* propagate the deltas to the parent */
16832a371f7dSChengming Zhou static void iocg_flush_stat_upward(struct ioc_gq *iocg)
16842a371f7dSChengming Zhou {
16852a371f7dSChengming Zhou if (iocg->level > 0) {
16862a371f7dSChengming Zhou struct iocg_stat *parent_stat =
16872a371f7dSChengming Zhou &iocg->ancestors[iocg->level - 1]->stat;
16882a371f7dSChengming Zhou
16892a371f7dSChengming Zhou parent_stat->usage_us +=
16902a371f7dSChengming Zhou iocg->stat.usage_us - iocg->last_stat.usage_us;
16912a371f7dSChengming Zhou parent_stat->wait_us +=
16922a371f7dSChengming Zhou iocg->stat.wait_us - iocg->last_stat.wait_us;
16932a371f7dSChengming Zhou parent_stat->indebt_us +=
16942a371f7dSChengming Zhou iocg->stat.indebt_us - iocg->last_stat.indebt_us;
16952a371f7dSChengming Zhou parent_stat->indelay_us +=
16962a371f7dSChengming Zhou iocg->stat.indelay_us - iocg->last_stat.indelay_us;
16972a371f7dSChengming Zhou }
16982a371f7dSChengming Zhou
16992a371f7dSChengming Zhou iocg->last_stat = iocg->stat;
17002a371f7dSChengming Zhou }
17012a371f7dSChengming Zhou
170297eb1975STejun Heo /* collect per-cpu counters and propagate the deltas to the parent */
17032a371f7dSChengming Zhou static void iocg_flush_stat_leaf(struct ioc_gq *iocg, struct ioc_now *now)
170497eb1975STejun Heo {
1705ac33e91eSTejun Heo struct ioc *ioc = iocg->ioc;
170697eb1975STejun Heo u64 abs_vusage = 0;
170797eb1975STejun Heo u64 vusage_delta;
170897eb1975STejun Heo int cpu;
170997eb1975STejun Heo
171097eb1975STejun Heo lockdep_assert_held(&iocg->ioc->lock);
171197eb1975STejun Heo
171297eb1975STejun Heo /* collect per-cpu counters */
171397eb1975STejun Heo for_each_possible_cpu(cpu) {
171497eb1975STejun Heo abs_vusage += local64_read(
171597eb1975STejun Heo per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
171697eb1975STejun Heo }
171797eb1975STejun Heo vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
171897eb1975STejun Heo iocg->last_stat_abs_vusage = abs_vusage;
171997eb1975STejun Heo
1720ac33e91eSTejun Heo iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
17212a371f7dSChengming Zhou iocg->stat.usage_us += iocg->usage_delta_us;
172297eb1975STejun Heo
17232a371f7dSChengming Zhou iocg_flush_stat_upward(iocg);
172497eb1975STejun Heo }
172597eb1975STejun Heo
172697eb1975STejun Heo /* get stat counters ready for reading on all active iocgs */
172797eb1975STejun Heo static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
172897eb1975STejun Heo {
172997eb1975STejun Heo LIST_HEAD(inner_walk);
173097eb1975STejun Heo struct ioc_gq *iocg, *tiocg;
173197eb1975STejun Heo
173297eb1975STejun Heo /* flush leaves and build inner node walk list */
173397eb1975STejun Heo list_for_each_entry(iocg, target_iocgs, active_list) {
17342a371f7dSChengming Zhou iocg_flush_stat_leaf(iocg, now);
173597eb1975STejun Heo iocg_build_inner_walk(iocg, &inner_walk);
173697eb1975STejun Heo }
173797eb1975STejun Heo
173897eb1975STejun Heo /* keep flushing upwards by walking the inner list backwards */
173997eb1975STejun Heo list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
17402a371f7dSChengming Zhou iocg_flush_stat_upward(iocg);
174197eb1975STejun Heo list_del_init(&iocg->walk_list);
174297eb1975STejun Heo }
174397eb1975STejun Heo }
174497eb1975STejun Heo
174593f7d2dbSTejun Heo /*
174693f7d2dbSTejun Heo * Determine what @iocg's hweight_inuse should be after donating unused
174793f7d2dbSTejun Heo * capacity. @hwm is the upper bound and used to signal no donation. This
174893f7d2dbSTejun Heo * function also throws away @iocg's excess budget.
174993f7d2dbSTejun Heo */
1750ac33e91eSTejun Heo static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
1751ac33e91eSTejun Heo u32 usage, struct ioc_now *now)
17527caa4715STejun Heo {
175393f7d2dbSTejun Heo struct ioc *ioc = iocg->ioc;
175493f7d2dbSTejun Heo u64 vtime = atomic64_read(&iocg->vtime);
1755f1de2439STejun Heo s64 excess, delta, target, new_hwi;
175693f7d2dbSTejun Heo
1757c421a3ebSTejun Heo /* debt handling owns inuse for debtors */
1758c421a3ebSTejun Heo if (iocg->abs_vdebt)
1759c421a3ebSTejun Heo return 1;
1760c421a3ebSTejun Heo
176193f7d2dbSTejun Heo /* see whether minimum margin requirement is met */
176293f7d2dbSTejun Heo if (waitqueue_active(&iocg->waitq) ||
176393f7d2dbSTejun Heo time_after64(vtime, now->vnow - ioc->margins.min))
176493f7d2dbSTejun Heo return hwm;
176593f7d2dbSTejun Heo
1766ac33e91eSTejun Heo /* throw away excess above target */
1767ac33e91eSTejun Heo excess = now->vnow - vtime - ioc->margins.target;
176893f7d2dbSTejun Heo if (excess > 0) {
176993f7d2dbSTejun Heo atomic64_add(excess, &iocg->vtime);
177093f7d2dbSTejun Heo atomic64_add(excess, &iocg->done_vtime);
177193f7d2dbSTejun Heo vtime += excess;
1772ac33e91eSTejun Heo ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
177393f7d2dbSTejun Heo }
177493f7d2dbSTejun Heo
1775f1de2439STejun Heo /*
1776f1de2439STejun Heo * Let's say the distance between iocg's and device's vtimes as a
1777f1de2439STejun Heo * fraction of period duration is delta. Assuming that the iocg will
1778f1de2439STejun Heo * consume the usage determined above, we want to determine new_hwi so
1779f1de2439STejun Heo * that delta equals MARGIN_TARGET at the end of the next period.
1780f1de2439STejun Heo *
1781f1de2439STejun Heo * We need to execute usage worth of IOs while spending the sum of the
1782f1de2439STejun Heo * new budget (1 - MARGIN_TARGET) and the leftover from the last period
1783f1de2439STejun Heo * (delta):
1784f1de2439STejun Heo *
1785f1de2439STejun Heo * usage = (1 - MARGIN_TARGET + delta) * new_hwi
1786f1de2439STejun Heo *
1787f1de2439STejun Heo * Therefore, the new_hwi is:
1788f1de2439STejun Heo *
1789f1de2439STejun Heo * new_hwi = usage / (1 - MARGIN_TARGET + delta)
1790f1de2439STejun Heo */
1791f1de2439STejun Heo delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1792f1de2439STejun Heo now->vnow - ioc->period_at_vtime);
1793f1de2439STejun Heo target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1794f1de2439STejun Heo new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
17957caa4715STejun Heo
1796f1de2439STejun Heo return clamp_t(s64, new_hwi, 1, hwm);
17977caa4715STejun Heo }
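/*
 * Plugging in numbers (hypothetical, with usage expressed as a
 * WEIGHT_ONE fraction): if usage is 50% of the device and the iocg sits
 * exactly at the target margin so that delta equals MARGIN_TARGET, then
 * new_hwi = usage / (1 - target + delta) = 0.5 / 1 = 50%, i.e. an iocg
 * already at the target distance keeps a hweight_inuse matching its
 * usage.
 */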
17987caa4715STejun Heo
1799e08d02aaSTejun Heo /*
1800e08d02aaSTejun Heo * For work-conservation, an iocg which isn't using all of its share should
1801e08d02aaSTejun Heo * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1802e08d02aaSTejun Heo * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1803e08d02aaSTejun Heo *
1804e08d02aaSTejun Heo * #1 is mathematically simpler but has the drawback of requiring synchronous
1805e08d02aaSTejun Heo * global hweight_inuse updates when idle iocg's get activated or inuse weights
1806e08d02aaSTejun Heo * change due to donation snapbacks as it has the possibility of grossly
1807e08d02aaSTejun Heo * overshooting what's allowed by the model and vrate.
1808e08d02aaSTejun Heo *
1809e08d02aaSTejun Heo * #2 is inherently safe with local operations. The donating iocg can easily
1810e08d02aaSTejun Heo * snap back to higher weights when needed without worrying about impacts on
1811e08d02aaSTejun Heo * other nodes as the impacts will be inherently correct. This also makes idle
1812e08d02aaSTejun Heo * iocg activations safe. The only effect activations have is decreasing
1813e08d02aaSTejun Heo * hweight_inuse of others, the right solution to which is for those iocgs to
1814e08d02aaSTejun Heo * snap back to higher weights.
1815e08d02aaSTejun Heo *
1816e08d02aaSTejun Heo * So, we go with #2. The challenge is calculating how each donating iocg's
1817e08d02aaSTejun Heo * inuse should be adjusted to achieve the target donation amounts. This is done
1818e08d02aaSTejun Heo * using Andy's method described in the following pdf.
1819e08d02aaSTejun Heo *
1820e08d02aaSTejun Heo * https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1821e08d02aaSTejun Heo *
1822e08d02aaSTejun Heo * Given the weights and target after-donation hweight_inuse values, Andy's
1823e08d02aaSTejun Heo * method determines how the proportional distribution should look like at each
1824e08d02aaSTejun Heo * sibling level to maintain the relative relationship between all non-donating
1825e08d02aaSTejun Heo * pairs. To roughly summarize, it divides the tree into donating and
1826e08d02aaSTejun Heo * non-donating parts, calculates global donation rate which is used to
1827e08d02aaSTejun Heo * determine the target hweight_inuse for each node, and then derives per-level
1828e08d02aaSTejun Heo * proportions.
1829e08d02aaSTejun Heo *
1830e08d02aaSTejun Heo * The following pdf shows that global distribution calculated this way can be
1831e08d02aaSTejun Heo * achieved by scaling inuse weights of donating leaves and propagating the
1832e08d02aaSTejun Heo * adjustments upwards proportionally.
1833e08d02aaSTejun Heo *
1834e08d02aaSTejun Heo * https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1835e08d02aaSTejun Heo *
1836e08d02aaSTejun Heo * Combining the above two, we can determine how each leaf iocg's inuse should
1837e08d02aaSTejun Heo * be adjusted to achieve the target donation.
1838e08d02aaSTejun Heo *
1839e08d02aaSTejun Heo * https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1840e08d02aaSTejun Heo *
1841e08d02aaSTejun Heo * The inline comments use symbols from the last pdf.
1842e08d02aaSTejun Heo *
1843e08d02aaSTejun Heo * b is the sum of the absolute budgets in the subtree. 1 for the root node.
1844e08d02aaSTejun Heo * f is the sum of the absolute budgets of non-donating nodes in the subtree.
1845e08d02aaSTejun Heo * t is the sum of the absolute budgets of donating nodes in the subtree.
1846e08d02aaSTejun Heo * w is the weight of the node. w = w_f + w_t
1847e08d02aaSTejun Heo * w_f is the non-donating portion of w. w_f = w * f / b
1848e08d02aaSTejun Heo * w_t is the donating portion of w. w_t = w * t / b
1849e08d02aaSTejun Heo * s is the sum of all sibling weights. s = Sum(w) for siblings
1850e08d02aaSTejun Heo * s_f and s_t are the non-donating and donating portions of s.
1851e08d02aaSTejun Heo *
1852e08d02aaSTejun Heo * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1853e08d02aaSTejun Heo * w_pt is the donating portion of the parent's weight and w'_pt the same value
1854e08d02aaSTejun Heo * after adjustments. Subscript r denotes the root node's values.
1855e08d02aaSTejun Heo */
185693f7d2dbSTejun Heo static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
185793f7d2dbSTejun Heo {
1858e08d02aaSTejun Heo LIST_HEAD(over_hwa);
1859e08d02aaSTejun Heo LIST_HEAD(inner_walk);
1860e08d02aaSTejun Heo struct ioc_gq *iocg, *tiocg, *root_iocg;
1861e08d02aaSTejun Heo u32 after_sum, over_sum, over_target, gamma;
186293f7d2dbSTejun Heo
1863e08d02aaSTejun Heo /*
1864e08d02aaSTejun Heo * It's pretty unlikely but possible for the total sum of
1865e08d02aaSTejun Heo * hweight_after_donation's to be higher than WEIGHT_ONE, which will
1866e08d02aaSTejun Heo * confuse the following calculations. If such a condition is detected,
1867e08d02aaSTejun Heo * scale down everyone over its full share equally to keep the sum below
1868e08d02aaSTejun Heo * WEIGHT_ONE.
1869e08d02aaSTejun Heo */
1870e08d02aaSTejun Heo after_sum = 0;
1871e08d02aaSTejun Heo over_sum = 0;
187293f7d2dbSTejun Heo list_for_each_entry(iocg, surpluses, surplus_list) {
1873e08d02aaSTejun Heo u32 hwa;
187493f7d2dbSTejun Heo
1875e08d02aaSTejun Heo current_hweight(iocg, &hwa, NULL);
1876e08d02aaSTejun Heo after_sum += iocg->hweight_after_donation;
187793f7d2dbSTejun Heo
1878e08d02aaSTejun Heo if (iocg->hweight_after_donation > hwa) {
1879e08d02aaSTejun Heo over_sum += iocg->hweight_after_donation;
1880e08d02aaSTejun Heo list_add(&iocg->walk_list, &over_hwa);
188193f7d2dbSTejun Heo }
188293f7d2dbSTejun Heo }
188393f7d2dbSTejun Heo
1884e08d02aaSTejun Heo if (after_sum >= WEIGHT_ONE) {
1885e08d02aaSTejun Heo /*
1886e08d02aaSTejun Heo * The delta should be deducted from over_sum; calculate the
1887e08d02aaSTejun Heo * target over_sum value.
1888e08d02aaSTejun Heo */
1889e08d02aaSTejun Heo u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1890e08d02aaSTejun Heo WARN_ON_ONCE(over_sum <= over_delta);
1891e08d02aaSTejun Heo over_target = over_sum - over_delta;
1892e08d02aaSTejun Heo } else {
1893e08d02aaSTejun Heo over_target = 0;
1894e08d02aaSTejun Heo }
1895e08d02aaSTejun Heo
1896e08d02aaSTejun Heo list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1897e08d02aaSTejun Heo if (over_target)
1898e08d02aaSTejun Heo iocg->hweight_after_donation =
1899e08d02aaSTejun Heo div_u64((u64)iocg->hweight_after_donation *
1900e08d02aaSTejun Heo over_target, over_sum);
1901e08d02aaSTejun Heo list_del_init(&iocg->walk_list);
1902e08d02aaSTejun Heo }
1903e08d02aaSTejun Heo
1904e08d02aaSTejun Heo /*
1905e08d02aaSTejun Heo * Build pre-order inner node walk list and prepare for donation
1906e08d02aaSTejun Heo * adjustment calculations.
1907e08d02aaSTejun Heo */
1908e08d02aaSTejun Heo list_for_each_entry(iocg, surpluses, surplus_list) {
1909e08d02aaSTejun Heo iocg_build_inner_walk(iocg, &inner_walk);
1910e08d02aaSTejun Heo }
1911e08d02aaSTejun Heo
1912e08d02aaSTejun Heo root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1913e08d02aaSTejun Heo WARN_ON_ONCE(root_iocg->level > 0);
1914e08d02aaSTejun Heo
1915e08d02aaSTejun Heo list_for_each_entry(iocg, &inner_walk, walk_list) {
1916e08d02aaSTejun Heo iocg->child_adjusted_sum = 0;
1917e08d02aaSTejun Heo iocg->hweight_donating = 0;
1918e08d02aaSTejun Heo iocg->hweight_after_donation = 0;
1919e08d02aaSTejun Heo }
1920e08d02aaSTejun Heo
1921e08d02aaSTejun Heo /*
1922e08d02aaSTejun Heo * Propagate the donating budget (b_t) and after donation budget (b'_t)
1923e08d02aaSTejun Heo * up the hierarchy.
1924e08d02aaSTejun Heo */
1925e08d02aaSTejun Heo list_for_each_entry(iocg, surpluses, surplus_list) {
1926e08d02aaSTejun Heo struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1927e08d02aaSTejun Heo
1928e08d02aaSTejun Heo parent->hweight_donating += iocg->hweight_donating;
1929e08d02aaSTejun Heo parent->hweight_after_donation += iocg->hweight_after_donation;
1930e08d02aaSTejun Heo }
1931e08d02aaSTejun Heo
1932e08d02aaSTejun Heo list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1933e08d02aaSTejun Heo if (iocg->level > 0) {
1934e08d02aaSTejun Heo struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1935e08d02aaSTejun Heo
1936e08d02aaSTejun Heo parent->hweight_donating += iocg->hweight_donating;
1937e08d02aaSTejun Heo parent->hweight_after_donation += iocg->hweight_after_donation;
1938e08d02aaSTejun Heo }
1939e08d02aaSTejun Heo }
1940e08d02aaSTejun Heo
1941e08d02aaSTejun Heo /*
1942e08d02aaSTejun Heo * Calculate inner hwa's (b) and make sure the donation values are
1943e08d02aaSTejun Heo * within the accepted ranges as we're doing low res calculations with
1944e08d02aaSTejun Heo * roundups.
1945e08d02aaSTejun Heo */
1946e08d02aaSTejun Heo list_for_each_entry(iocg, &inner_walk, walk_list) {
1947e08d02aaSTejun Heo if (iocg->level) {
1948e08d02aaSTejun Heo struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1949e08d02aaSTejun Heo
1950e08d02aaSTejun Heo iocg->hweight_active = DIV64_U64_ROUND_UP(
1951e08d02aaSTejun Heo (u64)parent->hweight_active * iocg->active,
1952e08d02aaSTejun Heo parent->child_active_sum);
1953e08d02aaSTejun Heo
1954e08d02aaSTejun Heo }
1955e08d02aaSTejun Heo
1956e08d02aaSTejun Heo iocg->hweight_donating = min(iocg->hweight_donating,
1957e08d02aaSTejun Heo iocg->hweight_active);
1958e08d02aaSTejun Heo iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1959e08d02aaSTejun Heo iocg->hweight_donating - 1);
1960e08d02aaSTejun Heo if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1961e08d02aaSTejun Heo iocg->hweight_donating <= 1 ||
1962e08d02aaSTejun Heo iocg->hweight_after_donation == 0)) {
1963e08d02aaSTejun Heo pr_warn("iocg: invalid donation weights in ");
1964e08d02aaSTejun Heo pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1965e08d02aaSTejun Heo pr_cont(": active=%u donating=%u after=%u\n",
1966e08d02aaSTejun Heo iocg->hweight_active, iocg->hweight_donating,
1967e08d02aaSTejun Heo iocg->hweight_after_donation);
1968e08d02aaSTejun Heo }
1969e08d02aaSTejun Heo }
1970e08d02aaSTejun Heo
1971e08d02aaSTejun Heo /*
1972e08d02aaSTejun Heo * Calculate the global donation rate (gamma) - the rate to adjust
1973769b628dSTejun Heo * non-donating budgets by.
1974769b628dSTejun Heo *
1975769b628dSTejun Heo * No need to use 64bit multiplication here as the first operand is
1976769b628dSTejun Heo * guaranteed to be smaller than WEIGHT_ONE (1<<16).
1977769b628dSTejun Heo *
1978769b628dSTejun Heo * We know that there are beneficiary nodes and the sum of the donating
1979769b628dSTejun Heo * hweights can't be whole; however, due to the round-ups during hweight
1980769b628dSTejun Heo * calculations, root_iocg->hweight_donating might still end up equal to
1981769b628dSTejun Heo * or greater than whole. Limit the range when calculating the divider.
1982e08d02aaSTejun Heo *
1983e08d02aaSTejun Heo * gamma = (1 - t_r') / (1 - t_r)
1984e08d02aaSTejun Heo */
1985e08d02aaSTejun Heo gamma = DIV_ROUND_UP(
1986e08d02aaSTejun Heo (WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
1987769b628dSTejun Heo WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
1988e08d02aaSTejun Heo
1989e08d02aaSTejun Heo /*
1990e08d02aaSTejun Heo * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1991e08d02aaSTejun Heo * nodes.
1992e08d02aaSTejun Heo */
1993e08d02aaSTejun Heo list_for_each_entry(iocg, &inner_walk, walk_list) {
1994e08d02aaSTejun Heo struct ioc_gq *parent;
1995e08d02aaSTejun Heo u32 inuse, wpt, wptp;
1996e08d02aaSTejun Heo u64 st, sf;
1997e08d02aaSTejun Heo
1998e08d02aaSTejun Heo if (iocg->level == 0) {
1999e08d02aaSTejun Heo /* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
2000e08d02aaSTejun Heo iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
2001e08d02aaSTejun Heo iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
2002e08d02aaSTejun Heo WEIGHT_ONE - iocg->hweight_after_donation);
2003e08d02aaSTejun Heo continue;
2004e08d02aaSTejun Heo }
2005e08d02aaSTejun Heo
2006e08d02aaSTejun Heo parent = iocg->ancestors[iocg->level - 1];
2007e08d02aaSTejun Heo
2008e08d02aaSTejun Heo /* b' = gamma * b_f + b_t' */
2009e08d02aaSTejun Heo iocg->hweight_inuse = DIV64_U64_ROUND_UP(
2010e08d02aaSTejun Heo (u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
2011e08d02aaSTejun Heo WEIGHT_ONE) + iocg->hweight_after_donation;
2012e08d02aaSTejun Heo
2013e08d02aaSTejun Heo /* w' = s' * b' / b'_p */
2014e08d02aaSTejun Heo inuse = DIV64_U64_ROUND_UP(
2015e08d02aaSTejun Heo (u64)parent->child_adjusted_sum * iocg->hweight_inuse,
2016e08d02aaSTejun Heo parent->hweight_inuse);
2017e08d02aaSTejun Heo
2018e08d02aaSTejun Heo /* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
2019e08d02aaSTejun Heo st = DIV64_U64_ROUND_UP(
2020e08d02aaSTejun Heo iocg->child_active_sum * iocg->hweight_donating,
2021e08d02aaSTejun Heo iocg->hweight_active);
2022e08d02aaSTejun Heo sf = iocg->child_active_sum - st;
2023e08d02aaSTejun Heo wpt = DIV64_U64_ROUND_UP(
2024e08d02aaSTejun Heo (u64)iocg->active * iocg->hweight_donating,
2025e08d02aaSTejun Heo iocg->hweight_active);
2026e08d02aaSTejun Heo wptp = DIV64_U64_ROUND_UP(
2027e08d02aaSTejun Heo (u64)inuse * iocg->hweight_after_donation,
2028e08d02aaSTejun Heo iocg->hweight_inuse);
2029e08d02aaSTejun Heo
2030e08d02aaSTejun Heo iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
2031e08d02aaSTejun Heo }
2032e08d02aaSTejun Heo
2033e08d02aaSTejun Heo /*
2034e08d02aaSTejun Heo * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
2035e08d02aaSTejun Heo * we can finally determine leaf adjustments.
2036e08d02aaSTejun Heo */
2037e08d02aaSTejun Heo list_for_each_entry(iocg, surpluses, surplus_list) {
2038e08d02aaSTejun Heo struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
2039e08d02aaSTejun Heo u32 inuse;
2040e08d02aaSTejun Heo
2041c421a3ebSTejun Heo /*
2042c421a3ebSTejun Heo * In-debt iocgs participated in the donation calculation with
2043c421a3ebSTejun Heo * the minimum target hweight_inuse. Configuring inuse
2044c421a3ebSTejun Heo * accordingly would work fine but debt handling expects
2045c421a3ebSTejun Heo * @iocg->inuse to stay at the minimum and we don't want to
2046c421a3ebSTejun Heo * interfere.
2047c421a3ebSTejun Heo */
2048c421a3ebSTejun Heo if (iocg->abs_vdebt) {
2049c421a3ebSTejun Heo WARN_ON_ONCE(iocg->inuse > 1);
2050c421a3ebSTejun Heo continue;
2051c421a3ebSTejun Heo }
2052c421a3ebSTejun Heo
2053e08d02aaSTejun Heo /* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
2054e08d02aaSTejun Heo inuse = DIV64_U64_ROUND_UP(
2055e08d02aaSTejun Heo parent->child_adjusted_sum * iocg->hweight_after_donation,
2056e08d02aaSTejun Heo parent->hweight_inuse);
205704603755STejun Heo
205804603755STejun Heo TRACE_IOCG_PATH(inuse_transfer, iocg, now,
205904603755STejun Heo iocg->inuse, inuse,
206004603755STejun Heo iocg->hweight_inuse,
206104603755STejun Heo iocg->hweight_after_donation);
206204603755STejun Heo
2063b0853ab4STejun Heo __propagate_weights(iocg, iocg->active, inuse, true, now);
2064e08d02aaSTejun Heo }
2065e08d02aaSTejun Heo
2066e08d02aaSTejun Heo /* walk list should be dissolved after use */
2067e08d02aaSTejun Heo list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
2068e08d02aaSTejun Heo list_del_init(&iocg->walk_list);
2069e08d02aaSTejun Heo }
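/*
 * A small gamma example (hypothetical numbers): if donating iocgs
 * currently hold a quarter of the device (t_r = 0.25) and will hold a
 * tenth after donation (t_r' = 0.1), then
 * gamma = (1 - 0.1) / (1 - 0.25) = 1.2, so every non-donating budget is
 * scaled up by 20% to absorb the freed capacity.
 */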
2070e08d02aaSTejun Heo
2071ab8df828STejun Heo /*
2072ab8df828STejun Heo * A low weight iocg can amass a large amount of debt, for example, when
2073ab8df828STejun Heo * anonymous memory gets reclaimed aggressively. If the system has a lot of
2074ab8df828STejun Heo * memory paired with a slow IO device, the debt can span multiple seconds or
2075ab8df828STejun Heo * more. If there are no other subsequent IO issuers, the in-debt iocg may end
2076ab8df828STejun Heo * up blocked paying its debt while the IO device is idle.
2077ab8df828STejun Heo *
2078ab8df828STejun Heo * The following protects against such cases. If the device has been
2079d9517841STejun Heo * sufficiently idle for a while, the debts are halved and delays are
2080d9517841STejun Heo * recalculated.
2081ab8df828STejun Heo */
2082ab8df828STejun Heo static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
208333a1fe6dSTejun Heo struct ioc_now *now)
2084ab8df828STejun Heo {
2085ab8df828STejun Heo struct ioc_gq *iocg;
20861ab2cfe1SKonstantin Ovsepian u64 dur, usage_pct, nr_cycles, nr_cycles_shift;
2087c7af2a00STejun Heo
2088c7af2a00STejun Heo /* if no debtor, reset the cycle */
2089c7af2a00STejun Heo if (!nr_debtors) {
2090c7af2a00STejun Heo ioc->dfgv_period_at = now->now;
2091c7af2a00STejun Heo ioc->dfgv_period_rem = 0;
2092c7af2a00STejun Heo ioc->dfgv_usage_us_sum = 0;
2093c7af2a00STejun Heo return;
2094c7af2a00STejun Heo }
2095c7af2a00STejun Heo
2096c7af2a00STejun Heo /*
2097c7af2a00STejun Heo * Debtors can pass through a lot of writes choking the device and we
2098c7af2a00STejun Heo * don't want to be forgiving debts while the device is struggling from
2099c7af2a00STejun Heo * write bursts. If we're missing latency targets, consider the device
2100c7af2a00STejun Heo * fully utilized.
2101c7af2a00STejun Heo */
2102c7af2a00STejun Heo if (ioc->busy_level > 0)
2103c7af2a00STejun Heo usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
2104c7af2a00STejun Heo
2105c7af2a00STejun Heo ioc->dfgv_usage_us_sum += usage_us_sum;
2106c7af2a00STejun Heo if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
2107c7af2a00STejun Heo return;
2108c7af2a00STejun Heo
2109c7af2a00STejun Heo /*
2110c7af2a00STejun Heo * At least DFGV_PERIOD has passed since the last period. Calculate the
2111c7af2a00STejun Heo * average usage and reset the period counters.
2112c7af2a00STejun Heo */
2113c7af2a00STejun Heo dur = now->now - ioc->dfgv_period_at;
2114c7af2a00STejun Heo usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
2115c7af2a00STejun Heo
2116c7af2a00STejun Heo ioc->dfgv_period_at = now->now;
2117c7af2a00STejun Heo ioc->dfgv_usage_us_sum = 0;
2118c7af2a00STejun Heo
2119c7af2a00STejun Heo /* if was too busy, reset everything */
2120c7af2a00STejun Heo if (usage_pct > DFGV_USAGE_PCT) {
2121c7af2a00STejun Heo ioc->dfgv_period_rem = 0;
2122c7af2a00STejun Heo return;
2123c7af2a00STejun Heo }
2124c7af2a00STejun Heo
2125c7af2a00STejun Heo /*
2126c7af2a00STejun Heo * Usage is lower than threshold. Let's forgive some debts. Debt
2127c7af2a00STejun Heo * forgiveness runs off of the usual ioc timer but its period usually
2128c7af2a00STejun Heo * doesn't match ioc's. Compensate the difference by performing the
2129c7af2a00STejun Heo * reduction as many times as would fit in the duration since the last
2130c7af2a00STejun Heo * run and carrying over the left-over duration in @ioc->dfgv_period_rem
2131c7af2a00STejun Heo * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
2132c7af2a00STejun Heo * reductions is doubled.
2133c7af2a00STejun Heo */
2134c7af2a00STejun Heo nr_cycles = dur + ioc->dfgv_period_rem;
2135c7af2a00STejun Heo ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
2136ab8df828STejun Heo
2137ab8df828STejun Heo list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2138c5a6561bSTejun Heo u64 __maybe_unused old_debt, __maybe_unused old_delay;
2139c5a6561bSTejun Heo
2140bec02dbbSTejun Heo if (!iocg->abs_vdebt && !iocg->delay)
2141c7af2a00STejun Heo continue;
2142c5a6561bSTejun Heo
2143ab8df828STejun Heo spin_lock(&iocg->waitq.lock);
2144c5a6561bSTejun Heo
2145c5a6561bSTejun Heo old_debt = iocg->abs_vdebt;
2146c5a6561bSTejun Heo old_delay = iocg->delay;
2147c5a6561bSTejun Heo
21481ab2cfe1SKonstantin Ovsepian nr_cycles_shift = min_t(u64, nr_cycles, BITS_PER_LONG - 1);
2149bec02dbbSTejun Heo if (iocg->abs_vdebt)
21501ab2cfe1SKonstantin Ovsepian iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles_shift ?: 1;
21511ab2cfe1SKonstantin Ovsepian
2152bec02dbbSTejun Heo if (iocg->delay)
21531ab2cfe1SKonstantin Ovsepian iocg->delay = iocg->delay >> nr_cycles_shift ?: 1;
2154bec02dbbSTejun Heo
2155ab8df828STejun Heo iocg_kick_waitq(iocg, true, now);
2156c5a6561bSTejun Heo
2157c5a6561bSTejun Heo TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
2158c5a6561bSTejun Heo old_debt, iocg->abs_vdebt,
2159c5a6561bSTejun Heo old_delay, iocg->delay);
2160c5a6561bSTejun Heo
2161ab8df828STejun Heo spin_unlock(&iocg->waitq.lock);
2162ab8df828STejun Heo }
2163ab8df828STejun Heo }
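/*
 * Carry-over example (hypothetical numbers): if 2.5 * DFGV_PERIOD has
 * elapsed since the last run, nr_cycles becomes 2 with half a period
 * carried in dfgv_period_rem, so each debtor's abs_vdebt and delay are
 * shifted right by 2 (quartered, floored at 1); the remainder lets a
 * future run forgive one extra cycle.
 */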
2164ab8df828STejun Heo
21652474787aSBaolin Wang /*
21662474787aSBaolin Wang * Check the active iocgs' state to avoid oversleeping and deactivate
21672474787aSBaolin Wang * idle iocgs.
21682474787aSBaolin Wang *
21692474787aSBaolin Wang * Since waiters determine the sleep durations based on the vrate
21702474787aSBaolin Wang * they saw at the time of sleep, if vrate has increased, some
21712474787aSBaolin Wang * waiters could be sleeping for too long. Wake up tardy waiters
21722474787aSBaolin Wang * which should have woken up in the last period and expire idle
21732474787aSBaolin Wang * iocgs.
21742474787aSBaolin Wang */
21752474787aSBaolin Wang static int ioc_check_iocgs(struct ioc *ioc, struct ioc_now *now)
21762474787aSBaolin Wang {
21772474787aSBaolin Wang int nr_debtors = 0;
21782474787aSBaolin Wang struct ioc_gq *iocg, *tiocg;
21792474787aSBaolin Wang
21802474787aSBaolin Wang list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
21812474787aSBaolin Wang if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
21822474787aSBaolin Wang !iocg->delay && !iocg_is_idle(iocg))
21832474787aSBaolin Wang continue;
21842474787aSBaolin Wang
21852474787aSBaolin Wang spin_lock(&iocg->waitq.lock);
21862474787aSBaolin Wang
21872474787aSBaolin Wang /* flush wait and indebt stat deltas */
21882474787aSBaolin Wang if (iocg->wait_since) {
21892a371f7dSChengming Zhou iocg->stat.wait_us += now->now - iocg->wait_since;
21902474787aSBaolin Wang iocg->wait_since = now->now;
21912474787aSBaolin Wang }
21922474787aSBaolin Wang if (iocg->indebt_since) {
21932a371f7dSChengming Zhou iocg->stat.indebt_us +=
21942474787aSBaolin Wang now->now - iocg->indebt_since;
21952474787aSBaolin Wang iocg->indebt_since = now->now;
21962474787aSBaolin Wang }
21972474787aSBaolin Wang if (iocg->indelay_since) {
21982a371f7dSChengming Zhou iocg->stat.indelay_us +=
21992474787aSBaolin Wang now->now - iocg->indelay_since;
22002474787aSBaolin Wang iocg->indelay_since = now->now;
22012474787aSBaolin Wang }
22022474787aSBaolin Wang
22032474787aSBaolin Wang if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
22042474787aSBaolin Wang iocg->delay) {
22052474787aSBaolin Wang /* might be oversleeping due to vtime / hweight changes, kick */
22062474787aSBaolin Wang iocg_kick_waitq(iocg, true, now);
22072474787aSBaolin Wang if (iocg->abs_vdebt || iocg->delay)
22082474787aSBaolin Wang nr_debtors++;
22092474787aSBaolin Wang } else if (iocg_is_idle(iocg)) {
22102474787aSBaolin Wang /* no waiter and idle, deactivate */
22112474787aSBaolin Wang u64 vtime = atomic64_read(&iocg->vtime);
22122474787aSBaolin Wang s64 excess;
22132474787aSBaolin Wang
22142474787aSBaolin Wang /*
22152474787aSBaolin Wang * @iocg has been inactive for a full duration and will
22162474787aSBaolin Wang * have a high budget. Account anything above target as
22172474787aSBaolin Wang * error and throw away. On reactivation, it'll start
22182474787aSBaolin Wang * with the target budget.
22192474787aSBaolin Wang */
22202474787aSBaolin Wang excess = now->vnow - vtime - ioc->margins.target;
22212474787aSBaolin Wang if (excess > 0) {
22222474787aSBaolin Wang u32 old_hwi;
22232474787aSBaolin Wang
22242474787aSBaolin Wang current_hweight(iocg, NULL, &old_hwi);
22252474787aSBaolin Wang ioc->vtime_err -= div64_u64(excess * old_hwi,
22262474787aSBaolin Wang WEIGHT_ONE);
22272474787aSBaolin Wang }
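/*
 * The excess is in the iocg's local vtime; scaling by old_hwi /
 * WEIGHT_ONE converts it to device vtime, e.g. a 100us excess at
 * 50% hweight deducts 50us worth from ->vtime_err.
 */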
22282474787aSBaolin Wang
222976efc1c7SBaolin Wang TRACE_IOCG_PATH(iocg_idle, iocg, now,
223076efc1c7SBaolin Wang atomic64_read(&iocg->active_period),
223176efc1c7SBaolin Wang atomic64_read(&ioc->cur_period), vtime);
22322474787aSBaolin Wang __propagate_weights(iocg, 0, 0, false, now);
22332474787aSBaolin Wang list_del_init(&iocg->active_list);
22342474787aSBaolin Wang }
22352474787aSBaolin Wang
22362474787aSBaolin Wang spin_unlock(&iocg->waitq.lock);
22372474787aSBaolin Wang }
22382474787aSBaolin Wang
22392474787aSBaolin Wang commit_weights(ioc);
22402474787aSBaolin Wang return nr_debtors;
22412474787aSBaolin Wang }
22422474787aSBaolin Wang
22437caa4715STejun Heo static void ioc_timer_fn(struct timer_list *timer)
22447caa4715STejun Heo {
22457caa4715STejun Heo struct ioc *ioc = container_of(timer, struct ioc, timer);
22467caa4715STejun Heo struct ioc_gq *iocg, *tiocg;
22477caa4715STejun Heo struct ioc_now now;
22488692d2dbSTejun Heo LIST_HEAD(surpluses);
22492474787aSBaolin Wang int nr_debtors, nr_shortages = 0, nr_lagging = 0;
2250dda1315fSTejun Heo u64 usage_us_sum = 0;
2251074501bcSYu Kuai u32 ppm_rthr;
2252074501bcSYu Kuai u32 ppm_wthr;
22537caa4715STejun Heo u32 missed_ppm[2], rq_wait_pct;
22547caa4715STejun Heo u64 period_vtime;
2255f1de2439STejun Heo int prev_busy_level;
22567caa4715STejun Heo
22577caa4715STejun Heo /* how were the latencies during the period? */
22587caa4715STejun Heo ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
22597caa4715STejun Heo
22607caa4715STejun Heo /* take care of active iocgs */
22617caa4715STejun Heo spin_lock_irq(&ioc->lock);
22627caa4715STejun Heo
2263074501bcSYu Kuai ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
2264074501bcSYu Kuai ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
22657caa4715STejun Heo ioc_now(ioc, &now);
22667caa4715STejun Heo
22677caa4715STejun Heo period_vtime = now.vnow - ioc->period_at_vtime;
22687caa4715STejun Heo if (WARN_ON_ONCE(!period_vtime)) {
22697caa4715STejun Heo spin_unlock_irq(&ioc->lock);
22707caa4715STejun Heo return;
22717caa4715STejun Heo }
22727caa4715STejun Heo
22732474787aSBaolin Wang nr_debtors = ioc_check_iocgs(ioc, &now);
22747caa4715STejun Heo
2275f0bf84a5STejun Heo /*
2276f0bf84a5STejun Heo * Wait and indebt stat are flushed above and the donation calculation
2277f0bf84a5STejun Heo * below needs updated usage stat. Let's bring stat up-to-date.
2278f0bf84a5STejun Heo */
2279f0bf84a5STejun Heo iocg_flush_stat(&ioc->active_iocgs, &now);
2280f0bf84a5STejun Heo
2281f1de2439STejun Heo /* calc usage and see whether some weights need to be moved around */
22827caa4715STejun Heo list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2283c09245f6SBaolin Wang u64 vdone, vtime, usage_us;
2284c09245f6SBaolin Wang u32 hw_active, hw_inuse;
22857caa4715STejun Heo
22867caa4715STejun Heo /*
22877caa4715STejun Heo * Collect unused and wind vtime closer to vnow to prevent
22887caa4715STejun Heo * iocgs from accumulating a large amount of budget.
22897caa4715STejun Heo */
22907caa4715STejun Heo vdone = atomic64_read(&iocg->done_vtime);
22917caa4715STejun Heo vtime = atomic64_read(&iocg->vtime);
22927caa4715STejun Heo current_hweight(iocg, &hw_active, &hw_inuse);
22937caa4715STejun Heo
22947caa4715STejun Heo /*
22957caa4715STejun Heo * Latency QoS detection doesn't account for IOs which are
22967caa4715STejun Heo * in-flight for longer than a period. Detect them by
22977caa4715STejun Heo * comparing vdone against period start. If lagging behind
22987caa4715STejun Heo * IOs from past periods, don't increase vrate.
22997caa4715STejun Heo */
23007cd806a9STejun Heo if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
23017cd806a9STejun Heo !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
23027caa4715STejun Heo time_after64(vtime, vdone) &&
23037caa4715STejun Heo time_after64(vtime, now.vnow -
23047caa4715STejun Heo MAX_LAGGING_PERIODS * period_vtime) &&
23057caa4715STejun Heo time_before64(vdone, now.vnow - period_vtime))
23067caa4715STejun Heo nr_lagging++;
23077caa4715STejun Heo
23087caa4715STejun Heo /*
2309f1de2439STejun Heo * Determine absolute usage factoring in in-flight IOs to avoid
2310f1de2439STejun Heo * high-latency completions appearing as idle.
23117caa4715STejun Heo */
23121aa50d02STejun Heo usage_us = iocg->usage_delta_us;
2313dda1315fSTejun Heo usage_us_sum += usage_us;
2314f1de2439STejun Heo
2315c09245f6SBaolin Wang /* see whether there's surplus vtime */
2316c09245f6SBaolin Wang WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2317c09245f6SBaolin Wang if (hw_inuse < hw_active ||
2318c09245f6SBaolin Wang (!waitqueue_active(&iocg->waitq) &&
2319c09245f6SBaolin Wang time_before64(vtime, now.vnow - ioc->margins.low))) {
2320c09245f6SBaolin Wang u32 hwa, old_hwi, hwm, new_hwi, usage;
2321c09245f6SBaolin Wang u64 usage_dur;
2322c09245f6SBaolin Wang
23231aa50d02STejun Heo if (vdone != vtime) {
23241aa50d02STejun Heo u64 inflight_us = DIV64_U64_ROUND_UP(
23251aa50d02STejun Heo cost_to_abs_cost(vtime - vdone, hw_inuse),
2326ac33e91eSTejun Heo ioc->vtime_base_rate);
2327c09245f6SBaolin Wang
23281aa50d02STejun Heo usage_us = max(usage_us, inflight_us);
23291aa50d02STejun Heo }
23307caa4715STejun Heo
2331f1de2439STejun Heo /* convert to hweight based usage ratio */
23321aa50d02STejun Heo if (time_after64(iocg->activated_at, ioc->period_at))
2333f1de2439STejun Heo usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
23341aa50d02STejun Heo else
2335f1de2439STejun Heo usage_dur = max_t(u64, now.now - ioc->period_at, 1);
23361aa50d02STejun Heo
2337f1de2439STejun Heo usage = clamp_t(u32,
2338f1de2439STejun Heo DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
2339f1de2439STejun Heo usage_dur),
234093f7d2dbSTejun Heo 1, WEIGHT_ONE);
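/*
 * e.g. 1500us of absolute usage over a 3000us window clamps to
 * WEIGHT_ONE / 2, i.e. the iocg consumed roughly half of the
 * device's time during the period.
 */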
23417caa4715STejun Heo
234293f7d2dbSTejun Heo /*
234393f7d2dbSTejun Heo * Already donating or accumulated enough to start.
234493f7d2dbSTejun Heo * Determine the donation amount.
234593f7d2dbSTejun Heo */
2346ac33e91eSTejun Heo current_hweight(iocg, &hwa, &old_hwi);
234793f7d2dbSTejun Heo hwm = current_hweight_max(iocg);
2348ac33e91eSTejun Heo new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
2349ac33e91eSTejun Heo usage, &now);
2350edaa2633STejun Heo /*
2351edaa2633STejun Heo * Donation calculation assumes hweight_after_donation
2352edaa2633STejun Heo * to be positive, a condition that a donor w/ hwa < 2
2353edaa2633STejun Heo * can't meet. Don't bother with donation if hwa is
2354edaa2633STejun Heo * below 2. It's not gonna make a meaningful difference
2355edaa2633STejun Heo * anyway.
2356edaa2633STejun Heo */
2357edaa2633STejun Heo if (new_hwi < hwm && hwa >= 2) {
2358e08d02aaSTejun Heo iocg->hweight_donating = hwa;
235993f7d2dbSTejun Heo iocg->hweight_after_donation = new_hwi;
236093f7d2dbSTejun Heo list_add(&iocg->surplus_list, &surpluses);
23618c936f9eSTejun Heo } else if (!iocg->abs_vdebt) {
23628c936f9eSTejun Heo /*
23638c936f9eSTejun Heo * @iocg doesn't have enough to donate. Reset
23648c936f9eSTejun Heo * its inuse to active.
23658c936f9eSTejun Heo *
23668c936f9eSTejun Heo * Don't reset debtors as their inuse's are
23678c936f9eSTejun Heo * owned by debt handling. This shouldn't affect
23688c936f9eSTejun Heo * donation calculation in any meaningful way
23698c936f9eSTejun Heo * as @iocg doesn't have a meaningful amount of
23708c936f9eSTejun Heo * share anyway.
23718c936f9eSTejun Heo */
237204603755STejun Heo TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
237304603755STejun Heo iocg->inuse, iocg->active,
237404603755STejun Heo iocg->hweight_inuse, new_hwi);
237504603755STejun Heo
237693f7d2dbSTejun Heo __propagate_weights(iocg, iocg->active,
2377b0853ab4STejun Heo iocg->active, true, &now);
237893f7d2dbSTejun Heo nr_shortages++;
237993f7d2dbSTejun Heo }
238093f7d2dbSTejun Heo } else {
238193f7d2dbSTejun Heo /* genuinely short on vtime */
238293f7d2dbSTejun Heo nr_shortages++;
23837caa4715STejun Heo }
23847caa4715STejun Heo }
238593f7d2dbSTejun Heo
238693f7d2dbSTejun Heo if (!list_empty(&surpluses) && nr_shortages)
238793f7d2dbSTejun Heo transfer_surpluses(&surpluses, &now);
238893f7d2dbSTejun Heo
238900410f1bSTejun Heo commit_weights(ioc);
23907caa4715STejun Heo
23918692d2dbSTejun Heo /* surplus list should be dissolved after use */
23928692d2dbSTejun Heo list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
23938692d2dbSTejun Heo list_del_init(&iocg->surplus_list);
23948692d2dbSTejun Heo
2395dda1315fSTejun Heo /*
23967caa4715STejun Heo * If q is getting clogged or we're missing too much, we're issuing
23977caa4715STejun Heo * too much IO and should lower vtime rate. If we're not missing
23987caa4715STejun Heo * and experiencing shortages but not surpluses, we're too stingy
23997caa4715STejun Heo * and should increase vtime rate.
24007caa4715STejun Heo */
240125d41e4aSTejun Heo prev_busy_level = ioc->busy_level;
24027caa4715STejun Heo if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
24037caa4715STejun Heo missed_ppm[READ] > ppm_rthr ||
24047caa4715STejun Heo missed_ppm[WRITE] > ppm_wthr) {
240581ca627aSTejun Heo /* clearly missing QoS targets, slow down vrate */
24067caa4715STejun Heo ioc->busy_level = max(ioc->busy_level, 0);
24077caa4715STejun Heo ioc->busy_level++;
24087cd806a9STejun Heo } else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
24097caa4715STejun Heo missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
24107caa4715STejun Heo missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
241181ca627aSTejun Heo /* QoS targets are being met with >25% margin */
241281ca627aSTejun Heo if (nr_shortages) {
241381ca627aSTejun Heo /*
241481ca627aSTejun Heo * We're throttling while the device has spare
241581ca627aSTejun Heo * capacity. If vrate was being slowed down, stop.
241681ca627aSTejun Heo */
24177caa4715STejun Heo ioc->busy_level = min(ioc->busy_level, 0);
241881ca627aSTejun Heo
241981ca627aSTejun Heo /*
242081ca627aSTejun Heo * If there are IOs spanning multiple periods, wait
2421065655c8STejun Heo * them out before pushing the device harder.
242281ca627aSTejun Heo */
2423065655c8STejun Heo if (!nr_lagging)
24247caa4715STejun Heo ioc->busy_level--;
242581ca627aSTejun Heo } else {
242681ca627aSTejun Heo /*
242781ca627aSTejun Heo * Nobody is being throttled and the users aren't
242881ca627aSTejun Heo * issuing enough IOs to saturate the device. We
242981ca627aSTejun Heo * simply don't know how close the device is to
243081ca627aSTejun Heo * saturation. Coast.
243181ca627aSTejun Heo */
243281ca627aSTejun Heo ioc->busy_level = 0;
24337cd806a9STejun Heo }
24347caa4715STejun Heo } else {
243581ca627aSTejun Heo /* inside the hysteresis margin, we're good */
24367caa4715STejun Heo ioc->busy_level = 0;
24377caa4715STejun Heo }
24387caa4715STejun Heo
24397caa4715STejun Heo ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
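/*
 * e.g. ten straight periods of missed QoS targets leave busy_level at
 * +10 while ten underutilized ones leave it at -10; the clamp bounds
 * how much momentum either direction can accumulate before
 * ioc_adjust_base_vrate() acts on it.
 */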
24407caa4715STejun Heo
2441926f75f6SBaolin Wang ioc_adjust_base_vrate(ioc, rq_wait_pct, nr_lagging, nr_shortages,
2442926f75f6SBaolin Wang prev_busy_level, missed_ppm);
24437caa4715STejun Heo
24447caa4715STejun Heo ioc_refresh_params(ioc, false);
24457caa4715STejun Heo
244633a1fe6dSTejun Heo ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
244733a1fe6dSTejun Heo
24487caa4715STejun Heo /*
24497caa4715STejun Heo * This period is done. Move onto the next one. If nothing's
24507caa4715STejun Heo * going on with the device, stop the timer.
24517caa4715STejun Heo */
24527caa4715STejun Heo atomic64_inc(&ioc->cur_period);
24537caa4715STejun Heo
24547caa4715STejun Heo if (ioc->running != IOC_STOP) {
24557caa4715STejun Heo if (!list_empty(&ioc->active_iocgs)) {
24567caa4715STejun Heo ioc_start_period(ioc, &now);
24577caa4715STejun Heo } else {
24587caa4715STejun Heo ioc->busy_level = 0;
2459ac33e91eSTejun Heo ioc->vtime_err = 0;
24607caa4715STejun Heo ioc->running = IOC_IDLE;
24617caa4715STejun Heo }
2462ac33e91eSTejun Heo
2463ac33e91eSTejun Heo ioc_refresh_vrate(ioc, &now);
24647caa4715STejun Heo }
24657caa4715STejun Heo
24667caa4715STejun Heo spin_unlock_irq(&ioc->lock);
24677caa4715STejun Heo }
24687caa4715STejun Heo
2469b0853ab4STejun Heo static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
2470b0853ab4STejun Heo u64 abs_cost, struct ioc_now *now)
2471b0853ab4STejun Heo {
2472b0853ab4STejun Heo struct ioc *ioc = iocg->ioc;
2473b0853ab4STejun Heo struct ioc_margins *margins = &ioc->margins;
247404603755STejun Heo u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
2475aa67db24STejun Heo u32 hwi, adj_step;
2476b0853ab4STejun Heo s64 margin;
2477b0853ab4STejun Heo u64 cost, new_inuse;
24788d211554SLi Nan unsigned long flags;
2479b0853ab4STejun Heo
2480b0853ab4STejun Heo current_hweight(iocg, NULL, &hwi);
248104603755STejun Heo old_hwi = hwi;
2482b0853ab4STejun Heo cost = abs_cost_to_cost(abs_cost, hwi);
2483b0853ab4STejun Heo margin = now->vnow - vtime - cost;
2484b0853ab4STejun Heo
2485c421a3ebSTejun Heo /* debt handling owns inuse for debtors */
2486c421a3ebSTejun Heo if (iocg->abs_vdebt)
2487c421a3ebSTejun Heo return cost;
2488c421a3ebSTejun Heo
2489b0853ab4STejun Heo /*
24905ba1add2SBaolin Wang * We only increase inuse during a period and do so only if the margin
2491b0853ab4STejun Heo * has deteriorated since the previous adjustment.
2492b0853ab4STejun Heo */
2493b0853ab4STejun Heo if (margin >= iocg->saved_margin || margin >= margins->low ||
2494b0853ab4STejun Heo iocg->inuse == iocg->active)
2495b0853ab4STejun Heo return cost;
2496b0853ab4STejun Heo
24978d211554SLi Nan spin_lock_irqsave(&ioc->lock, flags);
2498b0853ab4STejun Heo
2499b0853ab4STejun Heo /* we own inuse only when @iocg is in the normal active state */
2500c421a3ebSTejun Heo if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
25018d211554SLi Nan spin_unlock_irqrestore(&ioc->lock, flags);
2502b0853ab4STejun Heo return cost;
2503b0853ab4STejun Heo }
2504b0853ab4STejun Heo
2505aa67db24STejun Heo /*
2506aa67db24STejun Heo * Bump up inuse till @abs_cost fits in the existing budget.
2507aa67db24STejun Heo * adj_step must be determined after acquiring ioc->lock - we might
2508aa67db24STejun Heo * have raced with another thread for activation and lost, in which
2509aa67db24STejun Heo * case we could read a zero iocg->active before taking ioc->lock,
2510aa67db24STejun Heo * leading to an infinite loop below.
2511aa67db24STejun Heo */
2512b0853ab4STejun Heo new_inuse = iocg->inuse;
2513aa67db24STejun Heo adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
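/*
 * e.g. with the INUSE_ADJ_STEP_PCT of 25 defined earlier in this file,
 * active == 8000 gives adj_step == 2000; each pass below grows inuse
 * by that much until the recomputed cost fits before vnow or inuse
 * reaches active.
 */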
2514b0853ab4STejun Heo do {
2515b0853ab4STejun Heo new_inuse = new_inuse + adj_step;
2516b0853ab4STejun Heo propagate_weights(iocg, iocg->active, new_inuse, true, now);
2517b0853ab4STejun Heo current_hweight(iocg, NULL, &hwi);
2518b0853ab4STejun Heo cost = abs_cost_to_cost(abs_cost, hwi);
2519b0853ab4STejun Heo } while (time_after64(vtime + cost, now->vnow) &&
2520b0853ab4STejun Heo iocg->inuse != iocg->active);
2521b0853ab4STejun Heo
25228d211554SLi Nan spin_unlock_irqrestore(&ioc->lock, flags);
252304603755STejun Heo
252404603755STejun Heo TRACE_IOCG_PATH(inuse_adjust, iocg, now,
252504603755STejun Heo old_inuse, iocg->inuse, old_hwi, hwi);
252604603755STejun Heo
2527b0853ab4STejun Heo return cost;
2528b0853ab4STejun Heo }
2529b0853ab4STejun Heo
25307caa4715STejun Heo static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
25317caa4715STejun Heo bool is_merge, u64 *costp)
25327caa4715STejun Heo {
25337caa4715STejun Heo struct ioc *ioc = iocg->ioc;
25347caa4715STejun Heo u64 coef_seqio, coef_randio, coef_page;
25357caa4715STejun Heo u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
25367caa4715STejun Heo u64 seek_pages = 0;
25377caa4715STejun Heo u64 cost = 0;
25387caa4715STejun Heo
2539013adcbeSChengming Zhou /* Can't calculate cost for empty bio */
2540013adcbeSChengming Zhou if (!bio->bi_iter.bi_size)
2541013adcbeSChengming Zhou goto out;
2542013adcbeSChengming Zhou
25437caa4715STejun Heo switch (bio_op(bio)) {
25447caa4715STejun Heo case REQ_OP_READ:
25457caa4715STejun Heo coef_seqio = ioc->params.lcoefs[LCOEF_RSEQIO];
25467caa4715STejun Heo coef_randio = ioc->params.lcoefs[LCOEF_RRANDIO];
25477caa4715STejun Heo coef_page = ioc->params.lcoefs[LCOEF_RPAGE];
25487caa4715STejun Heo break;
25497caa4715STejun Heo case REQ_OP_WRITE:
25507caa4715STejun Heo coef_seqio = ioc->params.lcoefs[LCOEF_WSEQIO];
25517caa4715STejun Heo coef_randio = ioc->params.lcoefs[LCOEF_WRANDIO];
25527caa4715STejun Heo coef_page = ioc->params.lcoefs[LCOEF_WPAGE];
25537caa4715STejun Heo break;
25547caa4715STejun Heo default:
25557caa4715STejun Heo goto out;
25567caa4715STejun Heo }
25577caa4715STejun Heo
25587caa4715STejun Heo if (iocg->cursor) {
25597caa4715STejun Heo seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
25607caa4715STejun Heo seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
25617caa4715STejun Heo }
25627caa4715STejun Heo
25637caa4715STejun Heo if (!is_merge) {
25647caa4715STejun Heo if (seek_pages > LCOEF_RANDIO_PAGES) {
25657caa4715STejun Heo cost += coef_randio;
25667caa4715STejun Heo } else {
25677caa4715STejun Heo cost += coef_seqio;
25687caa4715STejun Heo }
25697caa4715STejun Heo }
25707caa4715STejun Heo cost += pages * coef_page;
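/*
 * e.g. a non-merge 64KB read that seeks: 128 sectors >>
 * IOC_SECT_TO_PAGE_SHIFT makes pages == 16, so the total is
 * coef_randio + 16 * coef_page; coefficient values come from
 * ioc->params and differ per device class.
 */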
25717caa4715STejun Heo out:
25727caa4715STejun Heo *costp = cost;
25737caa4715STejun Heo }
25747caa4715STejun Heo
25757caa4715STejun Heo static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
25767caa4715STejun Heo {
25777caa4715STejun Heo u64 cost;
25787caa4715STejun Heo
25797caa4715STejun Heo calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
25807caa4715STejun Heo return cost;
25817caa4715STejun Heo }
25827caa4715STejun Heo
2583cd006509STejun Heo static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2584cd006509STejun Heo u64 *costp)
2585cd006509STejun Heo {
2586cd006509STejun Heo unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2587cd006509STejun Heo
2588cd006509STejun Heo switch (req_op(rq)) {
2589cd006509STejun Heo case REQ_OP_READ:
2590cd006509STejun Heo *costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2591cd006509STejun Heo break;
2592cd006509STejun Heo case REQ_OP_WRITE:
2593cd006509STejun Heo *costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2594cd006509STejun Heo break;
2595cd006509STejun Heo default:
2596cd006509STejun Heo *costp = 0;
2597cd006509STejun Heo }
2598cd006509STejun Heo }
2599cd006509STejun Heo
2600cd006509STejun Heo static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2601cd006509STejun Heo {
2602cd006509STejun Heo u64 cost;
2603cd006509STejun Heo
2604cd006509STejun Heo calc_size_vtime_cost_builtin(rq, ioc, &cost);
2605cd006509STejun Heo return cost;
2606cd006509STejun Heo }
2607cd006509STejun Heo
26087caa4715STejun Heo static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
26097caa4715STejun Heo {
26107caa4715STejun Heo struct blkcg_gq *blkg = bio->bi_blkg;
26117caa4715STejun Heo struct ioc *ioc = rqos_to_ioc(rqos);
26127caa4715STejun Heo struct ioc_gq *iocg = blkg_to_iocg(blkg);
26137caa4715STejun Heo struct ioc_now now;
26147caa4715STejun Heo struct iocg_wait wait;
26157caa4715STejun Heo u64 abs_cost, cost, vtime;
2616da437b95STejun Heo bool use_debt, ioc_locked;
2617da437b95STejun Heo unsigned long flags;
26187caa4715STejun Heo
2619d16baa3fSTejun Heo /* bypass IOs if disabled, still initializing, or for root cgroup */
2620d16baa3fSTejun Heo if (!ioc->enabled || !iocg || !iocg->level)
26217caa4715STejun Heo return;
26227caa4715STejun Heo
26237caa4715STejun Heo /* calculate the absolute vtime cost */
26247caa4715STejun Heo abs_cost = calc_vtime_cost(bio, iocg, false);
26257caa4715STejun Heo if (!abs_cost)
26267caa4715STejun Heo return;
26277caa4715STejun Heo
2628f1de2439STejun Heo if (!iocg_activate(iocg, &now))
2629f1de2439STejun Heo return;
2630f1de2439STejun Heo
26317caa4715STejun Heo iocg->cursor = bio_end_sector(bio);
26327caa4715STejun Heo vtime = atomic64_read(&iocg->vtime);
2633b0853ab4STejun Heo cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
26347caa4715STejun Heo
26357caa4715STejun Heo /*
26367caa4715STejun Heo * If no one's waiting and within budget, issue right away. The
26377caa4715STejun Heo * tests are racy but the races aren't systemic - we only miss once
26387caa4715STejun Heo * in a while which is fine.
26397caa4715STejun Heo */
26400b80f986STejun Heo if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
26417caa4715STejun Heo time_before_eq64(vtime + cost, now.vnow)) {
264297eb1975STejun Heo iocg_commit_bio(iocg, bio, abs_cost, cost);
26437caa4715STejun Heo return;
26447caa4715STejun Heo }
26457caa4715STejun Heo
264636a52481STejun Heo /*
2647da437b95STejun Heo * We're over budget. This can be handled in two ways. IOs which may
2648da437b95STejun Heo * cause priority inversions are punted to @ioc->aux_iocg and charged as
2649da437b95STejun Heo * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2650da437b95STejun Heo * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2651da437b95STejun Heo * whether debt handling is needed and acquire locks accordingly.
26520b80f986STejun Heo */
2653da437b95STejun Heo use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2654da437b95STejun Heo ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
2655b0853ab4STejun Heo retry_lock:
2656da437b95STejun Heo iocg_lock(iocg, ioc_locked, &flags);
2657da437b95STejun Heo
2658da437b95STejun Heo /*
2659da437b95STejun Heo * @iocg must stay activated for debt and waitq handling. Deactivation
2660da437b95STejun Heo * is synchronized against both ioc->lock and waitq.lock and we won't
2661da437b95STejun Heo * get deactivated as long as we're waiting or have debt, so we're good
2662da437b95STejun Heo * if we're activated here. In the unlikely case that we aren't, just
2663da437b95STejun Heo * issue the IO.
2664da437b95STejun Heo */
26650b80f986STejun Heo if (unlikely(list_empty(&iocg->active_list))) {
2666da437b95STejun Heo iocg_unlock(iocg, ioc_locked, &flags);
266797eb1975STejun Heo iocg_commit_bio(iocg, bio, abs_cost, cost);
26680b80f986STejun Heo return;
26690b80f986STejun Heo }
26700b80f986STejun Heo
26710b80f986STejun Heo /*
26720b80f986STejun Heo * We're over budget. If @bio has to be issued regardless, remember
26730b80f986STejun Heo * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
26740b80f986STejun Heo * off the debt before waking more IOs.
26750b80f986STejun Heo *
267636a52481STejun Heo * This way, the debt is continuously paid off each period with the
26770b80f986STejun Heo * actual budget available to the cgroup. If we just wound vtime, we
26780b80f986STejun Heo * would incorrectly use the current hw_inuse for the entire amount
26790b80f986STejun Heo * which, for example, can lead to the cgroup staying blocked for a
26800b80f986STejun Heo * long time even with substantially raised hw_inuse.
26810b80f986STejun Heo *
26820b80f986STejun Heo * An iocg with vdebt should stay online so that the timer can keep
26830b80f986STejun Heo * deducting its vdebt and [de]activate use_delay mechanism
26840b80f986STejun Heo * accordingly. We don't want to race against the timer trying to
26850b80f986STejun Heo * clear them and leave @iocg inactive w/ dangling use_delay heavily
26860b80f986STejun Heo * penalizing the cgroup and its descendants.
268736a52481STejun Heo */
2688da437b95STejun Heo if (use_debt) {
2689c421a3ebSTejun Heo iocg_incur_debt(iocg, abs_cost, &now);
269054c52e10STejun Heo if (iocg_kick_delay(iocg, &now))
2691ba91c849SChristoph Hellwig blkcg_schedule_throttle(rqos->disk,
2692d7bd15a1STejun Heo (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2693da437b95STejun Heo iocg_unlock(iocg, ioc_locked, &flags);
26947caa4715STejun Heo return;
26957caa4715STejun Heo }
26967caa4715STejun Heo
2697b0853ab4STejun Heo /* guarantee that iocgs w/ waiters have maximum inuse */
2698c421a3ebSTejun Heo if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
2699b0853ab4STejun Heo if (!ioc_locked) {
2700b0853ab4STejun Heo iocg_unlock(iocg, false, &flags);
2701b0853ab4STejun Heo ioc_locked = true;
2702b0853ab4STejun Heo goto retry_lock;
2703b0853ab4STejun Heo }
2704b0853ab4STejun Heo propagate_weights(iocg, iocg->active, iocg->active, true,
2705b0853ab4STejun Heo &now);
2706b0853ab4STejun Heo }
2707b0853ab4STejun Heo
27087caa4715STejun Heo /*
27097caa4715STejun Heo * Append self to the waitq and schedule the wakeup timer if we're
27107caa4715STejun Heo * the first waiter. The timer duration is calculated based on the
27117caa4715STejun Heo * current vrate. vtime and hweight changes can make it too short
27127caa4715STejun Heo * or too long. Each wait entry records the absolute cost it's
27137caa4715STejun Heo * waiting for to allow re-evaluation using a custom wait entry.
27147caa4715STejun Heo *
27157caa4715STejun Heo * If too short, the timer simply reschedules itself. If too long,
27167caa4715STejun Heo * the period timer will notice and trigger wakeups.
27177caa4715STejun Heo *
27187caa4715STejun Heo * All waiters are on iocg->waitq and the wait states are
27197caa4715STejun Heo * synchronized using waitq.lock.
27207caa4715STejun Heo */
27217caa4715STejun Heo init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
27227caa4715STejun Heo wait.wait.private = current;
27237caa4715STejun Heo wait.bio = bio;
27247caa4715STejun Heo wait.abs_cost = abs_cost;
27257caa4715STejun Heo wait.committed = false; /* will be set true by waker */
27267caa4715STejun Heo
27277caa4715STejun Heo __add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
2728da437b95STejun Heo iocg_kick_waitq(iocg, ioc_locked, &now);
27297caa4715STejun Heo
2730da437b95STejun Heo iocg_unlock(iocg, ioc_locked, &flags);
27317caa4715STejun Heo
27327caa4715STejun Heo while (true) {
27337caa4715STejun Heo set_current_state(TASK_UNINTERRUPTIBLE);
27347caa4715STejun Heo if (wait.committed)
27357caa4715STejun Heo break;
27367caa4715STejun Heo io_schedule();
27377caa4715STejun Heo }
27387caa4715STejun Heo
27397caa4715STejun Heo /* waker already committed us, proceed */
27407caa4715STejun Heo finish_wait(&iocg->waitq, &wait.wait);
27417caa4715STejun Heo }
27427caa4715STejun Heo
27437caa4715STejun Heo static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
27447caa4715STejun Heo struct bio *bio)
27457caa4715STejun Heo {
27467caa4715STejun Heo struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2747d16baa3fSTejun Heo struct ioc *ioc = rqos_to_ioc(rqos);
27487caa4715STejun Heo sector_t bio_end = bio_end_sector(bio);
2749e1518f63STejun Heo struct ioc_now now;
2750b0853ab4STejun Heo u64 vtime, abs_cost, cost;
27510b80f986STejun Heo unsigned long flags;
27527caa4715STejun Heo
2753d16baa3fSTejun Heo /* bypass if disabled, still initializing, or for root cgroup */
2754d16baa3fSTejun Heo if (!ioc->enabled || !iocg || !iocg->level)
27557caa4715STejun Heo return;
27567caa4715STejun Heo
27577caa4715STejun Heo abs_cost = calc_vtime_cost(bio, iocg, true);
27587caa4715STejun Heo if (!abs_cost)
27597caa4715STejun Heo return;
27607caa4715STejun Heo
2761e1518f63STejun Heo ioc_now(ioc, &now);
2762b0853ab4STejun Heo
2763b0853ab4STejun Heo vtime = atomic64_read(&iocg->vtime);
2764b0853ab4STejun Heo cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2765e1518f63STejun Heo
27667caa4715STejun Heo /* update cursor if backmerging into the request at the cursor */
27677caa4715STejun Heo if (blk_rq_pos(rq) < bio_end &&
27687caa4715STejun Heo blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
27697caa4715STejun Heo iocg->cursor = bio_end;
27707caa4715STejun Heo
2771e1518f63STejun Heo /*
27720b80f986STejun Heo * Charge if there's enough vtime budget and the existing request has
27730b80f986STejun Heo * cost assigned.
2774e1518f63STejun Heo */
2775e1518f63STejun Heo if (rq->bio && rq->bio->bi_iocost_cost &&
27760b80f986STejun Heo time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
277797eb1975STejun Heo iocg_commit_bio(iocg, bio, abs_cost, cost);
27780b80f986STejun Heo return;
27790b80f986STejun Heo }
27800b80f986STejun Heo
27810b80f986STejun Heo /*
27820b80f986STejun Heo * Otherwise, account it as debt if @iocg is online, which it should
27830b80f986STejun Heo * be for the vast majority of cases. See debt handling in
27840b80f986STejun Heo * ioc_rqos_throttle() for details.
27850b80f986STejun Heo */
2786c421a3ebSTejun Heo spin_lock_irqsave(&ioc->lock, flags);
2787c421a3ebSTejun Heo spin_lock(&iocg->waitq.lock);
2788c421a3ebSTejun Heo
27890b80f986STejun Heo if (likely(!list_empty(&iocg->active_list))) {
2790c421a3ebSTejun Heo iocg_incur_debt(iocg, abs_cost, &now);
2791c421a3ebSTejun Heo if (iocg_kick_delay(iocg, &now))
2792ba91c849SChristoph Hellwig blkcg_schedule_throttle(rqos->disk,
2793c421a3ebSTejun Heo (bio->bi_opf & REQ_SWAP) == REQ_SWAP);
27940b80f986STejun Heo } else {
279597eb1975STejun Heo iocg_commit_bio(iocg, bio, abs_cost, cost);
27960b80f986STejun Heo }
2797c421a3ebSTejun Heo
2798c421a3ebSTejun Heo spin_unlock(&iocg->waitq.lock);
2799c421a3ebSTejun Heo spin_unlock_irqrestore(&ioc->lock, flags);
28007caa4715STejun Heo }
28017caa4715STejun Heo
28027caa4715STejun Heo static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
28037caa4715STejun Heo {
28047caa4715STejun Heo struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
28057caa4715STejun Heo
28067caa4715STejun Heo if (iocg && bio->bi_iocost_cost)
28077caa4715STejun Heo atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
28087caa4715STejun Heo }
28097caa4715STejun Heo
28107caa4715STejun Heo static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
28117caa4715STejun Heo {
28127caa4715STejun Heo struct ioc *ioc = rqos_to_ioc(rqos);
28135e124f74STejun Heo struct ioc_pcpu_stat *ccs;
2814cd006509STejun Heo u64 on_q_ns, rq_wait_ns, size_nsec;
28157caa4715STejun Heo int pidx, rw;
28167caa4715STejun Heo
28177caa4715STejun Heo if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
28187caa4715STejun Heo return;
28197caa4715STejun Heo
282062c159a0SBart Van Assche switch (req_op(rq)) {
28217caa4715STejun Heo case REQ_OP_READ:
28227caa4715STejun Heo pidx = QOS_RLAT;
28237caa4715STejun Heo rw = READ;
28247caa4715STejun Heo break;
28257caa4715STejun Heo case REQ_OP_WRITE:
28267caa4715STejun Heo pidx = QOS_WLAT;
28277caa4715STejun Heo rw = WRITE;
28287caa4715STejun Heo break;
28297caa4715STejun Heo default:
28307caa4715STejun Heo return;
28317caa4715STejun Heo }
28327caa4715STejun Heo
28337caa4715STejun Heo on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
28347caa4715STejun Heo rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
2835cd006509STejun Heo size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
28367caa4715STejun Heo
28375e124f74STejun Heo ccs = get_cpu_ptr(ioc->pcpu_stat);
28385e124f74STejun Heo
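/*
 * e.g. against a 5000us rlat target, a read that sat on the queue for
 * 8ms with a 2ms size cost has 6ms of unattributed latency and counts
 * as missed; subtracting size_nsec keeps large transfers from being
 * penalized for their inherent transfer time.
 */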
2839cd006509STejun Heo if (on_q_ns <= size_nsec ||
2840cd006509STejun Heo on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
28415e124f74STejun Heo local_inc(&ccs->missed[rw].nr_met);
28427caa4715STejun Heo else
28435e124f74STejun Heo local_inc(&ccs->missed[rw].nr_missed);
28447caa4715STejun Heo
28455e124f74STejun Heo local64_add(rq_wait_ns, &ccs->rq_wait_ns);
28465e124f74STejun Heo
28475e124f74STejun Heo put_cpu_ptr(ccs);
28487caa4715STejun Heo }
28497caa4715STejun Heo
28507caa4715STejun Heo static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
28517caa4715STejun Heo {
28527caa4715STejun Heo struct ioc *ioc = rqos_to_ioc(rqos);
28537caa4715STejun Heo
28547caa4715STejun Heo spin_lock_irq(&ioc->lock);
28557caa4715STejun Heo ioc_refresh_params(ioc, false);
28567caa4715STejun Heo spin_unlock_irq(&ioc->lock);
28577caa4715STejun Heo }
28587caa4715STejun Heo
28597caa4715STejun Heo static void ioc_rqos_exit(struct rq_qos *rqos)
28607caa4715STejun Heo {
28617caa4715STejun Heo struct ioc *ioc = rqos_to_ioc(rqos);
28627caa4715STejun Heo
286340e4996eSChristoph Hellwig blkcg_deactivate_policy(rqos->disk, &blkcg_policy_iocost);
28647caa4715STejun Heo
28657caa4715STejun Heo spin_lock_irq(&ioc->lock);
28667caa4715STejun Heo ioc->running = IOC_STOP;
28677caa4715STejun Heo spin_unlock_irq(&ioc->lock);
28687caa4715STejun Heo
2869292a089dSSteven Rostedt (Google) timer_shutdown_sync(&ioc->timer);
28707caa4715STejun Heo free_percpu(ioc->pcpu_stat);
28717caa4715STejun Heo kfree(ioc);
28727caa4715STejun Heo }
28737caa4715STejun Heo
28743963d84dSChristoph Hellwig static const struct rq_qos_ops ioc_rqos_ops = {
28757caa4715STejun Heo .throttle = ioc_rqos_throttle,
28767caa4715STejun Heo .merge = ioc_rqos_merge,
28777caa4715STejun Heo .done_bio = ioc_rqos_done_bio,
28787caa4715STejun Heo .done = ioc_rqos_done,
28797caa4715STejun Heo .queue_depth_changed = ioc_rqos_queue_depth_changed,
28807caa4715STejun Heo .exit = ioc_rqos_exit,
28817caa4715STejun Heo };
28827caa4715STejun Heo
288357b64554SChristoph Hellwig static int blk_iocost_init(struct gendisk *disk)
28847caa4715STejun Heo {
28857caa4715STejun Heo struct ioc *ioc;
28865e124f74STejun Heo int i, cpu, ret;
28877caa4715STejun Heo
28887caa4715STejun Heo ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
28897caa4715STejun Heo if (!ioc)
28907caa4715STejun Heo return -ENOMEM;
28917caa4715STejun Heo
28927caa4715STejun Heo ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
28937caa4715STejun Heo if (!ioc->pcpu_stat) {
28947caa4715STejun Heo kfree(ioc);
28957caa4715STejun Heo return -ENOMEM;
28967caa4715STejun Heo }
28977caa4715STejun Heo
28985e124f74STejun Heo for_each_possible_cpu(cpu) {
28995e124f74STejun Heo struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
29005e124f74STejun Heo
29015e124f74STejun Heo for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
29025e124f74STejun Heo local_set(&ccs->missed[i].nr_met, 0);
29035e124f74STejun Heo local_set(&ccs->missed[i].nr_missed, 0);
29045e124f74STejun Heo }
29055e124f74STejun Heo local64_set(&ccs->rq_wait_ns, 0);
29065e124f74STejun Heo }
29075e124f74STejun Heo
29087caa4715STejun Heo spin_lock_init(&ioc->lock);
29097caa4715STejun Heo timer_setup(&ioc->timer, ioc_timer_fn, 0);
29107caa4715STejun Heo INIT_LIST_HEAD(&ioc->active_iocgs);
29117caa4715STejun Heo
29127caa4715STejun Heo ioc->running = IOC_IDLE;
2913ac33e91eSTejun Heo ioc->vtime_base_rate = VTIME_PER_USEC;
29147caa4715STejun Heo atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
291567b7b641SAhmed S. Darwish seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
29167caa4715STejun Heo ioc->period_at = ktime_to_us(ktime_get());
29177caa4715STejun Heo atomic64_set(&ioc->cur_period, 0);
29187caa4715STejun Heo atomic_set(&ioc->hweight_gen, 0);
29197caa4715STejun Heo
29207caa4715STejun Heo spin_lock_irq(&ioc->lock);
29217caa4715STejun Heo ioc->autop_idx = AUTOP_INVALID;
2922e33b9365SBreno Leitao ioc_refresh_params_disk(ioc, true, disk);
29237caa4715STejun Heo spin_unlock_irq(&ioc->lock);
29247caa4715STejun Heo
2925d16baa3fSTejun Heo /*
29267a88b1a8SKemeng Shi * rqos must be added before activation to allow ioc_pd_init() to
2927d16baa3fSTejun Heo * look up the ioc from q. This means that the rqos methods may get
2928d16baa3fSTejun Heo * called before policy activation completes, so they can't assume that
2929d16baa3fSTejun Heo * the target bio has an iocg associated and must test for a NULL iocg.
2930d16baa3fSTejun Heo */
2931ce57b558SChristoph Hellwig ret = rq_qos_add(&ioc->rqos, disk, RQ_QOS_COST, &ioc_rqos_ops);
293214a6e2ebSJinke Han if (ret)
293314a6e2ebSJinke Han goto err_free_ioc;
293414a6e2ebSJinke Han
293540e4996eSChristoph Hellwig ret = blkcg_activate_policy(disk, &blkcg_policy_iocost);
293614a6e2ebSJinke Han if (ret)
293714a6e2ebSJinke Han goto err_del_qos;
293814a6e2ebSJinke Han return 0;
293914a6e2ebSJinke Han
294014a6e2ebSJinke Han err_del_qos:
2941ce57b558SChristoph Hellwig rq_qos_del(&ioc->rqos);
294214a6e2ebSJinke Han err_free_ioc:
29433532e722STejun Heo free_percpu(ioc->pcpu_stat);
29447caa4715STejun Heo kfree(ioc);
29457caa4715STejun Heo return ret;
29467caa4715STejun Heo }
29477caa4715STejun Heo
29487caa4715STejun Heo static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
29497caa4715STejun Heo {
29507caa4715STejun Heo struct ioc_cgrp *iocc;
29517caa4715STejun Heo
29527caa4715STejun Heo iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2953e916ad29STejun Heo if (!iocc)
2954e916ad29STejun Heo return NULL;
29557caa4715STejun Heo
2956bd0adb91STejun Heo iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
29577caa4715STejun Heo return &iocc->cpd;
29587caa4715STejun Heo }
29597caa4715STejun Heo
29607caa4715STejun Heo static void ioc_cpd_free(struct blkcg_policy_data *cpd)
29617caa4715STejun Heo {
29627caa4715STejun Heo kfree(container_of(cpd, struct ioc_cgrp, cpd));
29637caa4715STejun Heo }
29647caa4715STejun Heo
29650a0b4f79SChristoph Hellwig static struct blkg_policy_data *ioc_pd_alloc(struct gendisk *disk,
29660a0b4f79SChristoph Hellwig struct blkcg *blkcg, gfp_t gfp)
29677caa4715STejun Heo {
29687caa4715STejun Heo int levels = blkcg->css.cgroup->level + 1;
29697caa4715STejun Heo struct ioc_gq *iocg;
29707caa4715STejun Heo
29710a0b4f79SChristoph Hellwig iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp,
29720a0b4f79SChristoph Hellwig disk->node_id);
29737caa4715STejun Heo if (!iocg)
29747caa4715STejun Heo return NULL;
29757caa4715STejun Heo
297697eb1975STejun Heo iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
297797eb1975STejun Heo if (!iocg->pcpu_stat) {
297897eb1975STejun Heo kfree(iocg);
297997eb1975STejun Heo return NULL;
298097eb1975STejun Heo }
298197eb1975STejun Heo
29827caa4715STejun Heo return &iocg->pd;
29837caa4715STejun Heo }
29847caa4715STejun Heo
29857caa4715STejun Heo static void ioc_pd_init(struct blkg_policy_data *pd)
29867caa4715STejun Heo {
29877caa4715STejun Heo struct ioc_gq *iocg = pd_to_iocg(pd);
29887caa4715STejun Heo struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2989a06377c5SChristoph Hellwig struct ioc *ioc = q_to_ioc(blkg->q);
29907caa4715STejun Heo struct ioc_now now;
29917caa4715STejun Heo struct blkcg_gq *tblkg;
29927caa4715STejun Heo unsigned long flags;
29937caa4715STejun Heo
29947caa4715STejun Heo ioc_now(ioc, &now);
29957caa4715STejun Heo
29967caa4715STejun Heo iocg->ioc = ioc;
29977caa4715STejun Heo atomic64_set(&iocg->vtime, now.vnow);
29987caa4715STejun Heo atomic64_set(&iocg->done_vtime, now.vnow);
29997caa4715STejun Heo atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
30007caa4715STejun Heo INIT_LIST_HEAD(&iocg->active_list);
300197eb1975STejun Heo INIT_LIST_HEAD(&iocg->walk_list);
30028692d2dbSTejun Heo INIT_LIST_HEAD(&iocg->surplus_list);
3003fe20cdb5STejun Heo iocg->hweight_active = WEIGHT_ONE;
3004fe20cdb5STejun Heo iocg->hweight_inuse = WEIGHT_ONE;
30057caa4715STejun Heo
30067caa4715STejun Heo init_waitqueue_head(&iocg->waitq);
30077caa4715STejun Heo hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
30087caa4715STejun Heo iocg->waitq_timer.function = iocg_waitq_timer_fn;
30097caa4715STejun Heo
30107caa4715STejun Heo iocg->level = blkg->blkcg->css.cgroup->level;
30117caa4715STejun Heo
30127caa4715STejun Heo for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
30137caa4715STejun Heo struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
30147caa4715STejun Heo iocg->ancestors[tiocg->level] = tiocg;
30157caa4715STejun Heo }
30167caa4715STejun Heo
30177caa4715STejun Heo spin_lock_irqsave(&ioc->lock, flags);
3018b0853ab4STejun Heo weight_updated(iocg, &now);
30197caa4715STejun Heo spin_unlock_irqrestore(&ioc->lock, flags);
30207caa4715STejun Heo }
30217caa4715STejun Heo
30227caa4715STejun Heo static void ioc_pd_free(struct blkg_policy_data *pd)
30237caa4715STejun Heo {
30247caa4715STejun Heo struct ioc_gq *iocg = pd_to_iocg(pd);
30257caa4715STejun Heo struct ioc *ioc = iocg->ioc;
30265aeac7c4STejun Heo unsigned long flags;
30277caa4715STejun Heo
30287caa4715STejun Heo if (ioc) {
30295aeac7c4STejun Heo spin_lock_irqsave(&ioc->lock, flags);
303097eb1975STejun Heo
30317caa4715STejun Heo if (!list_empty(&iocg->active_list)) {
3032b0853ab4STejun Heo struct ioc_now now;
3033b0853ab4STejun Heo
3034b0853ab4STejun Heo ioc_now(ioc, &now);
3035b0853ab4STejun Heo propagate_weights(iocg, 0, 0, false, &now);
30367caa4715STejun Heo list_del_init(&iocg->active_list);
30377caa4715STejun Heo }
303897eb1975STejun Heo
303997eb1975STejun Heo WARN_ON_ONCE(!list_empty(&iocg->walk_list));
30408692d2dbSTejun Heo WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
304197eb1975STejun Heo
30425aeac7c4STejun Heo spin_unlock_irqrestore(&ioc->lock, flags);
3043e036c4caSTejun Heo
3044e036c4caSTejun Heo hrtimer_cancel(&iocg->waitq_timer);
30457caa4715STejun Heo }
304697eb1975STejun Heo free_percpu(iocg->pcpu_stat);
30477caa4715STejun Heo kfree(iocg);
30487caa4715STejun Heo }
30497caa4715STejun Heo
30503607849dSWolfgang Bumiller static void ioc_pd_stat(struct blkg_policy_data *pd, struct seq_file *s)
305197eb1975STejun Heo {
305297eb1975STejun Heo struct ioc_gq *iocg = pd_to_iocg(pd);
305397eb1975STejun Heo struct ioc *ioc = iocg->ioc;
305497eb1975STejun Heo
305597eb1975STejun Heo if (!ioc->enabled)
30563607849dSWolfgang Bumiller return;
305797eb1975STejun Heo
305897eb1975STejun Heo if (iocg->level == 0) {
305997eb1975STejun Heo unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
3060ac33e91eSTejun Heo ioc->vtime_base_rate * 10000,
306197eb1975STejun Heo VTIME_PER_USEC);
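/*
 * e.g. a vtime_base_rate running at 1.5x VTIME_PER_USEC rounds to
 * vp10k == 15000 and prints "cost.vrate=150.00".
 */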
3062252c651aSChristoph Hellwig seq_printf(s, " cost.vrate=%u.%02u", vp10k / 100, vp10k % 100);
306397eb1975STejun Heo }
306497eb1975STejun Heo
3065252c651aSChristoph Hellwig seq_printf(s, " cost.usage=%llu", iocg->last_stat.usage_us);
306697eb1975STejun Heo
3067f0bf84a5STejun Heo if (blkcg_debug_stats)
3068252c651aSChristoph Hellwig seq_printf(s, " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
3069f0bf84a5STejun Heo iocg->last_stat.wait_us,
3070f0bf84a5STejun Heo iocg->last_stat.indebt_us,
3071f0bf84a5STejun Heo iocg->last_stat.indelay_us);
307297eb1975STejun Heo }
307397eb1975STejun Heo
30747caa4715STejun Heo static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
30757caa4715STejun Heo int off)
30767caa4715STejun Heo {
30777caa4715STejun Heo const char *dname = blkg_dev_name(pd->blkg);
30787caa4715STejun Heo struct ioc_gq *iocg = pd_to_iocg(pd);
30797caa4715STejun Heo
30807caa4715STejun Heo if (dname && iocg->cfg_weight)
3081bd0adb91STejun Heo seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
30827caa4715STejun Heo return 0;
30837caa4715STejun Heo }
30847caa4715STejun Heo
30867caa4715STejun Heo static int ioc_weight_show(struct seq_file *sf, void *v)
30877caa4715STejun Heo {
30887caa4715STejun Heo struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
30897caa4715STejun Heo struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
30907caa4715STejun Heo
3091bd0adb91STejun Heo seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
30927caa4715STejun Heo blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
30937caa4715STejun Heo &blkcg_policy_iocost, seq_cft(sf)->private, false);
30947caa4715STejun Heo return 0;
30957caa4715STejun Heo }
30967caa4715STejun Heo
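/*
 * Example "io.weight" writes (a sketch of the accepted formats; the 8:16
 * device numbers are illustrative):
 *
 *	echo "default 200" > io.weight	- set the per-cgroup default
 *	echo "8:16 50" > io.weight	- override for one device
 *	echo "8:16 default" > io.weight	- revert the device to the default
 */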
30977caa4715STejun Heo static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
30987caa4715STejun Heo size_t nbytes, loff_t off)
30997caa4715STejun Heo {
31007caa4715STejun Heo struct blkcg *blkcg = css_to_blkcg(of_css(of));
31017caa4715STejun Heo struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
31027caa4715STejun Heo struct blkg_conf_ctx ctx;
3103b0853ab4STejun Heo struct ioc_now now;
31047caa4715STejun Heo struct ioc_gq *iocg;
31057caa4715STejun Heo u32 v;
31067caa4715STejun Heo int ret;
31077caa4715STejun Heo
31087caa4715STejun Heo if (!strchr(buf, ':')) {
31097caa4715STejun Heo struct blkcg_gq *blkg;
31107caa4715STejun Heo
31117caa4715STejun Heo if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
31127caa4715STejun Heo return -EINVAL;
31137caa4715STejun Heo
31147caa4715STejun Heo if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
31157caa4715STejun Heo return -EINVAL;
31167caa4715STejun Heo
311711431e26SMing Lei spin_lock_irq(&blkcg->lock);
3118bd0adb91STejun Heo iocc->dfl_weight = v * WEIGHT_ONE;
31197caa4715STejun Heo hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
31207caa4715STejun Heo struct ioc_gq *iocg = blkg_to_iocg(blkg);
31217caa4715STejun Heo
31227caa4715STejun Heo if (iocg) {
312311431e26SMing Lei spin_lock(&iocg->ioc->lock);
3124b0853ab4STejun Heo ioc_now(iocg->ioc, &now);
3125b0853ab4STejun Heo weight_updated(iocg, &now);
312611431e26SMing Lei spin_unlock(&iocg->ioc->lock);
31277caa4715STejun Heo }
31287caa4715STejun Heo }
312911431e26SMing Lei spin_unlock_irq(&blkcg->lock);
31307caa4715STejun Heo
31317caa4715STejun Heo return nbytes;
31327caa4715STejun Heo }
31337caa4715STejun Heo
3134faffaab2STejun Heo blkg_conf_init(&ctx, buf);
3135faffaab2STejun Heo
3136faffaab2STejun Heo ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, &ctx);
31377caa4715STejun Heo if (ret)
3138faffaab2STejun Heo goto err;
31397caa4715STejun Heo
31407caa4715STejun Heo iocg = blkg_to_iocg(ctx.blkg);
31417caa4715STejun Heo
31427caa4715STejun Heo if (!strncmp(ctx.body, "default", 7)) {
31437caa4715STejun Heo v = 0;
31447caa4715STejun Heo } else {
31457caa4715STejun Heo if (!sscanf(ctx.body, "%u", &v))
31467caa4715STejun Heo goto einval;
31477caa4715STejun Heo if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
31487caa4715STejun Heo goto einval;
31497caa4715STejun Heo }
31507caa4715STejun Heo
315141591a51SDan Carpenter spin_lock(&iocg->ioc->lock);
3152bd0adb91STejun Heo iocg->cfg_weight = v * WEIGHT_ONE;
3153b0853ab4STejun Heo ioc_now(iocg->ioc, &now);
3154b0853ab4STejun Heo weight_updated(iocg, &now);
315541591a51SDan Carpenter spin_unlock(&iocg->ioc->lock);
31567caa4715STejun Heo
3157faffaab2STejun Heo blkg_conf_exit(&ctx);
31587caa4715STejun Heo return nbytes;
31597caa4715STejun Heo
31607caa4715STejun Heo einval:
3161faffaab2STejun Heo ret = -EINVAL;
3162faffaab2STejun Heo err:
3163faffaab2STejun Heo blkg_conf_exit(&ctx);
3164faffaab2STejun Heo return ret;
31657caa4715STejun Heo }
31667caa4715STejun Heo
31677caa4715STejun Heo static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
31687caa4715STejun Heo int off)
31697caa4715STejun Heo {
31707caa4715STejun Heo const char *dname = blkg_dev_name(pd->blkg);
31717caa4715STejun Heo struct ioc *ioc = pd_to_iocg(pd)->ioc;
31727caa4715STejun Heo
31737caa4715STejun Heo if (!dname)
31747caa4715STejun Heo return 0;
31757caa4715STejun Heo
317635198e32SYu Kuai spin_lock_irq(&ioc->lock);
31777caa4715STejun Heo seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
31787caa4715STejun Heo dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
31797caa4715STejun Heo ioc->params.qos[QOS_RPPM] / 10000,
31807caa4715STejun Heo ioc->params.qos[QOS_RPPM] % 10000 / 100,
31817caa4715STejun Heo ioc->params.qos[QOS_RLAT],
31827caa4715STejun Heo ioc->params.qos[QOS_WPPM] / 10000,
31837caa4715STejun Heo ioc->params.qos[QOS_WPPM] % 10000 / 100,
31847caa4715STejun Heo ioc->params.qos[QOS_WLAT],
31857caa4715STejun Heo ioc->params.qos[QOS_MIN] / 10000,
31867caa4715STejun Heo ioc->params.qos[QOS_MIN] % 10000 / 100,
31877caa4715STejun Heo ioc->params.qos[QOS_MAX] / 10000,
31887caa4715STejun Heo ioc->params.qos[QOS_MAX] % 10000 / 100);
318935198e32SYu Kuai spin_unlock_irq(&ioc->lock);
31907caa4715STejun Heo return 0;
31917caa4715STejun Heo }
31927caa4715STejun Heo
31937caa4715STejun Heo static int ioc_qos_show(struct seq_file *sf, void *v)
31947caa4715STejun Heo {
31957caa4715STejun Heo struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
31967caa4715STejun Heo
31977caa4715STejun Heo blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
31987caa4715STejun Heo &blkcg_policy_iocost, seq_cft(sf)->private, false);
31997caa4715STejun Heo return 0;
32007caa4715STejun Heo }
32017caa4715STejun Heo
32027caa4715STejun Heo static const match_table_t qos_ctrl_tokens = {
32037caa4715STejun Heo { QOS_ENABLE, "enable=%u" },
32047caa4715STejun Heo { QOS_CTRL, "ctrl=%s" },
32057caa4715STejun Heo { NR_QOS_CTRL_PARAMS, NULL },
32067caa4715STejun Heo };
32077caa4715STejun Heo
32087caa4715STejun Heo static const match_table_t qos_tokens = {
32097caa4715STejun Heo { QOS_RPPM, "rpct=%s" },
32107caa4715STejun Heo { QOS_RLAT, "rlat=%u" },
32117caa4715STejun Heo { QOS_WPPM, "wpct=%s" },
32127caa4715STejun Heo { QOS_WLAT, "wlat=%u" },
32137caa4715STejun Heo { QOS_MIN, "min=%s" },
32147caa4715STejun Heo { QOS_MAX, "max=%s" },
32157caa4715STejun Heo { NR_QOS_PARAMS, NULL },
32167caa4715STejun Heo };
32177caa4715STejun Heo
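/*
 * Example "io.cost.qos" write enabling the controller with user QoS
 * targets (a sketch; the 8:16 device numbers and values are illustrative):
 *
 *	echo "8:16 enable=1 ctrl=user rpct=95.00 rlat=5000 wpct=95.00 wlat=5000 min=50.00 max=150.00" > io.cost.qos
 */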
32187caa4715STejun Heo static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
32197caa4715STejun Heo size_t nbytes, loff_t off)
32207caa4715STejun Heo {
3221faffaab2STejun Heo struct blkg_conf_ctx ctx;
32223657647eSChristoph Hellwig struct gendisk *disk;
32237caa4715STejun Heo struct ioc *ioc;
32247caa4715STejun Heo u32 qos[NR_QOS_PARAMS];
32257caa4715STejun Heo bool enable, user;
3226faffaab2STejun Heo char *body, *p;
32277caa4715STejun Heo int ret;
32287caa4715STejun Heo
3229faffaab2STejun Heo blkg_conf_init(&ctx, input);
32307caa4715STejun Heo
3231faffaab2STejun Heo ret = blkg_conf_open_bdev(&ctx);
3232faffaab2STejun Heo if (ret)
3233faffaab2STejun Heo goto err;
3234faffaab2STejun Heo
3235faffaab2STejun Heo body = ctx.body;
3236faffaab2STejun Heo disk = ctx.bdev->bd_disk;
3237235a5a83SYu Kuai if (!queue_is_mq(disk->queue)) {
3238235a5a83SYu Kuai ret = -EOPNOTSUPP;
3239235a5a83SYu Kuai goto err;
3240235a5a83SYu Kuai }
3241235a5a83SYu Kuai
32423657647eSChristoph Hellwig ioc = q_to_ioc(disk->queue);
32437caa4715STejun Heo if (!ioc) {
32443657647eSChristoph Hellwig ret = blk_iocost_init(disk);
32457caa4715STejun Heo if (ret)
32467caa4715STejun Heo goto err;
32473657647eSChristoph Hellwig ioc = q_to_ioc(disk->queue);
32487caa4715STejun Heo }
32497caa4715STejun Heo
32502b2da2f6SYu Kuai blk_mq_freeze_queue(disk->queue);
32512b2da2f6SYu Kuai blk_mq_quiesce_queue(disk->queue);
32522b2da2f6SYu Kuai
32537caa4715STejun Heo spin_lock_irq(&ioc->lock);
32547caa4715STejun Heo memcpy(qos, ioc->params.qos, sizeof(qos));
32557caa4715STejun Heo enable = ioc->enabled;
32567caa4715STejun Heo user = ioc->user_qos_params;
32577caa4715STejun Heo
3258faffaab2STejun Heo while ((p = strsep(&body, " \t\n"))) {
32597caa4715STejun Heo substring_t args[MAX_OPT_ARGS];
32607caa4715STejun Heo char buf[32];
32617caa4715STejun Heo int tok;
32627caa4715STejun Heo s64 v;
32637caa4715STejun Heo
32647caa4715STejun Heo if (!*p)
32657caa4715STejun Heo continue;
32667caa4715STejun Heo
32677caa4715STejun Heo switch (match_token(p, qos_ctrl_tokens, args)) {
32687caa4715STejun Heo case QOS_ENABLE:
32697b7c5ae4SYu Kuai if (match_u64(&args[0], &v))
32707b7c5ae4SYu Kuai goto einval;
32717caa4715STejun Heo enable = v;
32727caa4715STejun Heo continue;
32737caa4715STejun Heo case QOS_CTRL:
32747caa4715STejun Heo match_strlcpy(buf, &args[0], sizeof(buf));
32757caa4715STejun Heo if (!strcmp(buf, "auto"))
32767caa4715STejun Heo user = false;
32777caa4715STejun Heo else if (!strcmp(buf, "user"))
32787caa4715STejun Heo user = true;
32797caa4715STejun Heo else
32807caa4715STejun Heo goto einval;
32817caa4715STejun Heo continue;
32827caa4715STejun Heo }
32837caa4715STejun Heo
32847caa4715STejun Heo tok = match_token(p, qos_tokens, args);
32857caa4715STejun Heo switch (tok) {
32867caa4715STejun Heo case QOS_RPPM:
32877caa4715STejun Heo case QOS_WPPM:
32887caa4715STejun Heo if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
32897caa4715STejun Heo sizeof(buf))
32907caa4715STejun Heo goto einval;
32917caa4715STejun Heo if (cgroup_parse_float(buf, 2, &v))
32927caa4715STejun Heo goto einval;
32937caa4715STejun Heo if (v < 0 || v > 10000)
32947caa4715STejun Heo goto einval;
32957caa4715STejun Heo qos[tok] = v * 100;
32967caa4715STejun Heo break;
32977caa4715STejun Heo case QOS_RLAT:
32987caa4715STejun Heo case QOS_WLAT:
32997caa4715STejun Heo if (match_u64(&args[0], &v))
33007caa4715STejun Heo goto einval;
33017caa4715STejun Heo qos[tok] = v;
33027caa4715STejun Heo break;
33037caa4715STejun Heo case QOS_MIN:
33047caa4715STejun Heo case QOS_MAX:
33057caa4715STejun Heo if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
33067caa4715STejun Heo sizeof(buf))
33077caa4715STejun Heo goto einval;
33087caa4715STejun Heo if (cgroup_parse_float(buf, 2, &v))
33097caa4715STejun Heo goto einval;
33107caa4715STejun Heo if (v < 0)
33117caa4715STejun Heo goto einval;
33127caa4715STejun Heo qos[tok] = clamp_t(s64, v * 100,
33137caa4715STejun Heo VRATE_MIN_PPM, VRATE_MAX_PPM);
33147caa4715STejun Heo break;
33157caa4715STejun Heo default:
33167caa4715STejun Heo goto einval;
33177caa4715STejun Heo }
33187caa4715STejun Heo user = true;
33197caa4715STejun Heo }
33207caa4715STejun Heo
33217caa4715STejun Heo if (qos[QOS_MIN] > qos[QOS_MAX])
33227caa4715STejun Heo goto einval;
33237caa4715STejun Heo
3324f099a108SChengming Zhou if (enable && !ioc->enabled) {
33253657647eSChristoph Hellwig blk_stat_enable_accounting(disk->queue);
33263657647eSChristoph Hellwig blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
33277caa4715STejun Heo ioc->enabled = true;
3328f099a108SChengming Zhou } else if (!enable && ioc->enabled) {
3329f099a108SChengming Zhou blk_stat_disable_accounting(disk->queue);
33303657647eSChristoph Hellwig blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, disk->queue);
33317caa4715STejun Heo ioc->enabled = false;
33327caa4715STejun Heo }
33337caa4715STejun Heo
33347caa4715STejun Heo if (user) {
33357caa4715STejun Heo memcpy(ioc->params.qos, qos, sizeof(qos));
33367caa4715STejun Heo ioc->user_qos_params = true;
33377caa4715STejun Heo } else {
33387caa4715STejun Heo ioc->user_qos_params = false;
33397caa4715STejun Heo }
33407caa4715STejun Heo
33417caa4715STejun Heo ioc_refresh_params(ioc, true);
33427caa4715STejun Heo spin_unlock_irq(&ioc->lock);
33437caa4715STejun Heo
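	/*
	 * wbt and iocost both throttle based on completion latency;
	 * keep only one of the two mechanisms active at a time.
	 */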
3344eebc21d1SYu Kuai if (enable)
3345eebc21d1SYu Kuai wbt_disable_default(disk);
3346eebc21d1SYu Kuai else
3347eebc21d1SYu Kuai wbt_enable_default(disk);
3348eebc21d1SYu Kuai
33492b2da2f6SYu Kuai blk_mq_unquiesce_queue(disk->queue);
33502b2da2f6SYu Kuai blk_mq_unfreeze_queue(disk->queue);
33512b2da2f6SYu Kuai
3352faffaab2STejun Heo blkg_conf_exit(&ctx);
33537caa4715STejun Heo return nbytes;
33547caa4715STejun Heo einval:
33552c064798SYu Kuai spin_unlock_irq(&ioc->lock);
33562b2da2f6SYu Kuai
33572b2da2f6SYu Kuai blk_mq_unquiesce_queue(disk->queue);
33582b2da2f6SYu Kuai blk_mq_unfreeze_queue(disk->queue);
33592b2da2f6SYu Kuai
33607caa4715STejun Heo ret = -EINVAL;
33617caa4715STejun Heo err:
3362faffaab2STejun Heo blkg_conf_exit(&ctx);
33637caa4715STejun Heo return ret;
33647caa4715STejun Heo }
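
/*
 * Example (a sketch, not part of this file): the knobs parsed by
 * ioc_qos_write() are driven from userspace through the root cgroup's
 * io.cost.qos file, using the qos_tokens keys defined earlier in this
 * file; e.g. for a hypothetical device 8:0:
 *
 *   # echo "8:0 enable=1 ctrl=user rpct=95.00 rlat=5000 \
 *       wpct=95.00 wlat=5000 min=50.00 max=150.00" \
 *       > /sys/fs/cgroup/io.cost.qos
 *
 * The percentage values go through cgroup_parse_float() with two
 * decimals and are stored as parts-per-million (hence "v * 100"
 * above); min/max are further clamped to [VRATE_MIN_PPM,
 * VRATE_MAX_PPM].
 */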
33657caa4715STejun Heo
33667caa4715STejun Heo static u64 ioc_cost_model_prfill(struct seq_file *sf,
33677caa4715STejun Heo struct blkg_policy_data *pd, int off)
33687caa4715STejun Heo {
33697caa4715STejun Heo const char *dname = blkg_dev_name(pd->blkg);
33707caa4715STejun Heo struct ioc *ioc = pd_to_iocg(pd)->ioc;
33717caa4715STejun Heo u64 *u = ioc->params.i_lcoefs;
33727caa4715STejun Heo
33737caa4715STejun Heo if (!dname)
33747caa4715STejun Heo return 0;
33757caa4715STejun Heo
337635198e32SYu Kuai spin_lock_irq(&ioc->lock);
33777caa4715STejun Heo seq_printf(sf, "%s ctrl=%s model=linear "
33787caa4715STejun Heo "rbps=%llu rseqiops=%llu rrandiops=%llu "
33797caa4715STejun Heo "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
33807caa4715STejun Heo dname, ioc->user_cost_model ? "user" : "auto",
33817caa4715STejun Heo u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
33827caa4715STejun Heo u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
338335198e32SYu Kuai spin_unlock_irq(&ioc->lock);
33847caa4715STejun Heo return 0;
33857caa4715STejun Heo }
33867caa4715STejun Heo
33877caa4715STejun Heo static int ioc_cost_model_show(struct seq_file *sf, void *v)
33887caa4715STejun Heo {
33897caa4715STejun Heo struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
33907caa4715STejun Heo
33917caa4715STejun Heo blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
33927caa4715STejun Heo &blkcg_policy_iocost, seq_cft(sf)->private, false);
33937caa4715STejun Heo return 0;
33947caa4715STejun Heo }
33957caa4715STejun Heo
33967caa4715STejun Heo static const match_table_t cost_ctrl_tokens = {
33977caa4715STejun Heo { COST_CTRL, "ctrl=%s" },
33987caa4715STejun Heo { COST_MODEL, "model=%s" },
33997caa4715STejun Heo { NR_COST_CTRL_PARAMS, NULL },
34007caa4715STejun Heo };
34017caa4715STejun Heo
34027caa4715STejun Heo static const match_table_t i_lcoef_tokens = {
34037caa4715STejun Heo { I_LCOEF_RBPS, "rbps=%u" },
34047caa4715STejun Heo { I_LCOEF_RSEQIOPS, "rseqiops=%u" },
34057caa4715STejun Heo { I_LCOEF_RRANDIOPS, "rrandiops=%u" },
34067caa4715STejun Heo { I_LCOEF_WBPS, "wbps=%u" },
34077caa4715STejun Heo { I_LCOEF_WSEQIOPS, "wseqiops=%u" },
34087caa4715STejun Heo { I_LCOEF_WRANDIOPS, "wrandiops=%u" },
34097caa4715STejun Heo { NR_I_LCOEFS, NULL },
34107caa4715STejun Heo };
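
/*
 * Note: the %u patterns above only delimit the digit runs for
 * match_token(); the values themselves are parsed as 64bit via
 * match_u64() in ioc_cost_model_write() below.
 */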
34117caa4715STejun Heo
34127caa4715STejun Heo static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
34137caa4715STejun Heo size_t nbytes, loff_t off)
34147caa4715STejun Heo {
3415faffaab2STejun Heo struct blkg_conf_ctx ctx;
34162b2da2f6SYu Kuai struct request_queue *q;
34177caa4715STejun Heo struct ioc *ioc;
34187caa4715STejun Heo u64 u[NR_I_LCOEFS];
34197caa4715STejun Heo bool user;
3420faffaab2STejun Heo char *body, *p;
34217caa4715STejun Heo int ret;
34227caa4715STejun Heo
3423faffaab2STejun Heo blkg_conf_init(&ctx, input);
34247caa4715STejun Heo
3425faffaab2STejun Heo ret = blkg_conf_open_bdev(&ctx);
3426faffaab2STejun Heo if (ret)
3427faffaab2STejun Heo goto err;
3428faffaab2STejun Heo
3429faffaab2STejun Heo body = ctx.body;
3430faffaab2STejun Heo q = bdev_get_queue(ctx.bdev);
3431235a5a83SYu Kuai if (!queue_is_mq(q)) {
3432235a5a83SYu Kuai ret = -EOPNOTSUPP;
3433235a5a83SYu Kuai goto err;
3434235a5a83SYu Kuai }
3435235a5a83SYu Kuai
34362b2da2f6SYu Kuai ioc = q_to_ioc(q);
34377caa4715STejun Heo if (!ioc) {
3438faffaab2STejun Heo ret = blk_iocost_init(ctx.bdev->bd_disk);
34397caa4715STejun Heo if (ret)
34407caa4715STejun Heo goto err;
34412b2da2f6SYu Kuai ioc = q_to_ioc(q);
34427caa4715STejun Heo }
34437caa4715STejun Heo
34442b2da2f6SYu Kuai blk_mq_freeze_queue(q);
34452b2da2f6SYu Kuai blk_mq_quiesce_queue(q);
34462b2da2f6SYu Kuai
34477caa4715STejun Heo spin_lock_irq(&ioc->lock);
34487caa4715STejun Heo memcpy(u, ioc->params.i_lcoefs, sizeof(u));
34497caa4715STejun Heo user = ioc->user_cost_model;
34507caa4715STejun Heo
3451faffaab2STejun Heo while ((p = strsep(&body, " \t\n"))) {
34527caa4715STejun Heo substring_t args[MAX_OPT_ARGS];
34537caa4715STejun Heo char buf[32];
34547caa4715STejun Heo int tok;
34557caa4715STejun Heo u64 v;
34567caa4715STejun Heo
34577caa4715STejun Heo if (!*p)
34587caa4715STejun Heo continue;
34597caa4715STejun Heo
34607caa4715STejun Heo switch (match_token(p, cost_ctrl_tokens, args)) {
34617caa4715STejun Heo case COST_CTRL:
34627caa4715STejun Heo match_strlcpy(buf, &args[0], sizeof(buf));
34637caa4715STejun Heo if (!strcmp(buf, "auto"))
34647caa4715STejun Heo user = false;
34657caa4715STejun Heo else if (!strcmp(buf, "user"))
34667caa4715STejun Heo user = true;
34677caa4715STejun Heo else
34687caa4715STejun Heo goto einval;
34697caa4715STejun Heo continue;
34707caa4715STejun Heo case COST_MODEL:
34717caa4715STejun Heo match_strlcpy(buf, &args[0], sizeof(buf));
34727caa4715STejun Heo if (strcmp(buf, "linear"))
34737caa4715STejun Heo goto einval;
34747caa4715STejun Heo continue;
34757caa4715STejun Heo }
34767caa4715STejun Heo
34777caa4715STejun Heo tok = match_token(p, i_lcoef_tokens, args);
34787caa4715STejun Heo if (tok == NR_I_LCOEFS)
34797caa4715STejun Heo goto einval;
34807caa4715STejun Heo if (match_u64(&args[0], &v))
34817caa4715STejun Heo goto einval;
34827caa4715STejun Heo u[tok] = v;
34837caa4715STejun Heo user = true;
34847caa4715STejun Heo }
34857caa4715STejun Heo
34867caa4715STejun Heo if (user) {
34877caa4715STejun Heo memcpy(ioc->params.i_lcoefs, u, sizeof(u));
34887caa4715STejun Heo ioc->user_cost_model = true;
34897caa4715STejun Heo } else {
34907caa4715STejun Heo ioc->user_cost_model = false;
34917caa4715STejun Heo }
34927caa4715STejun Heo ioc_refresh_params(ioc, true);
34937caa4715STejun Heo spin_unlock_irq(&ioc->lock);
34947caa4715STejun Heo
34952b2da2f6SYu Kuai blk_mq_unquiesce_queue(q);
34962b2da2f6SYu Kuai blk_mq_unfreeze_queue(q);
34972b2da2f6SYu Kuai
3498faffaab2STejun Heo blkg_conf_exit(&ctx);
34997caa4715STejun Heo return nbytes;
35007caa4715STejun Heo
35017caa4715STejun Heo einval:
35022c064798SYu Kuai spin_unlock_irq(&ioc->lock);
35032b2da2f6SYu Kuai
35042b2da2f6SYu Kuai blk_mq_unquiesce_queue(q);
35052b2da2f6SYu Kuai blk_mq_unfreeze_queue(q);
35062b2da2f6SYu Kuai
35077caa4715STejun Heo ret = -EINVAL;
35087caa4715STejun Heo err:
3509faffaab2STejun Heo blkg_conf_exit(&ctx);
35107caa4715STejun Heo return ret;
35117caa4715STejun Heo }
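
/*
 * Example (a sketch with illustrative coefficient values): the linear
 * model parameters can be overridden per device through the root
 * cgroup, using the i_lcoef_tokens keys above; e.g. for a hypothetical
 * device 8:0:
 *
 *   # echo "8:0 ctrl=user model=linear rbps=2706339840 \
 *       rseqiops=89698 rrandiops=110036 wbps=1063126016 \
 *       wseqiops=135560 wrandiops=130734" \
 *       > /sys/fs/cgroup/io.cost.model
 *
 * Writing ctrl=auto reverts to the builtin device parameters on the
 * next ioc_refresh_params().
 */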
35127caa4715STejun Heo
35137caa4715STejun Heo static struct cftype ioc_files[] = {
35147caa4715STejun Heo {
35157caa4715STejun Heo .name = "weight",
35167caa4715STejun Heo .flags = CFTYPE_NOT_ON_ROOT,
35177caa4715STejun Heo .seq_show = ioc_weight_show,
35187caa4715STejun Heo .write = ioc_weight_write,
35197caa4715STejun Heo },
35207caa4715STejun Heo {
35217caa4715STejun Heo .name = "cost.qos",
35227caa4715STejun Heo .flags = CFTYPE_ONLY_ON_ROOT,
35237caa4715STejun Heo .seq_show = ioc_qos_show,
35247caa4715STejun Heo .write = ioc_qos_write,
35257caa4715STejun Heo },
35267caa4715STejun Heo {
35277caa4715STejun Heo .name = "cost.model",
35287caa4715STejun Heo .flags = CFTYPE_ONLY_ON_ROOT,
35297caa4715STejun Heo .seq_show = ioc_cost_model_show,
35307caa4715STejun Heo .write = ioc_cost_model_write,
35317caa4715STejun Heo },
35327caa4715STejun Heo {}
35337caa4715STejun Heo };
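
/*
 * Registered on the cgroup2 "io" controller, so the files above show
 * up as io.weight (non-root cgroups only) and io.cost.qos /
 * io.cost.model (root cgroup only).
 */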
35347caa4715STejun Heo
35357caa4715STejun Heo static struct blkcg_policy blkcg_policy_iocost = {
35367caa4715STejun Heo .dfl_cftypes = ioc_files,
35377caa4715STejun Heo .cpd_alloc_fn = ioc_cpd_alloc,
35387caa4715STejun Heo .cpd_free_fn = ioc_cpd_free,
35397caa4715STejun Heo .pd_alloc_fn = ioc_pd_alloc,
35407caa4715STejun Heo .pd_init_fn = ioc_pd_init,
35417caa4715STejun Heo .pd_free_fn = ioc_pd_free,
354297eb1975STejun Heo .pd_stat_fn = ioc_pd_stat,
35437caa4715STejun Heo };
35447caa4715STejun Heo
35457caa4715STejun Heo static int __init ioc_init(void)
35467caa4715STejun Heo {
35477caa4715STejun Heo return blkcg_policy_register(&blkcg_policy_iocost);
35487caa4715STejun Heo }
35497caa4715STejun Heo
35507caa4715STejun Heo static void __exit ioc_exit(void)
35517caa4715STejun Heo {
3552fa1c3eafSBaolin Wang blkcg_policy_unregister(&blkcg_policy_iocost);
35537caa4715STejun Heo }
35547caa4715STejun Heo
35557caa4715STejun Heo module_init(ioc_init);
35567caa4715STejun Heo module_exit(ioc_exit);