xref: /openbmc/linux/block/blk-iocost.c (revision 5ed132db5ad4f58156ae9d28219396b6f764a9cb)
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * IO cost model based controller.
4  *
5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
7  * Copyright (C) 2019 Facebook
8  *
9  * One challenge of controlling IO resources is the lack of trivially
10  * observable cost metric.  This is distinguished from CPU and memory where
11  * wallclock time and the number of bytes can serve as accurate enough
12  * approximations.
13  *
14  * Bandwidth and iops are the most commonly used metrics for IO devices but
15  * depending on the type and specifics of the device, different IO patterns
16  * easily lead to multiple orders of magnitude variations rendering them
17  * useless for the purpose of IO capacity distribution.  While on-device
18  * time, with a lot of clutches, could serve as a useful approximation for
19  * non-queued rotational devices, this is no longer viable with modern
20  * devices, even the rotational ones.
21  *
22  * While there is no cost metric we can trivially observe, it isn't a
23  * complete mystery.  For example, on a rotational device, seek cost
24  * dominates while a contiguous transfer contributes a smaller amount
25  * proportional to the size.  If we can characterize at least the relative
26  * costs of these different types of IOs, it should be possible to
27  * implement a reasonable work-conserving proportional IO resource
28  * distribution.
29  *
30  * 1. IO Cost Model
31  *
32  * IO cost model estimates the cost of an IO given its basic parameters and
33  * history (e.g. the end sector of the last IO).  The cost is measured in
34  * device time.  If a given IO is estimated to cost 10ms, the device should
35  * be able to process ~100 of those IOs in a second.
36  *
37  * Currently, there's only one builtin cost model - linear.  Each IO is
38  * classified as sequential or random and given a base cost accordingly.
39  * On top of that, a size cost proportional to the length of the IO is
40  * added.  While simple, this model captures the operational
41  * characteristics of a wide varienty of devices well enough.  Default
42  * paramters for several different classes of devices are provided and the
43  * parameters can be configured from userspace via
44  * /sys/fs/cgroup/io.cost.model.
45  *
46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47  * device-specific coefficients.
48  *
49  * 2. Control Strategy
50  *
51  * The device virtual time (vtime) is used as the primary control metric.
52  * The control strategy is composed of the following three parts.
53  *
54  * 2-1. Vtime Distribution
55  *
56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
57  * calculated.  Please consider the following hierarchy where the numbers
58  * inside parentheses denote the configured weights.
59  *
60  *           root
61  *         /       \
62  *      A (w:100)  B (w:300)
63  *      /       \
64  *  A0 (w:100)  A1 (w:100)
65  *
66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
68  * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
69  * 12.5% each.  The distribution mechanism only cares about these flattened
70  * shares.  They're called hweights (hierarchical weights) and always add
71  * upto 1 (WEIGHT_ONE).
72  *
73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75  * against the device vtime - an IO which takes 10ms on the underlying
76  * device is considered to take 80ms on A0.
77  *
78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
80  * the vtime consumed by past IOs and can issue a new IO iff doing so
81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
82  * suspended until the vtime has progressed enough to cover it.
83  *
84  * 2-2. Vrate Adjustment
85  *
86  * It's unrealistic to expect the cost model to be perfect.  There are too
87  * many devices and even on the same device the overall performance
88  * fluctuates depending on numerous factors such as IO mixture and device
89  * internal garbage collection.  The controller needs to adapt dynamically.
90  *
91  * This is achieved by adjusting the overall IO rate according to how busy
92  * the device is.  If the device becomes overloaded, we're sending down too
93  * many IOs and should generally slow down.  If there are waiting issuers
94  * but the device isn't saturated, we're issuing too few and should
95  * generally speed up.
96  *
97  * To slow down, we lower the vrate - the rate at which the device vtime
98  * passes compared to the wall clock.  For example, if the vtime is running
99  * at the vrate of 75%, all cgroups added up would only be able to issue
100  * 750ms worth of IOs per second, and vice-versa for speeding up.
101  *
102  * Device business is determined using two criteria - rq wait and
103  * completion latencies.
104  *
105  * When a device gets saturated, the on-device and then the request queues
106  * fill up and a bio which is ready to be issued has to wait for a request
107  * to become available.  When this delay becomes noticeable, it's a clear
108  * indication that the device is saturated and we lower the vrate.  This
109  * saturation signal is fairly conservative as it only triggers when both
110  * hardware and software queues are filled up, and is used as the default
111  * busy signal.
112  *
113  * As devices can have deep queues and be unfair in how the queued commands
114  * are executed, soley depending on rq wait may not result in satisfactory
115  * control quality.  For a better control quality, completion latency QoS
116  * parameters can be configured so that the device is considered saturated
117  * if N'th percentile completion latency rises above the set point.
118  *
119  * The completion latency requirements are a function of both the
120  * underlying device characteristics and the desired IO latency quality of
121  * service.  There is an inherent trade-off - the tighter the latency QoS,
122  * the higher the bandwidth lossage.  Latency QoS is disabled by default
123  * and can be set through /sys/fs/cgroup/io.cost.qos.
124  *
125  * 2-3. Work Conservation
126  *
127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
128  * periodically while B is sending out enough parallel IOs to saturate the
129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
130  * cost per second, i.e., 10% of the device capacity.  The naive
131  * distribution of half and half would lead to 60% utilization of the
132  * device, a significant reduction in the total amount of work done
133  * compared to free-for-all competition.  This is too high a cost to pay
134  * for IO control.
135  *
136  * To conserve the total amount of work done, we keep track of how much
137  * each active cgroup is actually using and yield part of its weight if
138  * there are other cgroups which can make use of it.  In the above case,
139  * A's weight will be lowered so that it hovers above the actual usage and
140  * B would be able to use the rest.
141  *
142  * As we don't want to penalize a cgroup for donating its weight, the
143  * surplus weight adjustment factors in a margin and has an immediate
144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
145  *
146  * Note that adjusting down surplus weights has the same effects as
147  * accelerating vtime for other cgroups and work conservation can also be
148  * implemented by adjusting vrate dynamically.  However, squaring who can
149  * donate and should take back how much requires hweight propagations
150  * anyway making it easier to implement and understand as a separate
151  * mechanism.
152  *
153  * 3. Monitoring
154  *
155  * Instead of debugfs or other clumsy monitoring mechanisms, this
156  * controller uses a drgn based monitoring script -
157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
158  * https://github.com/osandov/drgn.  The ouput looks like the following.
159  *
160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
161  *                 active      weight      hweight% inflt% dbt  delay usages%
162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
164  *
165  * - per	: Timer period
166  * - cur_per	: Internal wall and device vtime clock
167  * - vrate	: Device virtual time rate against wall clock
168  * - weight	: Surplus-adjusted and configured weights
169  * - hweight	: Surplus-adjusted and configured hierarchical weights
170  * - inflt	: The percentage of in-flight IO cost at the end of last period
171  * - del_ms	: Deferred issuer delay induction level and duration
172  * - usages	: Usage history
173  */
174 
175 #include <linux/kernel.h>
176 #include <linux/module.h>
177 #include <linux/timer.h>
178 #include <linux/time64.h>
179 #include <linux/parser.h>
180 #include <linux/sched/signal.h>
181 #include <linux/blk-cgroup.h>
182 #include <asm/local.h>
183 #include <asm/local64.h>
184 #include "blk-rq-qos.h"
185 #include "blk-stat.h"
186 #include "blk-wbt.h"
187 
188 #ifdef CONFIG_TRACEPOINTS
189 
190 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
191 #define TRACE_IOCG_PATH_LEN 1024
192 static DEFINE_SPINLOCK(trace_iocg_path_lock);
193 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
194 
195 #define TRACE_IOCG_PATH(type, iocg, ...)					\
196 	do {									\
197 		unsigned long flags;						\
198 		if (trace_iocost_##type##_enabled()) {				\
199 			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
200 			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
201 				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
202 			trace_iocost_##type(iocg, trace_iocg_path,		\
203 					      ##__VA_ARGS__);			\
204 			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
205 		}								\
206 	} while (0)
207 
208 #else	/* CONFIG_TRACE_POINTS */
209 #define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
210 #endif	/* CONFIG_TRACE_POINTS */
211 
212 enum {
213 	MILLION			= 1000000,
214 
215 	/* timer period is calculated from latency requirements, bound it */
216 	MIN_PERIOD		= USEC_PER_MSEC,
217 	MAX_PERIOD		= USEC_PER_SEC,
218 
219 	/*
220 	 * iocg->vtime is targeted at 50% behind the device vtime, which
221 	 * serves as its IO credit buffer.  Surplus weight adjustment is
222 	 * immediately canceled if the vtime margin runs below 10%.
223 	 */
224 	MARGIN_MIN_PCT		= 10,
225 	MARGIN_LOW_PCT		= 20,
226 	MARGIN_TARGET_PCT	= 50,
227 
228 	INUSE_ADJ_STEP_PCT	= 25,
229 
230 	/* Have some play in timer operations */
231 	TIMER_SLACK_PCT		= 1,
232 
233 	/* 1/64k is granular enough and can easily be handled w/ u32 */
234 	WEIGHT_ONE		= 1 << 16,
235 
236 	/*
237 	 * As vtime is used to calculate the cost of each IO, it needs to
238 	 * be fairly high precision.  For example, it should be able to
239 	 * represent the cost of a single page worth of discard with
240 	 * suffificient accuracy.  At the same time, it should be able to
241 	 * represent reasonably long enough durations to be useful and
242 	 * convenient during operation.
243 	 *
244 	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
245 	 * granularity and days of wrap-around time even at extreme vrates.
246 	 */
247 	VTIME_PER_SEC_SHIFT	= 37,
248 	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
249 	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
250 	VTIME_PER_NSEC		= VTIME_PER_SEC / NSEC_PER_SEC,
251 
252 	/* bound vrate adjustments within two orders of magnitude */
253 	VRATE_MIN_PPM		= 10000,	/* 1% */
254 	VRATE_MAX_PPM		= 100000000,	/* 10000% */
255 
256 	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
257 	VRATE_CLAMP_ADJ_PCT	= 4,
258 
259 	/* if IOs end up waiting for requests, issue less */
260 	RQ_WAIT_BUSY_PCT	= 5,
261 
262 	/* unbusy hysterisis */
263 	UNBUSY_THR_PCT		= 75,
264 
265 	/*
266 	 * The effect of delay is indirect and non-linear and a huge amount of
267 	 * future debt can accumulate abruptly while unthrottled. Linearly scale
268 	 * up delay as debt is going up and then let it decay exponentially.
269 	 * This gives us quick ramp ups while delay is accumulating and long
270 	 * tails which can help reducing the frequency of debt explosions on
271 	 * unthrottle. The parameters are experimentally determined.
272 	 *
273 	 * The delay mechanism provides adequate protection and behavior in many
274 	 * cases. However, this is far from ideal and falls shorts on both
275 	 * fronts. The debtors are often throttled too harshly costing a
276 	 * significant level of fairness and possibly total work while the
277 	 * protection against their impacts on the system can be choppy and
278 	 * unreliable.
279 	 *
280 	 * The shortcoming primarily stems from the fact that, unlike for page
281 	 * cache, the kernel doesn't have well-defined back-pressure propagation
282 	 * mechanism and policies for anonymous memory. Fully addressing this
283 	 * issue will likely require substantial improvements in the area.
284 	 */
285 	MIN_DELAY_THR_PCT	= 500,
286 	MAX_DELAY_THR_PCT	= 25000,
287 	MIN_DELAY		= 250,
288 	MAX_DELAY		= 250 * USEC_PER_MSEC,
289 
290 	/* halve debts if avg usage over 100ms is under 50% */
291 	DFGV_USAGE_PCT		= 50,
292 	DFGV_PERIOD		= 100 * USEC_PER_MSEC,
293 
294 	/* don't let cmds which take a very long time pin lagging for too long */
295 	MAX_LAGGING_PERIODS	= 10,
296 
297 	/* switch iff the conditions are met for longer than this */
298 	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
299 
300 	/*
301 	 * Count IO size in 4k pages.  The 12bit shift helps keeping
302 	 * size-proportional components of cost calculation in closer
303 	 * numbers of digits to per-IO cost components.
304 	 */
305 	IOC_PAGE_SHIFT		= 12,
306 	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
307 	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
308 
309 	/* if apart further than 16M, consider randio for linear model */
310 	LCOEF_RANDIO_PAGES	= 4096,
311 };
312 
313 enum ioc_running {
314 	IOC_IDLE,
315 	IOC_RUNNING,
316 	IOC_STOP,
317 };
318 
319 /* io.cost.qos controls including per-dev enable of the whole controller */
320 enum {
321 	QOS_ENABLE,
322 	QOS_CTRL,
323 	NR_QOS_CTRL_PARAMS,
324 };
325 
326 /* io.cost.qos params */
327 enum {
328 	QOS_RPPM,
329 	QOS_RLAT,
330 	QOS_WPPM,
331 	QOS_WLAT,
332 	QOS_MIN,
333 	QOS_MAX,
334 	NR_QOS_PARAMS,
335 };
336 
337 /* io.cost.model controls */
338 enum {
339 	COST_CTRL,
340 	COST_MODEL,
341 	NR_COST_CTRL_PARAMS,
342 };
343 
344 /* builtin linear cost model coefficients */
345 enum {
346 	I_LCOEF_RBPS,
347 	I_LCOEF_RSEQIOPS,
348 	I_LCOEF_RRANDIOPS,
349 	I_LCOEF_WBPS,
350 	I_LCOEF_WSEQIOPS,
351 	I_LCOEF_WRANDIOPS,
352 	NR_I_LCOEFS,
353 };
354 
355 enum {
356 	LCOEF_RPAGE,
357 	LCOEF_RSEQIO,
358 	LCOEF_RRANDIO,
359 	LCOEF_WPAGE,
360 	LCOEF_WSEQIO,
361 	LCOEF_WRANDIO,
362 	NR_LCOEFS,
363 };
364 
365 enum {
366 	AUTOP_INVALID,
367 	AUTOP_HDD,
368 	AUTOP_SSD_QD1,
369 	AUTOP_SSD_DFL,
370 	AUTOP_SSD_FAST,
371 };
372 
373 struct ioc_gq;
374 
375 struct ioc_params {
376 	u32				qos[NR_QOS_PARAMS];
377 	u64				i_lcoefs[NR_I_LCOEFS];
378 	u64				lcoefs[NR_LCOEFS];
379 	u32				too_fast_vrate_pct;
380 	u32				too_slow_vrate_pct;
381 };
382 
383 struct ioc_margins {
384 	s64				min;
385 	s64				low;
386 	s64				target;
387 };
388 
389 struct ioc_missed {
390 	local_t				nr_met;
391 	local_t				nr_missed;
392 	u32				last_met;
393 	u32				last_missed;
394 };
395 
396 struct ioc_pcpu_stat {
397 	struct ioc_missed		missed[2];
398 
399 	local64_t			rq_wait_ns;
400 	u64				last_rq_wait_ns;
401 };
402 
403 /* per device */
404 struct ioc {
405 	struct rq_qos			rqos;
406 
407 	bool				enabled;
408 
409 	struct ioc_params		params;
410 	struct ioc_margins		margins;
411 	u32				period_us;
412 	u32				timer_slack_ns;
413 	u64				vrate_min;
414 	u64				vrate_max;
415 
416 	spinlock_t			lock;
417 	struct timer_list		timer;
418 	struct list_head		active_iocgs;	/* active cgroups */
419 	struct ioc_pcpu_stat __percpu	*pcpu_stat;
420 
421 	enum ioc_running		running;
422 	atomic64_t			vtime_rate;
423 	u64				vtime_base_rate;
424 	s64				vtime_err;
425 
426 	seqcount_spinlock_t		period_seqcount;
427 	u64				period_at;	/* wallclock starttime */
428 	u64				period_at_vtime; /* vtime starttime */
429 
430 	atomic64_t			cur_period;	/* inc'd each period */
431 	int				busy_level;	/* saturation history */
432 
433 	bool				weights_updated;
434 	atomic_t			hweight_gen;	/* for lazy hweights */
435 
436 	/* debt forgivness */
437 	u64				dfgv_period_at;
438 	u64				dfgv_period_rem;
439 	u64				dfgv_usage_us_sum;
440 
441 	u64				autop_too_fast_at;
442 	u64				autop_too_slow_at;
443 	int				autop_idx;
444 	bool				user_qos_params:1;
445 	bool				user_cost_model:1;
446 };
447 
448 struct iocg_pcpu_stat {
449 	local64_t			abs_vusage;
450 };
451 
452 struct iocg_stat {
453 	u64				usage_us;
454 	u64				wait_us;
455 	u64				indebt_us;
456 	u64				indelay_us;
457 };
458 
459 /* per device-cgroup pair */
460 struct ioc_gq {
461 	struct blkg_policy_data		pd;
462 	struct ioc			*ioc;
463 
464 	/*
465 	 * A iocg can get its weight from two sources - an explicit
466 	 * per-device-cgroup configuration or the default weight of the
467 	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
468 	 * configuration.  `weight` is the effective considering both
469 	 * sources.
470 	 *
471 	 * When an idle cgroup becomes active its `active` goes from 0 to
472 	 * `weight`.  `inuse` is the surplus adjusted active weight.
473 	 * `active` and `inuse` are used to calculate `hweight_active` and
474 	 * `hweight_inuse`.
475 	 *
476 	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
477 	 * surplus adjustments.
478 	 *
479 	 * `inuse` may be adjusted dynamically during period. `saved_*` are used
480 	 * to determine and track adjustments.
481 	 */
482 	u32				cfg_weight;
483 	u32				weight;
484 	u32				active;
485 	u32				inuse;
486 
487 	u32				last_inuse;
488 	s64				saved_margin;
489 
490 	sector_t			cursor;		/* to detect randio */
491 
492 	/*
493 	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
494 	 * issued.  If lagging behind device vtime, the delta represents
495 	 * the currently available IO budget.  If runnning ahead, the
496 	 * overage.
497 	 *
498 	 * `vtime_done` is the same but progressed on completion rather
499 	 * than issue.  The delta behind `vtime` represents the cost of
500 	 * currently in-flight IOs.
501 	 */
502 	atomic64_t			vtime;
503 	atomic64_t			done_vtime;
504 	u64				abs_vdebt;
505 
506 	/* current delay in effect and when it started */
507 	u64				delay;
508 	u64				delay_at;
509 
510 	/*
511 	 * The period this iocg was last active in.  Used for deactivation
512 	 * and invalidating `vtime`.
513 	 */
514 	atomic64_t			active_period;
515 	struct list_head		active_list;
516 
517 	/* see __propagate_weights() and current_hweight() for details */
518 	u64				child_active_sum;
519 	u64				child_inuse_sum;
520 	u64				child_adjusted_sum;
521 	int				hweight_gen;
522 	u32				hweight_active;
523 	u32				hweight_inuse;
524 	u32				hweight_donating;
525 	u32				hweight_after_donation;
526 
527 	struct list_head		walk_list;
528 	struct list_head		surplus_list;
529 
530 	struct wait_queue_head		waitq;
531 	struct hrtimer			waitq_timer;
532 
533 	/* timestamp at the latest activation */
534 	u64				activated_at;
535 
536 	/* statistics */
537 	struct iocg_pcpu_stat __percpu	*pcpu_stat;
538 	struct iocg_stat		local_stat;
539 	struct iocg_stat		desc_stat;
540 	struct iocg_stat		last_stat;
541 	u64				last_stat_abs_vusage;
542 	u64				usage_delta_us;
543 	u64				wait_since;
544 	u64				indebt_since;
545 	u64				indelay_since;
546 
547 	/* this iocg's depth in the hierarchy and ancestors including self */
548 	int				level;
549 	struct ioc_gq			*ancestors[];
550 };
551 
552 /* per cgroup */
553 struct ioc_cgrp {
554 	struct blkcg_policy_data	cpd;
555 	unsigned int			dfl_weight;
556 };
557 
558 struct ioc_now {
559 	u64				now_ns;
560 	u64				now;
561 	u64				vnow;
562 	u64				vrate;
563 };
564 
565 struct iocg_wait {
566 	struct wait_queue_entry		wait;
567 	struct bio			*bio;
568 	u64				abs_cost;
569 	bool				committed;
570 };
571 
572 struct iocg_wake_ctx {
573 	struct ioc_gq			*iocg;
574 	u32				hw_inuse;
575 	s64				vbudget;
576 };
577 
578 static const struct ioc_params autop[] = {
579 	[AUTOP_HDD] = {
580 		.qos				= {
581 			[QOS_RLAT]		=        250000, /* 250ms */
582 			[QOS_WLAT]		=        250000,
583 			[QOS_MIN]		= VRATE_MIN_PPM,
584 			[QOS_MAX]		= VRATE_MAX_PPM,
585 		},
586 		.i_lcoefs			= {
587 			[I_LCOEF_RBPS]		=     174019176,
588 			[I_LCOEF_RSEQIOPS]	=         41708,
589 			[I_LCOEF_RRANDIOPS]	=           370,
590 			[I_LCOEF_WBPS]		=     178075866,
591 			[I_LCOEF_WSEQIOPS]	=         42705,
592 			[I_LCOEF_WRANDIOPS]	=           378,
593 		},
594 	},
595 	[AUTOP_SSD_QD1] = {
596 		.qos				= {
597 			[QOS_RLAT]		=         25000, /* 25ms */
598 			[QOS_WLAT]		=         25000,
599 			[QOS_MIN]		= VRATE_MIN_PPM,
600 			[QOS_MAX]		= VRATE_MAX_PPM,
601 		},
602 		.i_lcoefs			= {
603 			[I_LCOEF_RBPS]		=     245855193,
604 			[I_LCOEF_RSEQIOPS]	=         61575,
605 			[I_LCOEF_RRANDIOPS]	=          6946,
606 			[I_LCOEF_WBPS]		=     141365009,
607 			[I_LCOEF_WSEQIOPS]	=         33716,
608 			[I_LCOEF_WRANDIOPS]	=         26796,
609 		},
610 	},
611 	[AUTOP_SSD_DFL] = {
612 		.qos				= {
613 			[QOS_RLAT]		=         25000, /* 25ms */
614 			[QOS_WLAT]		=         25000,
615 			[QOS_MIN]		= VRATE_MIN_PPM,
616 			[QOS_MAX]		= VRATE_MAX_PPM,
617 		},
618 		.i_lcoefs			= {
619 			[I_LCOEF_RBPS]		=     488636629,
620 			[I_LCOEF_RSEQIOPS]	=          8932,
621 			[I_LCOEF_RRANDIOPS]	=          8518,
622 			[I_LCOEF_WBPS]		=     427891549,
623 			[I_LCOEF_WSEQIOPS]	=         28755,
624 			[I_LCOEF_WRANDIOPS]	=         21940,
625 		},
626 		.too_fast_vrate_pct		=           500,
627 	},
628 	[AUTOP_SSD_FAST] = {
629 		.qos				= {
630 			[QOS_RLAT]		=          5000, /* 5ms */
631 			[QOS_WLAT]		=          5000,
632 			[QOS_MIN]		= VRATE_MIN_PPM,
633 			[QOS_MAX]		= VRATE_MAX_PPM,
634 		},
635 		.i_lcoefs			= {
636 			[I_LCOEF_RBPS]		=    3102524156LLU,
637 			[I_LCOEF_RSEQIOPS]	=        724816,
638 			[I_LCOEF_RRANDIOPS]	=        778122,
639 			[I_LCOEF_WBPS]		=    1742780862LLU,
640 			[I_LCOEF_WSEQIOPS]	=        425702,
641 			[I_LCOEF_WRANDIOPS]	=	 443193,
642 		},
643 		.too_slow_vrate_pct		=            10,
644 	},
645 };
646 
647 /*
648  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
649  * vtime credit shortage and down on device saturation.
650  */
651 static u32 vrate_adj_pct[] =
652 	{ 0, 0, 0, 0,
653 	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
654 	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
655 	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
656 
657 static struct blkcg_policy blkcg_policy_iocost;
658 
659 /* accessors and helpers */
660 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
661 {
662 	return container_of(rqos, struct ioc, rqos);
663 }
664 
665 static struct ioc *q_to_ioc(struct request_queue *q)
666 {
667 	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
668 }
669 
670 static const char *q_name(struct request_queue *q)
671 {
672 	if (blk_queue_registered(q))
673 		return kobject_name(q->kobj.parent);
674 	else
675 		return "<unknown>";
676 }
677 
678 static const char __maybe_unused *ioc_name(struct ioc *ioc)
679 {
680 	return q_name(ioc->rqos.q);
681 }
682 
683 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
684 {
685 	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
686 }
687 
688 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
689 {
690 	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
691 }
692 
693 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
694 {
695 	return pd_to_blkg(&iocg->pd);
696 }
697 
698 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
699 {
700 	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
701 			    struct ioc_cgrp, cpd);
702 }
703 
704 /*
705  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
706  * weight, the more expensive each IO.  Must round up.
707  */
708 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
709 {
710 	return DIV64_U64_ROUND_UP(abs_cost * WEIGHT_ONE, hw_inuse);
711 }
712 
713 /*
714  * The inverse of abs_cost_to_cost().  Must round up.
715  */
716 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
717 {
718 	return DIV64_U64_ROUND_UP(cost * hw_inuse, WEIGHT_ONE);
719 }
720 
721 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio,
722 			    u64 abs_cost, u64 cost)
723 {
724 	struct iocg_pcpu_stat *gcs;
725 
726 	bio->bi_iocost_cost = cost;
727 	atomic64_add(cost, &iocg->vtime);
728 
729 	gcs = get_cpu_ptr(iocg->pcpu_stat);
730 	local64_add(abs_cost, &gcs->abs_vusage);
731 	put_cpu_ptr(gcs);
732 }
733 
734 static void iocg_lock(struct ioc_gq *iocg, bool lock_ioc, unsigned long *flags)
735 {
736 	if (lock_ioc) {
737 		spin_lock_irqsave(&iocg->ioc->lock, *flags);
738 		spin_lock(&iocg->waitq.lock);
739 	} else {
740 		spin_lock_irqsave(&iocg->waitq.lock, *flags);
741 	}
742 }
743 
744 static void iocg_unlock(struct ioc_gq *iocg, bool unlock_ioc, unsigned long *flags)
745 {
746 	if (unlock_ioc) {
747 		spin_unlock(&iocg->waitq.lock);
748 		spin_unlock_irqrestore(&iocg->ioc->lock, *flags);
749 	} else {
750 		spin_unlock_irqrestore(&iocg->waitq.lock, *flags);
751 	}
752 }
753 
754 #define CREATE_TRACE_POINTS
755 #include <trace/events/iocost.h>
756 
757 static void ioc_refresh_margins(struct ioc *ioc)
758 {
759 	struct ioc_margins *margins = &ioc->margins;
760 	u32 period_us = ioc->period_us;
761 	u64 vrate = ioc->vtime_base_rate;
762 
763 	margins->min = (period_us * MARGIN_MIN_PCT / 100) * vrate;
764 	margins->low = (period_us * MARGIN_LOW_PCT / 100) * vrate;
765 	margins->target = (period_us * MARGIN_TARGET_PCT / 100) * vrate;
766 }
767 
768 /* latency Qos params changed, update period_us and all the dependent params */
769 static void ioc_refresh_period_us(struct ioc *ioc)
770 {
771 	u32 ppm, lat, multi, period_us;
772 
773 	lockdep_assert_held(&ioc->lock);
774 
775 	/* pick the higher latency target */
776 	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
777 		ppm = ioc->params.qos[QOS_RPPM];
778 		lat = ioc->params.qos[QOS_RLAT];
779 	} else {
780 		ppm = ioc->params.qos[QOS_WPPM];
781 		lat = ioc->params.qos[QOS_WLAT];
782 	}
783 
784 	/*
785 	 * We want the period to be long enough to contain a healthy number
786 	 * of IOs while short enough for granular control.  Define it as a
787 	 * multiple of the latency target.  Ideally, the multiplier should
788 	 * be scaled according to the percentile so that it would nominally
789 	 * contain a certain number of requests.  Let's be simpler and
790 	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
791 	 */
792 	if (ppm)
793 		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
794 	else
795 		multi = 2;
796 	period_us = multi * lat;
797 	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
798 
799 	/* calculate dependent params */
800 	ioc->period_us = period_us;
801 	ioc->timer_slack_ns = div64_u64(
802 		(u64)period_us * NSEC_PER_USEC * TIMER_SLACK_PCT,
803 		100);
804 	ioc_refresh_margins(ioc);
805 }
806 
807 static int ioc_autop_idx(struct ioc *ioc)
808 {
809 	int idx = ioc->autop_idx;
810 	const struct ioc_params *p = &autop[idx];
811 	u32 vrate_pct;
812 	u64 now_ns;
813 
814 	/* rotational? */
815 	if (!blk_queue_nonrot(ioc->rqos.q))
816 		return AUTOP_HDD;
817 
818 	/* handle SATA SSDs w/ broken NCQ */
819 	if (blk_queue_depth(ioc->rqos.q) == 1)
820 		return AUTOP_SSD_QD1;
821 
822 	/* use one of the normal ssd sets */
823 	if (idx < AUTOP_SSD_DFL)
824 		return AUTOP_SSD_DFL;
825 
826 	/* if user is overriding anything, maintain what was there */
827 	if (ioc->user_qos_params || ioc->user_cost_model)
828 		return idx;
829 
830 	/* step up/down based on the vrate */
831 	vrate_pct = div64_u64(ioc->vtime_base_rate * 100, VTIME_PER_USEC);
832 	now_ns = ktime_get_ns();
833 
834 	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
835 		if (!ioc->autop_too_fast_at)
836 			ioc->autop_too_fast_at = now_ns;
837 		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
838 			return idx + 1;
839 	} else {
840 		ioc->autop_too_fast_at = 0;
841 	}
842 
843 	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
844 		if (!ioc->autop_too_slow_at)
845 			ioc->autop_too_slow_at = now_ns;
846 		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
847 			return idx - 1;
848 	} else {
849 		ioc->autop_too_slow_at = 0;
850 	}
851 
852 	return idx;
853 }
854 
855 /*
856  * Take the followings as input
857  *
858  *  @bps	maximum sequential throughput
859  *  @seqiops	maximum sequential 4k iops
860  *  @randiops	maximum random 4k iops
861  *
862  * and calculate the linear model cost coefficients.
863  *
864  *  *@page	per-page cost		1s / (@bps / 4096)
865  *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
866  *  @randiops	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
867  */
868 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
869 			u64 *page, u64 *seqio, u64 *randio)
870 {
871 	u64 v;
872 
873 	*page = *seqio = *randio = 0;
874 
875 	if (bps)
876 		*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
877 					   DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
878 
879 	if (seqiops) {
880 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
881 		if (v > *page)
882 			*seqio = v - *page;
883 	}
884 
885 	if (randiops) {
886 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
887 		if (v > *page)
888 			*randio = v - *page;
889 	}
890 }
891 
892 static void ioc_refresh_lcoefs(struct ioc *ioc)
893 {
894 	u64 *u = ioc->params.i_lcoefs;
895 	u64 *c = ioc->params.lcoefs;
896 
897 	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
898 		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
899 	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
900 		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
901 }
902 
903 static bool ioc_refresh_params(struct ioc *ioc, bool force)
904 {
905 	const struct ioc_params *p;
906 	int idx;
907 
908 	lockdep_assert_held(&ioc->lock);
909 
910 	idx = ioc_autop_idx(ioc);
911 	p = &autop[idx];
912 
913 	if (idx == ioc->autop_idx && !force)
914 		return false;
915 
916 	if (idx != ioc->autop_idx)
917 		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
918 
919 	ioc->autop_idx = idx;
920 	ioc->autop_too_fast_at = 0;
921 	ioc->autop_too_slow_at = 0;
922 
923 	if (!ioc->user_qos_params)
924 		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
925 	if (!ioc->user_cost_model)
926 		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
927 
928 	ioc_refresh_period_us(ioc);
929 	ioc_refresh_lcoefs(ioc);
930 
931 	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
932 					    VTIME_PER_USEC, MILLION);
933 	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
934 				   VTIME_PER_USEC, MILLION);
935 
936 	return true;
937 }
938 
939 /*
940  * When an iocg accumulates too much vtime or gets deactivated, we throw away
941  * some vtime, which lowers the overall device utilization. As the exact amount
942  * which is being thrown away is known, we can compensate by accelerating the
943  * vrate accordingly so that the extra vtime generated in the current period
944  * matches what got lost.
945  */
946 static void ioc_refresh_vrate(struct ioc *ioc, struct ioc_now *now)
947 {
948 	s64 pleft = ioc->period_at + ioc->period_us - now->now;
949 	s64 vperiod = ioc->period_us * ioc->vtime_base_rate;
950 	s64 vcomp, vcomp_min, vcomp_max;
951 
952 	lockdep_assert_held(&ioc->lock);
953 
954 	/* we need some time left in this period */
955 	if (pleft <= 0)
956 		goto done;
957 
958 	/*
959 	 * Calculate how much vrate should be adjusted to offset the error.
960 	 * Limit the amount of adjustment and deduct the adjusted amount from
961 	 * the error.
962 	 */
963 	vcomp = -div64_s64(ioc->vtime_err, pleft);
964 	vcomp_min = -(ioc->vtime_base_rate >> 1);
965 	vcomp_max = ioc->vtime_base_rate;
966 	vcomp = clamp(vcomp, vcomp_min, vcomp_max);
967 
968 	ioc->vtime_err += vcomp * pleft;
969 
970 	atomic64_set(&ioc->vtime_rate, ioc->vtime_base_rate + vcomp);
971 done:
972 	/* bound how much error can accumulate */
973 	ioc->vtime_err = clamp(ioc->vtime_err, -vperiod, vperiod);
974 }
975 
976 /* take a snapshot of the current [v]time and vrate */
977 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
978 {
979 	unsigned seq;
980 
981 	now->now_ns = ktime_get();
982 	now->now = ktime_to_us(now->now_ns);
983 	now->vrate = atomic64_read(&ioc->vtime_rate);
984 
985 	/*
986 	 * The current vtime is
987 	 *
988 	 *   vtime at period start + (wallclock time since the start) * vrate
989 	 *
990 	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
991 	 * needed, they're seqcount protected.
992 	 */
993 	do {
994 		seq = read_seqcount_begin(&ioc->period_seqcount);
995 		now->vnow = ioc->period_at_vtime +
996 			(now->now - ioc->period_at) * now->vrate;
997 	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
998 }
999 
1000 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
1001 {
1002 	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
1003 
1004 	write_seqcount_begin(&ioc->period_seqcount);
1005 	ioc->period_at = now->now;
1006 	ioc->period_at_vtime = now->vnow;
1007 	write_seqcount_end(&ioc->period_seqcount);
1008 
1009 	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
1010 	add_timer(&ioc->timer);
1011 }
1012 
1013 /*
1014  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
1015  * weight sums and propagate upwards accordingly. If @save, the current margin
1016  * is saved to be used as reference for later inuse in-period adjustments.
1017  */
1018 static void __propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1019 				bool save, struct ioc_now *now)
1020 {
1021 	struct ioc *ioc = iocg->ioc;
1022 	int lvl;
1023 
1024 	lockdep_assert_held(&ioc->lock);
1025 
1026 	inuse = clamp_t(u32, inuse, 1, active);
1027 
1028 	iocg->last_inuse = iocg->inuse;
1029 	if (save)
1030 		iocg->saved_margin = now->vnow - atomic64_read(&iocg->vtime);
1031 
1032 	if (active == iocg->active && inuse == iocg->inuse)
1033 		return;
1034 
1035 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1036 		struct ioc_gq *parent = iocg->ancestors[lvl];
1037 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1038 		u32 parent_active = 0, parent_inuse = 0;
1039 
1040 		/* update the level sums */
1041 		parent->child_active_sum += (s32)(active - child->active);
1042 		parent->child_inuse_sum += (s32)(inuse - child->inuse);
1043 		/* apply the udpates */
1044 		child->active = active;
1045 		child->inuse = inuse;
1046 
1047 		/*
1048 		 * The delta between inuse and active sums indicates that
1049 		 * that much of weight is being given away.  Parent's inuse
1050 		 * and active should reflect the ratio.
1051 		 */
1052 		if (parent->child_active_sum) {
1053 			parent_active = parent->weight;
1054 			parent_inuse = DIV64_U64_ROUND_UP(
1055 				parent_active * parent->child_inuse_sum,
1056 				parent->child_active_sum);
1057 		}
1058 
1059 		/* do we need to keep walking up? */
1060 		if (parent_active == parent->active &&
1061 		    parent_inuse == parent->inuse)
1062 			break;
1063 
1064 		active = parent_active;
1065 		inuse = parent_inuse;
1066 	}
1067 
1068 	ioc->weights_updated = true;
1069 }
1070 
1071 static void commit_weights(struct ioc *ioc)
1072 {
1073 	lockdep_assert_held(&ioc->lock);
1074 
1075 	if (ioc->weights_updated) {
1076 		/* paired with rmb in current_hweight(), see there */
1077 		smp_wmb();
1078 		atomic_inc(&ioc->hweight_gen);
1079 		ioc->weights_updated = false;
1080 	}
1081 }
1082 
1083 static void propagate_weights(struct ioc_gq *iocg, u32 active, u32 inuse,
1084 			      bool save, struct ioc_now *now)
1085 {
1086 	__propagate_weights(iocg, active, inuse, save, now);
1087 	commit_weights(iocg->ioc);
1088 }
1089 
1090 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
1091 {
1092 	struct ioc *ioc = iocg->ioc;
1093 	int lvl;
1094 	u32 hwa, hwi;
1095 	int ioc_gen;
1096 
1097 	/* hot path - if uptodate, use cached */
1098 	ioc_gen = atomic_read(&ioc->hweight_gen);
1099 	if (ioc_gen == iocg->hweight_gen)
1100 		goto out;
1101 
1102 	/*
1103 	 * Paired with wmb in commit_weights(). If we saw the updated
1104 	 * hweight_gen, all the weight updates from __propagate_weights() are
1105 	 * visible too.
1106 	 *
1107 	 * We can race with weight updates during calculation and get it
1108 	 * wrong.  However, hweight_gen would have changed and a future
1109 	 * reader will recalculate and we're guaranteed to discard the
1110 	 * wrong result soon.
1111 	 */
1112 	smp_rmb();
1113 
1114 	hwa = hwi = WEIGHT_ONE;
1115 	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
1116 		struct ioc_gq *parent = iocg->ancestors[lvl];
1117 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1118 		u64 active_sum = READ_ONCE(parent->child_active_sum);
1119 		u64 inuse_sum = READ_ONCE(parent->child_inuse_sum);
1120 		u32 active = READ_ONCE(child->active);
1121 		u32 inuse = READ_ONCE(child->inuse);
1122 
1123 		/* we can race with deactivations and either may read as zero */
1124 		if (!active_sum || !inuse_sum)
1125 			continue;
1126 
1127 		active_sum = max_t(u64, active, active_sum);
1128 		hwa = div64_u64((u64)hwa * active, active_sum);
1129 
1130 		inuse_sum = max_t(u64, inuse, inuse_sum);
1131 		hwi = div64_u64((u64)hwi * inuse, inuse_sum);
1132 	}
1133 
1134 	iocg->hweight_active = max_t(u32, hwa, 1);
1135 	iocg->hweight_inuse = max_t(u32, hwi, 1);
1136 	iocg->hweight_gen = ioc_gen;
1137 out:
1138 	if (hw_activep)
1139 		*hw_activep = iocg->hweight_active;
1140 	if (hw_inusep)
1141 		*hw_inusep = iocg->hweight_inuse;
1142 }
1143 
1144 /*
1145  * Calculate the hweight_inuse @iocg would get with max @inuse assuming all the
1146  * other weights stay unchanged.
1147  */
1148 static u32 current_hweight_max(struct ioc_gq *iocg)
1149 {
1150 	u32 hwm = WEIGHT_ONE;
1151 	u32 inuse = iocg->active;
1152 	u64 child_inuse_sum;
1153 	int lvl;
1154 
1155 	lockdep_assert_held(&iocg->ioc->lock);
1156 
1157 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1158 		struct ioc_gq *parent = iocg->ancestors[lvl];
1159 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
1160 
1161 		child_inuse_sum = parent->child_inuse_sum + inuse - child->inuse;
1162 		hwm = div64_u64((u64)hwm * inuse, child_inuse_sum);
1163 		inuse = DIV64_U64_ROUND_UP(parent->active * child_inuse_sum,
1164 					   parent->child_active_sum);
1165 	}
1166 
1167 	return max_t(u32, hwm, 1);
1168 }
1169 
1170 static void weight_updated(struct ioc_gq *iocg, struct ioc_now *now)
1171 {
1172 	struct ioc *ioc = iocg->ioc;
1173 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1174 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1175 	u32 weight;
1176 
1177 	lockdep_assert_held(&ioc->lock);
1178 
1179 	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1180 	if (weight != iocg->weight && iocg->active)
1181 		propagate_weights(iocg, weight, iocg->inuse, true, now);
1182 	iocg->weight = weight;
1183 }
1184 
1185 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1186 {
1187 	struct ioc *ioc = iocg->ioc;
1188 	u64 last_period, cur_period;
1189 	u64 vtime, vtarget;
1190 	int i;
1191 
1192 	/*
1193 	 * If seem to be already active, just update the stamp to tell the
1194 	 * timer that we're still active.  We don't mind occassional races.
1195 	 */
1196 	if (!list_empty(&iocg->active_list)) {
1197 		ioc_now(ioc, now);
1198 		cur_period = atomic64_read(&ioc->cur_period);
1199 		if (atomic64_read(&iocg->active_period) != cur_period)
1200 			atomic64_set(&iocg->active_period, cur_period);
1201 		return true;
1202 	}
1203 
1204 	/* racy check on internal node IOs, treat as root level IOs */
1205 	if (iocg->child_active_sum)
1206 		return false;
1207 
1208 	spin_lock_irq(&ioc->lock);
1209 
1210 	ioc_now(ioc, now);
1211 
1212 	/* update period */
1213 	cur_period = atomic64_read(&ioc->cur_period);
1214 	last_period = atomic64_read(&iocg->active_period);
1215 	atomic64_set(&iocg->active_period, cur_period);
1216 
1217 	/* already activated or breaking leaf-only constraint? */
1218 	if (!list_empty(&iocg->active_list))
1219 		goto succeed_unlock;
1220 	for (i = iocg->level - 1; i > 0; i--)
1221 		if (!list_empty(&iocg->ancestors[i]->active_list))
1222 			goto fail_unlock;
1223 
1224 	if (iocg->child_active_sum)
1225 		goto fail_unlock;
1226 
1227 	/*
1228 	 * Always start with the target budget. On deactivation, we throw away
1229 	 * anything above it.
1230 	 */
1231 	vtarget = now->vnow - ioc->margins.target;
1232 	vtime = atomic64_read(&iocg->vtime);
1233 
1234 	atomic64_add(vtarget - vtime, &iocg->vtime);
1235 	atomic64_add(vtarget - vtime, &iocg->done_vtime);
1236 	vtime = vtarget;
1237 
1238 	/*
1239 	 * Activate, propagate weight and start period timer if not
1240 	 * running.  Reset hweight_gen to avoid accidental match from
1241 	 * wrapping.
1242 	 */
1243 	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1244 	list_add(&iocg->active_list, &ioc->active_iocgs);
1245 
1246 	propagate_weights(iocg, iocg->weight,
1247 			  iocg->last_inuse ?: iocg->weight, true, now);
1248 
1249 	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1250 			last_period, cur_period, vtime);
1251 
1252 	iocg->activated_at = now->now;
1253 
1254 	if (ioc->running == IOC_IDLE) {
1255 		ioc->running = IOC_RUNNING;
1256 		ioc->dfgv_period_at = now->now;
1257 		ioc->dfgv_period_rem = 0;
1258 		ioc_start_period(ioc, now);
1259 	}
1260 
1261 succeed_unlock:
1262 	spin_unlock_irq(&ioc->lock);
1263 	return true;
1264 
1265 fail_unlock:
1266 	spin_unlock_irq(&ioc->lock);
1267 	return false;
1268 }
1269 
1270 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1271 {
1272 	struct ioc *ioc = iocg->ioc;
1273 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1274 	u64 tdelta, delay, new_delay;
1275 	s64 vover, vover_pct;
1276 	u32 hwa;
1277 
1278 	lockdep_assert_held(&iocg->waitq.lock);
1279 
1280 	/* calculate the current delay in effect - 1/2 every second */
1281 	tdelta = now->now - iocg->delay_at;
1282 	if (iocg->delay)
1283 		delay = iocg->delay >> div64_u64(tdelta, USEC_PER_SEC);
1284 	else
1285 		delay = 0;
1286 
1287 	/* calculate the new delay from the debt amount */
1288 	current_hweight(iocg, &hwa, NULL);
1289 	vover = atomic64_read(&iocg->vtime) +
1290 		abs_cost_to_cost(iocg->abs_vdebt, hwa) - now->vnow;
1291 	vover_pct = div64_s64(100 * vover,
1292 			      ioc->period_us * ioc->vtime_base_rate);
1293 
1294 	if (vover_pct <= MIN_DELAY_THR_PCT)
1295 		new_delay = 0;
1296 	else if (vover_pct >= MAX_DELAY_THR_PCT)
1297 		new_delay = MAX_DELAY;
1298 	else
1299 		new_delay = MIN_DELAY +
1300 			div_u64((MAX_DELAY - MIN_DELAY) *
1301 				(vover_pct - MIN_DELAY_THR_PCT),
1302 				MAX_DELAY_THR_PCT - MIN_DELAY_THR_PCT);
1303 
1304 	/* pick the higher one and apply */
1305 	if (new_delay > delay) {
1306 		iocg->delay = new_delay;
1307 		iocg->delay_at = now->now;
1308 		delay = new_delay;
1309 	}
1310 
1311 	if (delay >= MIN_DELAY) {
1312 		if (!iocg->indelay_since)
1313 			iocg->indelay_since = now->now;
1314 		blkcg_set_delay(blkg, delay * NSEC_PER_USEC);
1315 		return true;
1316 	} else {
1317 		if (iocg->indelay_since) {
1318 			iocg->local_stat.indelay_us += now->now - iocg->indelay_since;
1319 			iocg->indelay_since = 0;
1320 		}
1321 		iocg->delay = 0;
1322 		blkcg_clear_delay(blkg);
1323 		return false;
1324 	}
1325 }
1326 
1327 static void iocg_incur_debt(struct ioc_gq *iocg, u64 abs_cost,
1328 			    struct ioc_now *now)
1329 {
1330 	struct iocg_pcpu_stat *gcs;
1331 
1332 	lockdep_assert_held(&iocg->ioc->lock);
1333 	lockdep_assert_held(&iocg->waitq.lock);
1334 	WARN_ON_ONCE(list_empty(&iocg->active_list));
1335 
1336 	/*
1337 	 * Once in debt, debt handling owns inuse. @iocg stays at the minimum
1338 	 * inuse donating all of it share to others until its debt is paid off.
1339 	 */
1340 	if (!iocg->abs_vdebt && abs_cost) {
1341 		iocg->indebt_since = now->now;
1342 		propagate_weights(iocg, iocg->active, 0, false, now);
1343 	}
1344 
1345 	iocg->abs_vdebt += abs_cost;
1346 
1347 	gcs = get_cpu_ptr(iocg->pcpu_stat);
1348 	local64_add(abs_cost, &gcs->abs_vusage);
1349 	put_cpu_ptr(gcs);
1350 }
1351 
1352 static void iocg_pay_debt(struct ioc_gq *iocg, u64 abs_vpay,
1353 			  struct ioc_now *now)
1354 {
1355 	lockdep_assert_held(&iocg->ioc->lock);
1356 	lockdep_assert_held(&iocg->waitq.lock);
1357 
1358 	/* make sure that nobody messed with @iocg */
1359 	WARN_ON_ONCE(list_empty(&iocg->active_list));
1360 	WARN_ON_ONCE(iocg->inuse > 1);
1361 
1362 	iocg->abs_vdebt -= min(abs_vpay, iocg->abs_vdebt);
1363 
1364 	/* if debt is paid in full, restore inuse */
1365 	if (!iocg->abs_vdebt) {
1366 		iocg->local_stat.indebt_us += now->now - iocg->indebt_since;
1367 		iocg->indebt_since = 0;
1368 
1369 		propagate_weights(iocg, iocg->active, iocg->last_inuse,
1370 				  false, now);
1371 	}
1372 }
1373 
1374 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1375 			int flags, void *key)
1376 {
1377 	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1378 	struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1379 	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1380 
1381 	ctx->vbudget -= cost;
1382 
1383 	if (ctx->vbudget < 0)
1384 		return -1;
1385 
1386 	iocg_commit_bio(ctx->iocg, wait->bio, wait->abs_cost, cost);
1387 
1388 	/*
1389 	 * autoremove_wake_function() removes the wait entry only when it
1390 	 * actually changed the task state.  We want the wait always
1391 	 * removed.  Remove explicitly and use default_wake_function().
1392 	 */
1393 	list_del_init(&wq_entry->entry);
1394 	wait->committed = true;
1395 
1396 	default_wake_function(wq_entry, mode, flags, key);
1397 	return 0;
1398 }
1399 
1400 /*
1401  * Calculate the accumulated budget, pay debt if @pay_debt and wake up waiters
1402  * accordingly. When @pay_debt is %true, the caller must be holding ioc->lock in
1403  * addition to iocg->waitq.lock.
1404  */
1405 static void iocg_kick_waitq(struct ioc_gq *iocg, bool pay_debt,
1406 			    struct ioc_now *now)
1407 {
1408 	struct ioc *ioc = iocg->ioc;
1409 	struct iocg_wake_ctx ctx = { .iocg = iocg };
1410 	u64 vshortage, expires, oexpires;
1411 	s64 vbudget;
1412 	u32 hwa;
1413 
1414 	lockdep_assert_held(&iocg->waitq.lock);
1415 
1416 	current_hweight(iocg, &hwa, NULL);
1417 	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1418 
1419 	/* pay off debt */
1420 	if (pay_debt && iocg->abs_vdebt && vbudget > 0) {
1421 		u64 abs_vbudget = cost_to_abs_cost(vbudget, hwa);
1422 		u64 abs_vpay = min_t(u64, abs_vbudget, iocg->abs_vdebt);
1423 		u64 vpay = abs_cost_to_cost(abs_vpay, hwa);
1424 
1425 		lockdep_assert_held(&ioc->lock);
1426 
1427 		atomic64_add(vpay, &iocg->vtime);
1428 		atomic64_add(vpay, &iocg->done_vtime);
1429 		iocg_pay_debt(iocg, abs_vpay, now);
1430 		vbudget -= vpay;
1431 	}
1432 
1433 	if (iocg->abs_vdebt || iocg->delay)
1434 		iocg_kick_delay(iocg, now);
1435 
1436 	/*
1437 	 * Debt can still be outstanding if we haven't paid all yet or the
1438 	 * caller raced and called without @pay_debt. Shouldn't wake up waiters
1439 	 * under debt. Make sure @vbudget reflects the outstanding amount and is
1440 	 * not positive.
1441 	 */
1442 	if (iocg->abs_vdebt) {
1443 		s64 vdebt = abs_cost_to_cost(iocg->abs_vdebt, hwa);
1444 		vbudget = min_t(s64, 0, vbudget - vdebt);
1445 	}
1446 
1447 	/*
1448 	 * Wake up the ones which are due and see how much vtime we'll need for
1449 	 * the next one. As paying off debt restores hw_inuse, it must be read
1450 	 * after the above debt payment.
1451 	 */
1452 	ctx.vbudget = vbudget;
1453 	current_hweight(iocg, NULL, &ctx.hw_inuse);
1454 
1455 	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1456 
1457 	if (!waitqueue_active(&iocg->waitq)) {
1458 		if (iocg->wait_since) {
1459 			iocg->local_stat.wait_us += now->now - iocg->wait_since;
1460 			iocg->wait_since = 0;
1461 		}
1462 		return;
1463 	}
1464 
1465 	if (!iocg->wait_since)
1466 		iocg->wait_since = now->now;
1467 
1468 	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1469 		return;
1470 
1471 	/* determine next wakeup, add a timer margin to guarantee chunking */
1472 	vshortage = -ctx.vbudget;
1473 	expires = now->now_ns +
1474 		DIV64_U64_ROUND_UP(vshortage, ioc->vtime_base_rate) *
1475 		NSEC_PER_USEC;
1476 	expires += ioc->timer_slack_ns;
1477 
1478 	/* if already active and close enough, don't bother */
1479 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1480 	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1481 	    abs(oexpires - expires) <= ioc->timer_slack_ns)
1482 		return;
1483 
1484 	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1485 			       ioc->timer_slack_ns, HRTIMER_MODE_ABS);
1486 }
1487 
1488 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1489 {
1490 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1491 	bool pay_debt = READ_ONCE(iocg->abs_vdebt);
1492 	struct ioc_now now;
1493 	unsigned long flags;
1494 
1495 	ioc_now(iocg->ioc, &now);
1496 
1497 	iocg_lock(iocg, pay_debt, &flags);
1498 	iocg_kick_waitq(iocg, pay_debt, &now);
1499 	iocg_unlock(iocg, pay_debt, &flags);
1500 
1501 	return HRTIMER_NORESTART;
1502 }
1503 
1504 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1505 {
1506 	u32 nr_met[2] = { };
1507 	u32 nr_missed[2] = { };
1508 	u64 rq_wait_ns = 0;
1509 	int cpu, rw;
1510 
1511 	for_each_online_cpu(cpu) {
1512 		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1513 		u64 this_rq_wait_ns;
1514 
1515 		for (rw = READ; rw <= WRITE; rw++) {
1516 			u32 this_met = local_read(&stat->missed[rw].nr_met);
1517 			u32 this_missed = local_read(&stat->missed[rw].nr_missed);
1518 
1519 			nr_met[rw] += this_met - stat->missed[rw].last_met;
1520 			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1521 			stat->missed[rw].last_met = this_met;
1522 			stat->missed[rw].last_missed = this_missed;
1523 		}
1524 
1525 		this_rq_wait_ns = local64_read(&stat->rq_wait_ns);
1526 		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1527 		stat->last_rq_wait_ns = this_rq_wait_ns;
1528 	}
1529 
1530 	for (rw = READ; rw <= WRITE; rw++) {
1531 		if (nr_met[rw] + nr_missed[rw])
1532 			missed_ppm_ar[rw] =
1533 				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1534 						   nr_met[rw] + nr_missed[rw]);
1535 		else
1536 			missed_ppm_ar[rw] = 0;
1537 	}
1538 
1539 	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1540 				   ioc->period_us * NSEC_PER_USEC);
1541 }
1542 
1543 /* was iocg idle this period? */
1544 static bool iocg_is_idle(struct ioc_gq *iocg)
1545 {
1546 	struct ioc *ioc = iocg->ioc;
1547 
1548 	/* did something get issued this period? */
1549 	if (atomic64_read(&iocg->active_period) ==
1550 	    atomic64_read(&ioc->cur_period))
1551 		return false;
1552 
1553 	/* is something in flight? */
1554 	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1555 		return false;
1556 
1557 	return true;
1558 }
1559 
1560 /*
1561  * Call this function on the target leaf @iocg's to build pre-order traversal
1562  * list of all the ancestors in @inner_walk. The inner nodes are linked through
1563  * ->walk_list and the caller is responsible for dissolving the list after use.
1564  */
1565 static void iocg_build_inner_walk(struct ioc_gq *iocg,
1566 				  struct list_head *inner_walk)
1567 {
1568 	int lvl;
1569 
1570 	WARN_ON_ONCE(!list_empty(&iocg->walk_list));
1571 
1572 	/* find the first ancestor which hasn't been visited yet */
1573 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
1574 		if (!list_empty(&iocg->ancestors[lvl]->walk_list))
1575 			break;
1576 	}
1577 
1578 	/* walk down and visit the inner nodes to get pre-order traversal */
1579 	while (++lvl <= iocg->level - 1) {
1580 		struct ioc_gq *inner = iocg->ancestors[lvl];
1581 
1582 		/* record traversal order */
1583 		list_add_tail(&inner->walk_list, inner_walk);
1584 	}
1585 }
1586 
1587 /* collect per-cpu counters and propagate the deltas to the parent */
1588 static void iocg_flush_stat_one(struct ioc_gq *iocg, struct ioc_now *now)
1589 {
1590 	struct ioc *ioc = iocg->ioc;
1591 	struct iocg_stat new_stat;
1592 	u64 abs_vusage = 0;
1593 	u64 vusage_delta;
1594 	int cpu;
1595 
1596 	lockdep_assert_held(&iocg->ioc->lock);
1597 
1598 	/* collect per-cpu counters */
1599 	for_each_possible_cpu(cpu) {
1600 		abs_vusage += local64_read(
1601 				per_cpu_ptr(&iocg->pcpu_stat->abs_vusage, cpu));
1602 	}
1603 	vusage_delta = abs_vusage - iocg->last_stat_abs_vusage;
1604 	iocg->last_stat_abs_vusage = abs_vusage;
1605 
1606 	iocg->usage_delta_us = div64_u64(vusage_delta, ioc->vtime_base_rate);
1607 	iocg->local_stat.usage_us += iocg->usage_delta_us;
1608 
1609 	/* propagate upwards */
1610 	new_stat.usage_us =
1611 		iocg->local_stat.usage_us + iocg->desc_stat.usage_us;
1612 	new_stat.wait_us =
1613 		iocg->local_stat.wait_us + iocg->desc_stat.wait_us;
1614 	new_stat.indebt_us =
1615 		iocg->local_stat.indebt_us + iocg->desc_stat.indebt_us;
1616 	new_stat.indelay_us =
1617 		iocg->local_stat.indelay_us + iocg->desc_stat.indelay_us;
1618 
1619 	/* propagate the deltas to the parent */
1620 	if (iocg->level > 0) {
1621 		struct iocg_stat *parent_stat =
1622 			&iocg->ancestors[iocg->level - 1]->desc_stat;
1623 
1624 		parent_stat->usage_us +=
1625 			new_stat.usage_us - iocg->last_stat.usage_us;
1626 		parent_stat->wait_us +=
1627 			new_stat.wait_us - iocg->last_stat.wait_us;
1628 		parent_stat->indebt_us +=
1629 			new_stat.indebt_us - iocg->last_stat.indebt_us;
1630 		parent_stat->indelay_us +=
1631 			new_stat.indelay_us - iocg->last_stat.indelay_us;
1632 	}
1633 
1634 	iocg->last_stat = new_stat;
1635 }
1636 
1637 /* get stat counters ready for reading on all active iocgs */
1638 static void iocg_flush_stat(struct list_head *target_iocgs, struct ioc_now *now)
1639 {
1640 	LIST_HEAD(inner_walk);
1641 	struct ioc_gq *iocg, *tiocg;
1642 
1643 	/* flush leaves and build inner node walk list */
1644 	list_for_each_entry(iocg, target_iocgs, active_list) {
1645 		iocg_flush_stat_one(iocg, now);
1646 		iocg_build_inner_walk(iocg, &inner_walk);
1647 	}
1648 
1649 	/* keep flushing upwards by walking the inner list backwards */
1650 	list_for_each_entry_safe_reverse(iocg, tiocg, &inner_walk, walk_list) {
1651 		iocg_flush_stat_one(iocg, now);
1652 		list_del_init(&iocg->walk_list);
1653 	}
1654 }
1655 
1656 /*
1657  * Determine what @iocg's hweight_inuse should be after donating unused
1658  * capacity. @hwm is the upper bound and used to signal no donation. This
1659  * function also throws away @iocg's excess budget.
1660  */
1661 static u32 hweight_after_donation(struct ioc_gq *iocg, u32 old_hwi, u32 hwm,
1662 				  u32 usage, struct ioc_now *now)
1663 {
1664 	struct ioc *ioc = iocg->ioc;
1665 	u64 vtime = atomic64_read(&iocg->vtime);
1666 	s64 excess, delta, target, new_hwi;
1667 
1668 	/* debt handling owns inuse for debtors */
1669 	if (iocg->abs_vdebt)
1670 		return 1;
1671 
1672 	/* see whether minimum margin requirement is met */
1673 	if (waitqueue_active(&iocg->waitq) ||
1674 	    time_after64(vtime, now->vnow - ioc->margins.min))
1675 		return hwm;
1676 
1677 	/* throw away excess above target */
1678 	excess = now->vnow - vtime - ioc->margins.target;
1679 	if (excess > 0) {
1680 		atomic64_add(excess, &iocg->vtime);
1681 		atomic64_add(excess, &iocg->done_vtime);
1682 		vtime += excess;
1683 		ioc->vtime_err -= div64_u64(excess * old_hwi, WEIGHT_ONE);
1684 	}
1685 
1686 	/*
1687 	 * Let's say the distance between iocg's and device's vtimes as a
1688 	 * fraction of period duration is delta. Assuming that the iocg will
1689 	 * consume the usage determined above, we want to determine new_hwi so
1690 	 * that delta equals MARGIN_TARGET at the end of the next period.
1691 	 *
1692 	 * We need to execute usage worth of IOs while spending the sum of the
1693 	 * new budget (1 - MARGIN_TARGET) and the leftover from the last period
1694 	 * (delta):
1695 	 *
1696 	 *   usage = (1 - MARGIN_TARGET + delta) * new_hwi
1697 	 *
1698 	 * Therefore, the new_hwi is:
1699 	 *
1700 	 *   new_hwi = usage / (1 - MARGIN_TARGET + delta)
1701 	 */
1702 	delta = div64_s64(WEIGHT_ONE * (now->vnow - vtime),
1703 			  now->vnow - ioc->period_at_vtime);
1704 	target = WEIGHT_ONE * MARGIN_TARGET_PCT / 100;
1705 	new_hwi = div64_s64(WEIGHT_ONE * usage, WEIGHT_ONE - target + delta);
1706 
1707 	return clamp_t(s64, new_hwi, 1, hwm);
1708 }
1709 
1710 /*
1711  * For work-conservation, an iocg which isn't using all of its share should
1712  * donate the leftover to other iocgs. There are two ways to achieve this - 1.
1713  * bumping up vrate accordingly 2. lowering the donating iocg's inuse weight.
1714  *
1715  * #1 is mathematically simpler but has the drawback of requiring synchronous
1716  * global hweight_inuse updates when idle iocg's get activated or inuse weights
1717  * change due to donation snapbacks as it has the possibility of grossly
1718  * overshooting what's allowed by the model and vrate.
1719  *
1720  * #2 is inherently safe with local operations. The donating iocg can easily
1721  * snap back to higher weights when needed without worrying about impacts on
1722  * other nodes as the impacts will be inherently correct. This also makes idle
1723  * iocg activations safe. The only effect activations have is decreasing
1724  * hweight_inuse of others, the right solution to which is for those iocgs to
1725  * snap back to higher weights.
1726  *
1727  * So, we go with #2. The challenge is calculating how each donating iocg's
1728  * inuse should be adjusted to achieve the target donation amounts. This is done
1729  * using Andy's method described in the following pdf.
1730  *
1731  *   https://drive.google.com/file/d/1PsJwxPFtjUnwOY1QJ5AeICCcsL7BM3bo
1732  *
1733  * Given the weights and target after-donation hweight_inuse values, Andy's
1734  * method determines how the proportional distribution should look like at each
1735  * sibling level to maintain the relative relationship between all non-donating
1736  * pairs. To roughly summarize, it divides the tree into donating and
1737  * non-donating parts, calculates global donation rate which is used to
1738  * determine the target hweight_inuse for each node, and then derives per-level
1739  * proportions.
1740  *
1741  * The following pdf shows that global distribution calculated this way can be
1742  * achieved by scaling inuse weights of donating leaves and propagating the
1743  * adjustments upwards proportionally.
1744  *
1745  *   https://drive.google.com/file/d/1vONz1-fzVO7oY5DXXsLjSxEtYYQbOvsE
1746  *
1747  * Combining the above two, we can determine how each leaf iocg's inuse should
1748  * be adjusted to achieve the target donation.
1749  *
1750  *   https://drive.google.com/file/d/1WcrltBOSPN0qXVdBgnKm4mdp9FhuEFQN
1751  *
1752  * The inline comments use symbols from the last pdf.
1753  *
1754  *   b is the sum of the absolute budgets in the subtree. 1 for the root node.
1755  *   f is the sum of the absolute budgets of non-donating nodes in the subtree.
1756  *   t is the sum of the absolute budgets of donating nodes in the subtree.
1757  *   w is the weight of the node. w = w_f + w_t
1758  *   w_f is the non-donating portion of w. w_f = w * f / b
1759  *   w_b is the donating portion of w. w_t = w * t / b
1760  *   s is the sum of all sibling weights. s = Sum(w) for siblings
1761  *   s_f and s_t are the non-donating and donating portions of s.
1762  *
1763  * Subscript p denotes the parent's counterpart and ' the adjusted value - e.g.
1764  * w_pt is the donating portion of the parent's weight and w'_pt the same value
1765  * after adjustments. Subscript r denotes the root node's values.
1766  */
1767 static void transfer_surpluses(struct list_head *surpluses, struct ioc_now *now)
1768 {
1769 	LIST_HEAD(over_hwa);
1770 	LIST_HEAD(inner_walk);
1771 	struct ioc_gq *iocg, *tiocg, *root_iocg;
1772 	u32 after_sum, over_sum, over_target, gamma;
1773 
1774 	/*
1775 	 * It's pretty unlikely but possible for the total sum of
1776 	 * hweight_after_donation's to be higher than WEIGHT_ONE, which will
1777 	 * confuse the following calculations. If such condition is detected,
1778 	 * scale down everyone over its full share equally to keep the sum below
1779 	 * WEIGHT_ONE.
1780 	 */
1781 	after_sum = 0;
1782 	over_sum = 0;
1783 	list_for_each_entry(iocg, surpluses, surplus_list) {
1784 		u32 hwa;
1785 
1786 		current_hweight(iocg, &hwa, NULL);
1787 		after_sum += iocg->hweight_after_donation;
1788 
1789 		if (iocg->hweight_after_donation > hwa) {
1790 			over_sum += iocg->hweight_after_donation;
1791 			list_add(&iocg->walk_list, &over_hwa);
1792 		}
1793 	}
1794 
1795 	if (after_sum >= WEIGHT_ONE) {
1796 		/*
1797 		 * The delta should be deducted from the over_sum, calculate
1798 		 * target over_sum value.
1799 		 */
1800 		u32 over_delta = after_sum - (WEIGHT_ONE - 1);
1801 		WARN_ON_ONCE(over_sum <= over_delta);
1802 		over_target = over_sum - over_delta;
1803 	} else {
1804 		over_target = 0;
1805 	}
1806 
1807 	list_for_each_entry_safe(iocg, tiocg, &over_hwa, walk_list) {
1808 		if (over_target)
1809 			iocg->hweight_after_donation =
1810 				div_u64((u64)iocg->hweight_after_donation *
1811 					over_target, over_sum);
1812 		list_del_init(&iocg->walk_list);
1813 	}
1814 
1815 	/*
1816 	 * Build pre-order inner node walk list and prepare for donation
1817 	 * adjustment calculations.
1818 	 */
1819 	list_for_each_entry(iocg, surpluses, surplus_list) {
1820 		iocg_build_inner_walk(iocg, &inner_walk);
1821 	}
1822 
1823 	root_iocg = list_first_entry(&inner_walk, struct ioc_gq, walk_list);
1824 	WARN_ON_ONCE(root_iocg->level > 0);
1825 
1826 	list_for_each_entry(iocg, &inner_walk, walk_list) {
1827 		iocg->child_adjusted_sum = 0;
1828 		iocg->hweight_donating = 0;
1829 		iocg->hweight_after_donation = 0;
1830 	}
1831 
1832 	/*
1833 	 * Propagate the donating budget (b_t) and after donation budget (b'_t)
1834 	 * up the hierarchy.
1835 	 */
1836 	list_for_each_entry(iocg, surpluses, surplus_list) {
1837 		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1838 
1839 		parent->hweight_donating += iocg->hweight_donating;
1840 		parent->hweight_after_donation += iocg->hweight_after_donation;
1841 	}
1842 
1843 	list_for_each_entry_reverse(iocg, &inner_walk, walk_list) {
1844 		if (iocg->level > 0) {
1845 			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1846 
1847 			parent->hweight_donating += iocg->hweight_donating;
1848 			parent->hweight_after_donation += iocg->hweight_after_donation;
1849 		}
1850 	}
1851 
1852 	/*
1853 	 * Calculate inner hwa's (b) and make sure the donation values are
1854 	 * within the accepted ranges as we're doing low res calculations with
1855 	 * roundups.
1856 	 */
1857 	list_for_each_entry(iocg, &inner_walk, walk_list) {
1858 		if (iocg->level) {
1859 			struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1860 
1861 			iocg->hweight_active = DIV64_U64_ROUND_UP(
1862 				(u64)parent->hweight_active * iocg->active,
1863 				parent->child_active_sum);
1864 
1865 		}
1866 
1867 		iocg->hweight_donating = min(iocg->hweight_donating,
1868 					     iocg->hweight_active);
1869 		iocg->hweight_after_donation = min(iocg->hweight_after_donation,
1870 						   iocg->hweight_donating - 1);
1871 		if (WARN_ON_ONCE(iocg->hweight_active <= 1 ||
1872 				 iocg->hweight_donating <= 1 ||
1873 				 iocg->hweight_after_donation == 0)) {
1874 			pr_warn("iocg: invalid donation weights in ");
1875 			pr_cont_cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup);
1876 			pr_cont(": active=%u donating=%u after=%u\n",
1877 				iocg->hweight_active, iocg->hweight_donating,
1878 				iocg->hweight_after_donation);
1879 		}
1880 	}
1881 
1882 	/*
1883 	 * Calculate the global donation rate (gamma) - the rate to adjust
1884 	 * non-donating budgets by.
1885 	 *
1886 	 * No need to use 64bit multiplication here as the first operand is
1887 	 * guaranteed to be smaller than WEIGHT_ONE (1<<16).
1888 	 *
1889 	 * We know that there are beneficiary nodes and the sum of the donating
1890 	 * hweights can't be whole; however, due to the round-ups during hweight
1891 	 * calculations, root_iocg->hweight_donating might still end up equal to
1892 	 * or greater than whole. Limit the range when calculating the divider.
1893 	 *
1894 	 * gamma = (1 - t_r') / (1 - t_r)
1895 	 */
1896 	gamma = DIV_ROUND_UP(
1897 		(WEIGHT_ONE - root_iocg->hweight_after_donation) * WEIGHT_ONE,
1898 		WEIGHT_ONE - min_t(u32, root_iocg->hweight_donating, WEIGHT_ONE - 1));
1899 
1900 	/*
1901 	 * Calculate adjusted hwi, child_adjusted_sum and inuse for the inner
1902 	 * nodes.
1903 	 */
1904 	list_for_each_entry(iocg, &inner_walk, walk_list) {
1905 		struct ioc_gq *parent;
1906 		u32 inuse, wpt, wptp;
1907 		u64 st, sf;
1908 
1909 		if (iocg->level == 0) {
1910 			/* adjusted weight sum for 1st level: s' = s * b_pf / b'_pf */
1911 			iocg->child_adjusted_sum = DIV64_U64_ROUND_UP(
1912 				iocg->child_active_sum * (WEIGHT_ONE - iocg->hweight_donating),
1913 				WEIGHT_ONE - iocg->hweight_after_donation);
1914 			continue;
1915 		}
1916 
1917 		parent = iocg->ancestors[iocg->level - 1];
1918 
1919 		/* b' = gamma * b_f + b_t' */
1920 		iocg->hweight_inuse = DIV64_U64_ROUND_UP(
1921 			(u64)gamma * (iocg->hweight_active - iocg->hweight_donating),
1922 			WEIGHT_ONE) + iocg->hweight_after_donation;
1923 
1924 		/* w' = s' * b' / b'_p */
1925 		inuse = DIV64_U64_ROUND_UP(
1926 			(u64)parent->child_adjusted_sum * iocg->hweight_inuse,
1927 			parent->hweight_inuse);
1928 
1929 		/* adjusted weight sum for children: s' = s_f + s_t * w'_pt / w_pt */
1930 		st = DIV64_U64_ROUND_UP(
1931 			iocg->child_active_sum * iocg->hweight_donating,
1932 			iocg->hweight_active);
1933 		sf = iocg->child_active_sum - st;
1934 		wpt = DIV64_U64_ROUND_UP(
1935 			(u64)iocg->active * iocg->hweight_donating,
1936 			iocg->hweight_active);
1937 		wptp = DIV64_U64_ROUND_UP(
1938 			(u64)inuse * iocg->hweight_after_donation,
1939 			iocg->hweight_inuse);
1940 
1941 		iocg->child_adjusted_sum = sf + DIV64_U64_ROUND_UP(st * wptp, wpt);
1942 	}
1943 
1944 	/*
1945 	 * All inner nodes now have ->hweight_inuse and ->child_adjusted_sum and
1946 	 * we can finally determine leaf adjustments.
1947 	 */
1948 	list_for_each_entry(iocg, surpluses, surplus_list) {
1949 		struct ioc_gq *parent = iocg->ancestors[iocg->level - 1];
1950 		u32 inuse;
1951 
1952 		/*
1953 		 * In-debt iocgs participated in the donation calculation with
1954 		 * the minimum target hweight_inuse. Configuring inuse
1955 		 * accordingly would work fine but debt handling expects
1956 		 * @iocg->inuse stay at the minimum and we don't wanna
1957 		 * interfere.
1958 		 */
1959 		if (iocg->abs_vdebt) {
1960 			WARN_ON_ONCE(iocg->inuse > 1);
1961 			continue;
1962 		}
1963 
1964 		/* w' = s' * b' / b'_p, note that b' == b'_t for donating leaves */
1965 		inuse = DIV64_U64_ROUND_UP(
1966 			parent->child_adjusted_sum * iocg->hweight_after_donation,
1967 			parent->hweight_inuse);
1968 
1969 		TRACE_IOCG_PATH(inuse_transfer, iocg, now,
1970 				iocg->inuse, inuse,
1971 				iocg->hweight_inuse,
1972 				iocg->hweight_after_donation);
1973 
1974 		__propagate_weights(iocg, iocg->active, inuse, true, now);
1975 	}
1976 
1977 	/* walk list should be dissolved after use */
1978 	list_for_each_entry_safe(iocg, tiocg, &inner_walk, walk_list)
1979 		list_del_init(&iocg->walk_list);
1980 }
1981 
1982 /*
1983  * A low weight iocg can amass a large amount of debt, for example, when
1984  * anonymous memory gets reclaimed aggressively. If the system has a lot of
1985  * memory paired with a slow IO device, the debt can span multiple seconds or
1986  * more. If there are no other subsequent IO issuers, the in-debt iocg may end
1987  * up blocked paying its debt while the IO device is idle.
1988  *
1989  * The following protects against such cases. If the device has been
1990  * sufficiently idle for a while, the debts are halved and delays are
1991  * recalculated.
1992  */
1993 static void ioc_forgive_debts(struct ioc *ioc, u64 usage_us_sum, int nr_debtors,
1994 			      struct ioc_now *now)
1995 {
1996 	struct ioc_gq *iocg;
1997 	u64 dur, usage_pct, nr_cycles;
1998 
1999 	/* if no debtor, reset the cycle */
2000 	if (!nr_debtors) {
2001 		ioc->dfgv_period_at = now->now;
2002 		ioc->dfgv_period_rem = 0;
2003 		ioc->dfgv_usage_us_sum = 0;
2004 		return;
2005 	}
2006 
2007 	/*
2008 	 * Debtors can pass through a lot of writes choking the device and we
2009 	 * don't want to be forgiving debts while the device is struggling from
2010 	 * write bursts. If we're missing latency targets, consider the device
2011 	 * fully utilized.
2012 	 */
2013 	if (ioc->busy_level > 0)
2014 		usage_us_sum = max_t(u64, usage_us_sum, ioc->period_us);
2015 
2016 	ioc->dfgv_usage_us_sum += usage_us_sum;
2017 	if (time_before64(now->now, ioc->dfgv_period_at + DFGV_PERIOD))
2018 		return;
2019 
2020 	/*
2021 	 * At least DFGV_PERIOD has passed since the last period. Calculate the
2022 	 * average usage and reset the period counters.
2023 	 */
2024 	dur = now->now - ioc->dfgv_period_at;
2025 	usage_pct = div64_u64(100 * ioc->dfgv_usage_us_sum, dur);
2026 
2027 	ioc->dfgv_period_at = now->now;
2028 	ioc->dfgv_usage_us_sum = 0;
2029 
2030 	/* if was too busy, reset everything */
2031 	if (usage_pct > DFGV_USAGE_PCT) {
2032 		ioc->dfgv_period_rem = 0;
2033 		return;
2034 	}
2035 
2036 	/*
2037 	 * Usage is lower than threshold. Let's forgive some debts. Debt
2038 	 * forgiveness runs off of the usual ioc timer but its period usually
2039 	 * doesn't match ioc's. Compensate the difference by performing the
2040 	 * reduction as many times as would fit in the duration since the last
2041 	 * run and carrying over the left-over duration in @ioc->dfgv_period_rem
2042 	 * - if ioc period is 75% of DFGV_PERIOD, one out of three consecutive
2043 	 * reductions is doubled.
2044 	 */
2045 	nr_cycles = dur + ioc->dfgv_period_rem;
2046 	ioc->dfgv_period_rem = do_div(nr_cycles, DFGV_PERIOD);
2047 
2048 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2049 		u64 __maybe_unused old_debt, __maybe_unused old_delay;
2050 
2051 		if (!iocg->abs_vdebt && !iocg->delay)
2052 			continue;
2053 
2054 		spin_lock(&iocg->waitq.lock);
2055 
2056 		old_debt = iocg->abs_vdebt;
2057 		old_delay = iocg->delay;
2058 
2059 		if (iocg->abs_vdebt)
2060 			iocg->abs_vdebt = iocg->abs_vdebt >> nr_cycles ?: 1;
2061 		if (iocg->delay)
2062 			iocg->delay = iocg->delay >> nr_cycles ?: 1;
2063 
2064 		iocg_kick_waitq(iocg, true, now);
2065 
2066 		TRACE_IOCG_PATH(iocg_forgive_debt, iocg, now, usage_pct,
2067 				old_debt, iocg->abs_vdebt,
2068 				old_delay, iocg->delay);
2069 
2070 		spin_unlock(&iocg->waitq.lock);
2071 	}
2072 }
2073 
2074 static void ioc_timer_fn(struct timer_list *timer)
2075 {
2076 	struct ioc *ioc = container_of(timer, struct ioc, timer);
2077 	struct ioc_gq *iocg, *tiocg;
2078 	struct ioc_now now;
2079 	LIST_HEAD(surpluses);
2080 	int nr_debtors = 0, nr_shortages = 0, nr_lagging = 0;
2081 	u64 usage_us_sum = 0;
2082 	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
2083 	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
2084 	u32 missed_ppm[2], rq_wait_pct;
2085 	u64 period_vtime;
2086 	int prev_busy_level;
2087 
2088 	/* how were the latencies during the period? */
2089 	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
2090 
2091 	/* take care of active iocgs */
2092 	spin_lock_irq(&ioc->lock);
2093 
2094 	ioc_now(ioc, &now);
2095 
2096 	period_vtime = now.vnow - ioc->period_at_vtime;
2097 	if (WARN_ON_ONCE(!period_vtime)) {
2098 		spin_unlock_irq(&ioc->lock);
2099 		return;
2100 	}
2101 
2102 	/*
2103 	 * Waiters determine the sleep durations based on the vrate they
2104 	 * saw at the time of sleep.  If vrate has increased, some waiters
2105 	 * could be sleeping for too long.  Wake up tardy waiters which
2106 	 * should have woken up in the last period and expire idle iocgs.
2107 	 */
2108 	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
2109 		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2110 		    !iocg->delay && !iocg_is_idle(iocg))
2111 			continue;
2112 
2113 		spin_lock(&iocg->waitq.lock);
2114 
2115 		/* flush wait and indebt stat deltas */
2116 		if (iocg->wait_since) {
2117 			iocg->local_stat.wait_us += now.now - iocg->wait_since;
2118 			iocg->wait_since = now.now;
2119 		}
2120 		if (iocg->indebt_since) {
2121 			iocg->local_stat.indebt_us +=
2122 				now.now - iocg->indebt_since;
2123 			iocg->indebt_since = now.now;
2124 		}
2125 		if (iocg->indelay_since) {
2126 			iocg->local_stat.indelay_us +=
2127 				now.now - iocg->indelay_since;
2128 			iocg->indelay_since = now.now;
2129 		}
2130 
2131 		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt ||
2132 		    iocg->delay) {
2133 			/* might be oversleeping vtime / hweight changes, kick */
2134 			iocg_kick_waitq(iocg, true, &now);
2135 			if (iocg->abs_vdebt || iocg->delay)
2136 				nr_debtors++;
2137 		} else if (iocg_is_idle(iocg)) {
2138 			/* no waiter and idle, deactivate */
2139 			u64 vtime = atomic64_read(&iocg->vtime);
2140 			s64 excess;
2141 
2142 			/*
2143 			 * @iocg has been inactive for a full duration and will
2144 			 * have a high budget. Account anything above target as
2145 			 * error and throw away. On reactivation, it'll start
2146 			 * with the target budget.
2147 			 */
2148 			excess = now.vnow - vtime - ioc->margins.target;
2149 			if (excess > 0) {
2150 				u32 old_hwi;
2151 
2152 				current_hweight(iocg, NULL, &old_hwi);
2153 				ioc->vtime_err -= div64_u64(excess * old_hwi,
2154 							    WEIGHT_ONE);
2155 			}
2156 
2157 			__propagate_weights(iocg, 0, 0, false, &now);
2158 			list_del_init(&iocg->active_list);
2159 		}
2160 
2161 		spin_unlock(&iocg->waitq.lock);
2162 	}
2163 	commit_weights(ioc);
2164 
2165 	/*
2166 	 * Wait and indebt stat are flushed above and the donation calculation
2167 	 * below needs updated usage stat. Let's bring stat up-to-date.
2168 	 */
2169 	iocg_flush_stat(&ioc->active_iocgs, &now);
2170 
2171 	/* calc usage and see whether some weights need to be moved around */
2172 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
2173 		u64 vdone, vtime, usage_us, usage_dur;
2174 		u32 usage, hw_active, hw_inuse;
2175 
2176 		/*
2177 		 * Collect unused and wind vtime closer to vnow to prevent
2178 		 * iocgs from accumulating a large amount of budget.
2179 		 */
2180 		vdone = atomic64_read(&iocg->done_vtime);
2181 		vtime = atomic64_read(&iocg->vtime);
2182 		current_hweight(iocg, &hw_active, &hw_inuse);
2183 
2184 		/*
2185 		 * Latency QoS detection doesn't account for IOs which are
2186 		 * in-flight for longer than a period.  Detect them by
2187 		 * comparing vdone against period start.  If lagging behind
2188 		 * IOs from past periods, don't increase vrate.
2189 		 */
2190 		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
2191 		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
2192 		    time_after64(vtime, vdone) &&
2193 		    time_after64(vtime, now.vnow -
2194 				 MAX_LAGGING_PERIODS * period_vtime) &&
2195 		    time_before64(vdone, now.vnow - period_vtime))
2196 			nr_lagging++;
2197 
2198 		/*
2199 		 * Determine absolute usage factoring in in-flight IOs to avoid
2200 		 * high-latency completions appearing as idle.
2201 		 */
2202 		usage_us = iocg->usage_delta_us;
2203 		usage_us_sum += usage_us;
2204 
2205 		if (vdone != vtime) {
2206 			u64 inflight_us = DIV64_U64_ROUND_UP(
2207 				cost_to_abs_cost(vtime - vdone, hw_inuse),
2208 				ioc->vtime_base_rate);
2209 			usage_us = max(usage_us, inflight_us);
2210 		}
2211 
2212 		/* convert to hweight based usage ratio */
2213 		if (time_after64(iocg->activated_at, ioc->period_at))
2214 			usage_dur = max_t(u64, now.now - iocg->activated_at, 1);
2215 		else
2216 			usage_dur = max_t(u64, now.now - ioc->period_at, 1);
2217 
2218 		usage = clamp_t(u32,
2219 				DIV64_U64_ROUND_UP(usage_us * WEIGHT_ONE,
2220 						   usage_dur),
2221 				1, WEIGHT_ONE);
2222 
2223 		/* see whether there's surplus vtime */
2224 		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2225 		if (hw_inuse < hw_active ||
2226 		    (!waitqueue_active(&iocg->waitq) &&
2227 		     time_before64(vtime, now.vnow - ioc->margins.low))) {
2228 			u32 hwa, old_hwi, hwm, new_hwi;
2229 
2230 			/*
2231 			 * Already donating or accumulated enough to start.
2232 			 * Determine the donation amount.
2233 			 */
2234 			current_hweight(iocg, &hwa, &old_hwi);
2235 			hwm = current_hweight_max(iocg);
2236 			new_hwi = hweight_after_donation(iocg, old_hwi, hwm,
2237 							 usage, &now);
2238 			if (new_hwi < hwm) {
2239 				iocg->hweight_donating = hwa;
2240 				iocg->hweight_after_donation = new_hwi;
2241 				list_add(&iocg->surplus_list, &surpluses);
2242 			} else {
2243 				TRACE_IOCG_PATH(inuse_shortage, iocg, &now,
2244 						iocg->inuse, iocg->active,
2245 						iocg->hweight_inuse, new_hwi);
2246 
2247 				__propagate_weights(iocg, iocg->active,
2248 						    iocg->active, true, &now);
2249 				nr_shortages++;
2250 			}
2251 		} else {
2252 			/* genuinely short on vtime */
2253 			nr_shortages++;
2254 		}
2255 	}
2256 
2257 	if (!list_empty(&surpluses) && nr_shortages)
2258 		transfer_surpluses(&surpluses, &now);
2259 
2260 	commit_weights(ioc);
2261 
2262 	/* surplus list should be dissolved after use */
2263 	list_for_each_entry_safe(iocg, tiocg, &surpluses, surplus_list)
2264 		list_del_init(&iocg->surplus_list);
2265 
2266 	/*
2267 	 * If q is getting clogged or we're missing too much, we're issuing
2268 	 * too much IO and should lower vtime rate.  If we're not missing
2269 	 * and experiencing shortages but not surpluses, we're too stingy
2270 	 * and should increase vtime rate.
2271 	 */
2272 	prev_busy_level = ioc->busy_level;
2273 	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
2274 	    missed_ppm[READ] > ppm_rthr ||
2275 	    missed_ppm[WRITE] > ppm_wthr) {
2276 		/* clearly missing QoS targets, slow down vrate */
2277 		ioc->busy_level = max(ioc->busy_level, 0);
2278 		ioc->busy_level++;
2279 	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
2280 		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
2281 		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
2282 		/* QoS targets are being met with >25% margin */
2283 		if (nr_shortages) {
2284 			/*
2285 			 * We're throttling while the device has spare
2286 			 * capacity.  If vrate was being slowed down, stop.
2287 			 */
2288 			ioc->busy_level = min(ioc->busy_level, 0);
2289 
2290 			/*
2291 			 * If there are IOs spanning multiple periods, wait
2292 			 * them out before pushing the device harder.
2293 			 */
2294 			if (!nr_lagging)
2295 				ioc->busy_level--;
2296 		} else {
2297 			/*
2298 			 * Nobody is being throttled and the users aren't
2299 			 * issuing enough IOs to saturate the device.  We
2300 			 * simply don't know how close the device is to
2301 			 * saturation.  Coast.
2302 			 */
2303 			ioc->busy_level = 0;
2304 		}
2305 	} else {
2306 		/* inside the hysterisis margin, we're good */
2307 		ioc->busy_level = 0;
2308 	}
2309 
2310 	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
2311 
2312 	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
2313 		u64 vrate = ioc->vtime_base_rate;
2314 		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
2315 
2316 		/* rq_wait signal is always reliable, ignore user vrate_min */
2317 		if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
2318 			vrate_min = VRATE_MIN;
2319 
2320 		/*
2321 		 * If vrate is out of bounds, apply clamp gradually as the
2322 		 * bounds can change abruptly.  Otherwise, apply busy_level
2323 		 * based adjustment.
2324 		 */
2325 		if (vrate < vrate_min) {
2326 			vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
2327 					  100);
2328 			vrate = min(vrate, vrate_min);
2329 		} else if (vrate > vrate_max) {
2330 			vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
2331 					  100);
2332 			vrate = max(vrate, vrate_max);
2333 		} else {
2334 			int idx = min_t(int, abs(ioc->busy_level),
2335 					ARRAY_SIZE(vrate_adj_pct) - 1);
2336 			u32 adj_pct = vrate_adj_pct[idx];
2337 
2338 			if (ioc->busy_level > 0)
2339 				adj_pct = 100 - adj_pct;
2340 			else
2341 				adj_pct = 100 + adj_pct;
2342 
2343 			vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
2344 				      vrate_min, vrate_max);
2345 		}
2346 
2347 		trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
2348 					   nr_lagging, nr_shortages);
2349 
2350 		ioc->vtime_base_rate = vrate;
2351 		ioc_refresh_margins(ioc);
2352 	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
2353 		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
2354 					   missed_ppm, rq_wait_pct, nr_lagging,
2355 					   nr_shortages);
2356 	}
2357 
2358 	ioc_refresh_params(ioc, false);
2359 
2360 	ioc_forgive_debts(ioc, usage_us_sum, nr_debtors, &now);
2361 
2362 	/*
2363 	 * This period is done.  Move onto the next one.  If nothing's
2364 	 * going on with the device, stop the timer.
2365 	 */
2366 	atomic64_inc(&ioc->cur_period);
2367 
2368 	if (ioc->running != IOC_STOP) {
2369 		if (!list_empty(&ioc->active_iocgs)) {
2370 			ioc_start_period(ioc, &now);
2371 		} else {
2372 			ioc->busy_level = 0;
2373 			ioc->vtime_err = 0;
2374 			ioc->running = IOC_IDLE;
2375 		}
2376 
2377 		ioc_refresh_vrate(ioc, &now);
2378 	}
2379 
2380 	spin_unlock_irq(&ioc->lock);
2381 }
2382 
2383 static u64 adjust_inuse_and_calc_cost(struct ioc_gq *iocg, u64 vtime,
2384 				      u64 abs_cost, struct ioc_now *now)
2385 {
2386 	struct ioc *ioc = iocg->ioc;
2387 	struct ioc_margins *margins = &ioc->margins;
2388 	u32 __maybe_unused old_inuse = iocg->inuse, __maybe_unused old_hwi;
2389 	u32 hwi, adj_step;
2390 	s64 margin;
2391 	u64 cost, new_inuse;
2392 
2393 	current_hweight(iocg, NULL, &hwi);
2394 	old_hwi = hwi;
2395 	cost = abs_cost_to_cost(abs_cost, hwi);
2396 	margin = now->vnow - vtime - cost;
2397 
2398 	/* debt handling owns inuse for debtors */
2399 	if (iocg->abs_vdebt)
2400 		return cost;
2401 
2402 	/*
2403 	 * We only increase inuse during period and do so iff the margin has
2404 	 * deteriorated since the previous adjustment.
2405 	 */
2406 	if (margin >= iocg->saved_margin || margin >= margins->low ||
2407 	    iocg->inuse == iocg->active)
2408 		return cost;
2409 
2410 	spin_lock_irq(&ioc->lock);
2411 
2412 	/* we own inuse only when @iocg is in the normal active state */
2413 	if (iocg->abs_vdebt || list_empty(&iocg->active_list)) {
2414 		spin_unlock_irq(&ioc->lock);
2415 		return cost;
2416 	}
2417 
2418 	/*
2419 	 * Bump up inuse till @abs_cost fits in the existing budget.
2420 	 * adj_step must be determined after acquiring ioc->lock - we might
2421 	 * have raced and lost to another thread for activation and could
2422 	 * be reading 0 iocg->active before ioc->lock which will lead to
2423 	 * infinite loop.
2424 	 */
2425 	new_inuse = iocg->inuse;
2426 	adj_step = DIV_ROUND_UP(iocg->active * INUSE_ADJ_STEP_PCT, 100);
2427 	do {
2428 		new_inuse = new_inuse + adj_step;
2429 		propagate_weights(iocg, iocg->active, new_inuse, true, now);
2430 		current_hweight(iocg, NULL, &hwi);
2431 		cost = abs_cost_to_cost(abs_cost, hwi);
2432 	} while (time_after64(vtime + cost, now->vnow) &&
2433 		 iocg->inuse != iocg->active);
2434 
2435 	spin_unlock_irq(&ioc->lock);
2436 
2437 	TRACE_IOCG_PATH(inuse_adjust, iocg, now,
2438 			old_inuse, iocg->inuse, old_hwi, hwi);
2439 
2440 	return cost;
2441 }
2442 
2443 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
2444 				    bool is_merge, u64 *costp)
2445 {
2446 	struct ioc *ioc = iocg->ioc;
2447 	u64 coef_seqio, coef_randio, coef_page;
2448 	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
2449 	u64 seek_pages = 0;
2450 	u64 cost = 0;
2451 
2452 	switch (bio_op(bio)) {
2453 	case REQ_OP_READ:
2454 		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
2455 		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
2456 		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
2457 		break;
2458 	case REQ_OP_WRITE:
2459 		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
2460 		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
2461 		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
2462 		break;
2463 	default:
2464 		goto out;
2465 	}
2466 
2467 	if (iocg->cursor) {
2468 		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
2469 		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
2470 	}
2471 
2472 	if (!is_merge) {
2473 		if (seek_pages > LCOEF_RANDIO_PAGES) {
2474 			cost += coef_randio;
2475 		} else {
2476 			cost += coef_seqio;
2477 		}
2478 	}
2479 	cost += pages * coef_page;
2480 out:
2481 	*costp = cost;
2482 }
2483 
2484 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
2485 {
2486 	u64 cost;
2487 
2488 	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
2489 	return cost;
2490 }
2491 
2492 static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
2493 					 u64 *costp)
2494 {
2495 	unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
2496 
2497 	switch (req_op(rq)) {
2498 	case REQ_OP_READ:
2499 		*costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
2500 		break;
2501 	case REQ_OP_WRITE:
2502 		*costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
2503 		break;
2504 	default:
2505 		*costp = 0;
2506 	}
2507 }
2508 
2509 static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
2510 {
2511 	u64 cost;
2512 
2513 	calc_size_vtime_cost_builtin(rq, ioc, &cost);
2514 	return cost;
2515 }
2516 
2517 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
2518 {
2519 	struct blkcg_gq *blkg = bio->bi_blkg;
2520 	struct ioc *ioc = rqos_to_ioc(rqos);
2521 	struct ioc_gq *iocg = blkg_to_iocg(blkg);
2522 	struct ioc_now now;
2523 	struct iocg_wait wait;
2524 	u64 abs_cost, cost, vtime;
2525 	bool use_debt, ioc_locked;
2526 	unsigned long flags;
2527 
2528 	/* bypass IOs if disabled or for root cgroup */
2529 	if (!ioc->enabled || !iocg->level)
2530 		return;
2531 
2532 	/* calculate the absolute vtime cost */
2533 	abs_cost = calc_vtime_cost(bio, iocg, false);
2534 	if (!abs_cost)
2535 		return;
2536 
2537 	if (!iocg_activate(iocg, &now))
2538 		return;
2539 
2540 	iocg->cursor = bio_end_sector(bio);
2541 	vtime = atomic64_read(&iocg->vtime);
2542 	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2543 
2544 	/*
2545 	 * If no one's waiting and within budget, issue right away.  The
2546 	 * tests are racy but the races aren't systemic - we only miss once
2547 	 * in a while which is fine.
2548 	 */
2549 	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
2550 	    time_before_eq64(vtime + cost, now.vnow)) {
2551 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2552 		return;
2553 	}
2554 
2555 	/*
2556 	 * We're over budget. This can be handled in two ways. IOs which may
2557 	 * cause priority inversions are punted to @ioc->aux_iocg and charged as
2558 	 * debt. Otherwise, the issuer is blocked on @iocg->waitq. Debt handling
2559 	 * requires @ioc->lock, waitq handling @iocg->waitq.lock. Determine
2560 	 * whether debt handling is needed and acquire locks accordingly.
2561 	 */
2562 	use_debt = bio_issue_as_root_blkg(bio) || fatal_signal_pending(current);
2563 	ioc_locked = use_debt || READ_ONCE(iocg->abs_vdebt);
2564 retry_lock:
2565 	iocg_lock(iocg, ioc_locked, &flags);
2566 
2567 	/*
2568 	 * @iocg must stay activated for debt and waitq handling. Deactivation
2569 	 * is synchronized against both ioc->lock and waitq.lock and we won't
2570 	 * get deactivated as long as we're waiting or has debt, so we're good
2571 	 * if we're activated here. In the unlikely cases that we aren't, just
2572 	 * issue the IO.
2573 	 */
2574 	if (unlikely(list_empty(&iocg->active_list))) {
2575 		iocg_unlock(iocg, ioc_locked, &flags);
2576 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2577 		return;
2578 	}
2579 
2580 	/*
2581 	 * We're over budget. If @bio has to be issued regardless, remember
2582 	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
2583 	 * off the debt before waking more IOs.
2584 	 *
2585 	 * This way, the debt is continuously paid off each period with the
2586 	 * actual budget available to the cgroup. If we just wound vtime, we
2587 	 * would incorrectly use the current hw_inuse for the entire amount
2588 	 * which, for example, can lead to the cgroup staying blocked for a
2589 	 * long time even with substantially raised hw_inuse.
2590 	 *
2591 	 * An iocg with vdebt should stay online so that the timer can keep
2592 	 * deducting its vdebt and [de]activate use_delay mechanism
2593 	 * accordingly. We don't want to race against the timer trying to
2594 	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
2595 	 * penalizing the cgroup and its descendants.
2596 	 */
2597 	if (use_debt) {
2598 		iocg_incur_debt(iocg, abs_cost, &now);
2599 		if (iocg_kick_delay(iocg, &now))
2600 			blkcg_schedule_throttle(rqos->q,
2601 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2602 		iocg_unlock(iocg, ioc_locked, &flags);
2603 		return;
2604 	}
2605 
2606 	/* guarantee that iocgs w/ waiters have maximum inuse */
2607 	if (!iocg->abs_vdebt && iocg->inuse != iocg->active) {
2608 		if (!ioc_locked) {
2609 			iocg_unlock(iocg, false, &flags);
2610 			ioc_locked = true;
2611 			goto retry_lock;
2612 		}
2613 		propagate_weights(iocg, iocg->active, iocg->active, true,
2614 				  &now);
2615 	}
2616 
2617 	/*
2618 	 * Append self to the waitq and schedule the wakeup timer if we're
2619 	 * the first waiter.  The timer duration is calculated based on the
2620 	 * current vrate.  vtime and hweight changes can make it too short
2621 	 * or too long.  Each wait entry records the absolute cost it's
2622 	 * waiting for to allow re-evaluation using a custom wait entry.
2623 	 *
2624 	 * If too short, the timer simply reschedules itself.  If too long,
2625 	 * the period timer will notice and trigger wakeups.
2626 	 *
2627 	 * All waiters are on iocg->waitq and the wait states are
2628 	 * synchronized using waitq.lock.
2629 	 */
2630 	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
2631 	wait.wait.private = current;
2632 	wait.bio = bio;
2633 	wait.abs_cost = abs_cost;
2634 	wait.committed = false;	/* will be set true by waker */
2635 
2636 	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
2637 	iocg_kick_waitq(iocg, ioc_locked, &now);
2638 
2639 	iocg_unlock(iocg, ioc_locked, &flags);
2640 
2641 	while (true) {
2642 		set_current_state(TASK_UNINTERRUPTIBLE);
2643 		if (wait.committed)
2644 			break;
2645 		io_schedule();
2646 	}
2647 
2648 	/* waker already committed us, proceed */
2649 	finish_wait(&iocg->waitq, &wait.wait);
2650 }
2651 
2652 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
2653 			   struct bio *bio)
2654 {
2655 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2656 	struct ioc *ioc = iocg->ioc;
2657 	sector_t bio_end = bio_end_sector(bio);
2658 	struct ioc_now now;
2659 	u64 vtime, abs_cost, cost;
2660 	unsigned long flags;
2661 
2662 	/* bypass if disabled or for root cgroup */
2663 	if (!ioc->enabled || !iocg->level)
2664 		return;
2665 
2666 	abs_cost = calc_vtime_cost(bio, iocg, true);
2667 	if (!abs_cost)
2668 		return;
2669 
2670 	ioc_now(ioc, &now);
2671 
2672 	vtime = atomic64_read(&iocg->vtime);
2673 	cost = adjust_inuse_and_calc_cost(iocg, vtime, abs_cost, &now);
2674 
2675 	/* update cursor if backmerging into the request at the cursor */
2676 	if (blk_rq_pos(rq) < bio_end &&
2677 	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
2678 		iocg->cursor = bio_end;
2679 
2680 	/*
2681 	 * Charge if there's enough vtime budget and the existing request has
2682 	 * cost assigned.
2683 	 */
2684 	if (rq->bio && rq->bio->bi_iocost_cost &&
2685 	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
2686 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2687 		return;
2688 	}
2689 
2690 	/*
2691 	 * Otherwise, account it as debt if @iocg is online, which it should
2692 	 * be for the vast majority of cases. See debt handling in
2693 	 * ioc_rqos_throttle() for details.
2694 	 */
2695 	spin_lock_irqsave(&ioc->lock, flags);
2696 	spin_lock(&iocg->waitq.lock);
2697 
2698 	if (likely(!list_empty(&iocg->active_list))) {
2699 		iocg_incur_debt(iocg, abs_cost, &now);
2700 		if (iocg_kick_delay(iocg, &now))
2701 			blkcg_schedule_throttle(rqos->q,
2702 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
2703 	} else {
2704 		iocg_commit_bio(iocg, bio, abs_cost, cost);
2705 	}
2706 
2707 	spin_unlock(&iocg->waitq.lock);
2708 	spin_unlock_irqrestore(&ioc->lock, flags);
2709 }
2710 
2711 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
2712 {
2713 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
2714 
2715 	if (iocg && bio->bi_iocost_cost)
2716 		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
2717 }
2718 
2719 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
2720 {
2721 	struct ioc *ioc = rqos_to_ioc(rqos);
2722 	struct ioc_pcpu_stat *ccs;
2723 	u64 on_q_ns, rq_wait_ns, size_nsec;
2724 	int pidx, rw;
2725 
2726 	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
2727 		return;
2728 
2729 	switch (req_op(rq) & REQ_OP_MASK) {
2730 	case REQ_OP_READ:
2731 		pidx = QOS_RLAT;
2732 		rw = READ;
2733 		break;
2734 	case REQ_OP_WRITE:
2735 		pidx = QOS_WLAT;
2736 		rw = WRITE;
2737 		break;
2738 	default:
2739 		return;
2740 	}
2741 
2742 	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
2743 	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
2744 	size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
2745 
2746 	ccs = get_cpu_ptr(ioc->pcpu_stat);
2747 
2748 	if (on_q_ns <= size_nsec ||
2749 	    on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
2750 		local_inc(&ccs->missed[rw].nr_met);
2751 	else
2752 		local_inc(&ccs->missed[rw].nr_missed);
2753 
2754 	local64_add(rq_wait_ns, &ccs->rq_wait_ns);
2755 
2756 	put_cpu_ptr(ccs);
2757 }
2758 
2759 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
2760 {
2761 	struct ioc *ioc = rqos_to_ioc(rqos);
2762 
2763 	spin_lock_irq(&ioc->lock);
2764 	ioc_refresh_params(ioc, false);
2765 	spin_unlock_irq(&ioc->lock);
2766 }
2767 
2768 static void ioc_rqos_exit(struct rq_qos *rqos)
2769 {
2770 	struct ioc *ioc = rqos_to_ioc(rqos);
2771 
2772 	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
2773 
2774 	spin_lock_irq(&ioc->lock);
2775 	ioc->running = IOC_STOP;
2776 	spin_unlock_irq(&ioc->lock);
2777 
2778 	del_timer_sync(&ioc->timer);
2779 	free_percpu(ioc->pcpu_stat);
2780 	kfree(ioc);
2781 }
2782 
2783 static struct rq_qos_ops ioc_rqos_ops = {
2784 	.throttle = ioc_rqos_throttle,
2785 	.merge = ioc_rqos_merge,
2786 	.done_bio = ioc_rqos_done_bio,
2787 	.done = ioc_rqos_done,
2788 	.queue_depth_changed = ioc_rqos_queue_depth_changed,
2789 	.exit = ioc_rqos_exit,
2790 };
2791 
2792 static int blk_iocost_init(struct request_queue *q)
2793 {
2794 	struct ioc *ioc;
2795 	struct rq_qos *rqos;
2796 	int i, cpu, ret;
2797 
2798 	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
2799 	if (!ioc)
2800 		return -ENOMEM;
2801 
2802 	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
2803 	if (!ioc->pcpu_stat) {
2804 		kfree(ioc);
2805 		return -ENOMEM;
2806 	}
2807 
2808 	for_each_possible_cpu(cpu) {
2809 		struct ioc_pcpu_stat *ccs = per_cpu_ptr(ioc->pcpu_stat, cpu);
2810 
2811 		for (i = 0; i < ARRAY_SIZE(ccs->missed); i++) {
2812 			local_set(&ccs->missed[i].nr_met, 0);
2813 			local_set(&ccs->missed[i].nr_missed, 0);
2814 		}
2815 		local64_set(&ccs->rq_wait_ns, 0);
2816 	}
2817 
2818 	rqos = &ioc->rqos;
2819 	rqos->id = RQ_QOS_COST;
2820 	rqos->ops = &ioc_rqos_ops;
2821 	rqos->q = q;
2822 
2823 	spin_lock_init(&ioc->lock);
2824 	timer_setup(&ioc->timer, ioc_timer_fn, 0);
2825 	INIT_LIST_HEAD(&ioc->active_iocgs);
2826 
2827 	ioc->running = IOC_IDLE;
2828 	ioc->vtime_base_rate = VTIME_PER_USEC;
2829 	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2830 	seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2831 	ioc->period_at = ktime_to_us(ktime_get());
2832 	atomic64_set(&ioc->cur_period, 0);
2833 	atomic_set(&ioc->hweight_gen, 0);
2834 
2835 	spin_lock_irq(&ioc->lock);
2836 	ioc->autop_idx = AUTOP_INVALID;
2837 	ioc_refresh_params(ioc, true);
2838 	spin_unlock_irq(&ioc->lock);
2839 
2840 	rq_qos_add(q, rqos);
2841 	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2842 	if (ret) {
2843 		rq_qos_del(q, rqos);
2844 		free_percpu(ioc->pcpu_stat);
2845 		kfree(ioc);
2846 		return ret;
2847 	}
2848 	return 0;
2849 }
2850 
2851 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2852 {
2853 	struct ioc_cgrp *iocc;
2854 
2855 	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2856 	if (!iocc)
2857 		return NULL;
2858 
2859 	iocc->dfl_weight = CGROUP_WEIGHT_DFL * WEIGHT_ONE;
2860 	return &iocc->cpd;
2861 }
2862 
2863 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2864 {
2865 	kfree(container_of(cpd, struct ioc_cgrp, cpd));
2866 }
2867 
2868 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2869 					     struct blkcg *blkcg)
2870 {
2871 	int levels = blkcg->css.cgroup->level + 1;
2872 	struct ioc_gq *iocg;
2873 
2874 	iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2875 	if (!iocg)
2876 		return NULL;
2877 
2878 	iocg->pcpu_stat = alloc_percpu_gfp(struct iocg_pcpu_stat, gfp);
2879 	if (!iocg->pcpu_stat) {
2880 		kfree(iocg);
2881 		return NULL;
2882 	}
2883 
2884 	return &iocg->pd;
2885 }
2886 
2887 static void ioc_pd_init(struct blkg_policy_data *pd)
2888 {
2889 	struct ioc_gq *iocg = pd_to_iocg(pd);
2890 	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2891 	struct ioc *ioc = q_to_ioc(blkg->q);
2892 	struct ioc_now now;
2893 	struct blkcg_gq *tblkg;
2894 	unsigned long flags;
2895 
2896 	ioc_now(ioc, &now);
2897 
2898 	iocg->ioc = ioc;
2899 	atomic64_set(&iocg->vtime, now.vnow);
2900 	atomic64_set(&iocg->done_vtime, now.vnow);
2901 	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2902 	INIT_LIST_HEAD(&iocg->active_list);
2903 	INIT_LIST_HEAD(&iocg->walk_list);
2904 	INIT_LIST_HEAD(&iocg->surplus_list);
2905 	iocg->hweight_active = WEIGHT_ONE;
2906 	iocg->hweight_inuse = WEIGHT_ONE;
2907 
2908 	init_waitqueue_head(&iocg->waitq);
2909 	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2910 	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2911 
2912 	iocg->level = blkg->blkcg->css.cgroup->level;
2913 
2914 	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2915 		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2916 		iocg->ancestors[tiocg->level] = tiocg;
2917 	}
2918 
2919 	spin_lock_irqsave(&ioc->lock, flags);
2920 	weight_updated(iocg, &now);
2921 	spin_unlock_irqrestore(&ioc->lock, flags);
2922 }
2923 
2924 static void ioc_pd_free(struct blkg_policy_data *pd)
2925 {
2926 	struct ioc_gq *iocg = pd_to_iocg(pd);
2927 	struct ioc *ioc = iocg->ioc;
2928 	unsigned long flags;
2929 
2930 	if (ioc) {
2931 		spin_lock_irqsave(&ioc->lock, flags);
2932 
2933 		if (!list_empty(&iocg->active_list)) {
2934 			struct ioc_now now;
2935 
2936 			ioc_now(ioc, &now);
2937 			propagate_weights(iocg, 0, 0, false, &now);
2938 			list_del_init(&iocg->active_list);
2939 		}
2940 
2941 		WARN_ON_ONCE(!list_empty(&iocg->walk_list));
2942 		WARN_ON_ONCE(!list_empty(&iocg->surplus_list));
2943 
2944 		spin_unlock_irqrestore(&ioc->lock, flags);
2945 
2946 		hrtimer_cancel(&iocg->waitq_timer);
2947 	}
2948 	free_percpu(iocg->pcpu_stat);
2949 	kfree(iocg);
2950 }
2951 
2952 static size_t ioc_pd_stat(struct blkg_policy_data *pd, char *buf, size_t size)
2953 {
2954 	struct ioc_gq *iocg = pd_to_iocg(pd);
2955 	struct ioc *ioc = iocg->ioc;
2956 	size_t pos = 0;
2957 
2958 	if (!ioc->enabled)
2959 		return 0;
2960 
2961 	if (iocg->level == 0) {
2962 		unsigned vp10k = DIV64_U64_ROUND_CLOSEST(
2963 			ioc->vtime_base_rate * 10000,
2964 			VTIME_PER_USEC);
2965 		pos += scnprintf(buf + pos, size - pos, " cost.vrate=%u.%02u",
2966 				  vp10k / 100, vp10k % 100);
2967 	}
2968 
2969 	pos += scnprintf(buf + pos, size - pos, " cost.usage=%llu",
2970 			 iocg->last_stat.usage_us);
2971 
2972 	if (blkcg_debug_stats)
2973 		pos += scnprintf(buf + pos, size - pos,
2974 				 " cost.wait=%llu cost.indebt=%llu cost.indelay=%llu",
2975 				 iocg->last_stat.wait_us,
2976 				 iocg->last_stat.indebt_us,
2977 				 iocg->last_stat.indelay_us);
2978 
2979 	return pos;
2980 }
2981 
2982 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2983 			     int off)
2984 {
2985 	const char *dname = blkg_dev_name(pd->blkg);
2986 	struct ioc_gq *iocg = pd_to_iocg(pd);
2987 
2988 	if (dname && iocg->cfg_weight)
2989 		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight / WEIGHT_ONE);
2990 	return 0;
2991 }
2992 
2993 
2994 static int ioc_weight_show(struct seq_file *sf, void *v)
2995 {
2996 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2997 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2998 
2999 	seq_printf(sf, "default %u\n", iocc->dfl_weight / WEIGHT_ONE);
3000 	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
3001 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3002 	return 0;
3003 }
3004 
3005 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
3006 				size_t nbytes, loff_t off)
3007 {
3008 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
3009 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
3010 	struct blkg_conf_ctx ctx;
3011 	struct ioc_now now;
3012 	struct ioc_gq *iocg;
3013 	u32 v;
3014 	int ret;
3015 
3016 	if (!strchr(buf, ':')) {
3017 		struct blkcg_gq *blkg;
3018 
3019 		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
3020 			return -EINVAL;
3021 
3022 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3023 			return -EINVAL;
3024 
3025 		spin_lock(&blkcg->lock);
3026 		iocc->dfl_weight = v * WEIGHT_ONE;
3027 		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
3028 			struct ioc_gq *iocg = blkg_to_iocg(blkg);
3029 
3030 			if (iocg) {
3031 				spin_lock_irq(&iocg->ioc->lock);
3032 				ioc_now(iocg->ioc, &now);
3033 				weight_updated(iocg, &now);
3034 				spin_unlock_irq(&iocg->ioc->lock);
3035 			}
3036 		}
3037 		spin_unlock(&blkcg->lock);
3038 
3039 		return nbytes;
3040 	}
3041 
3042 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
3043 	if (ret)
3044 		return ret;
3045 
3046 	iocg = blkg_to_iocg(ctx.blkg);
3047 
3048 	if (!strncmp(ctx.body, "default", 7)) {
3049 		v = 0;
3050 	} else {
3051 		if (!sscanf(ctx.body, "%u", &v))
3052 			goto einval;
3053 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
3054 			goto einval;
3055 	}
3056 
3057 	spin_lock(&iocg->ioc->lock);
3058 	iocg->cfg_weight = v * WEIGHT_ONE;
3059 	ioc_now(iocg->ioc, &now);
3060 	weight_updated(iocg, &now);
3061 	spin_unlock(&iocg->ioc->lock);
3062 
3063 	blkg_conf_finish(&ctx);
3064 	return nbytes;
3065 
3066 einval:
3067 	blkg_conf_finish(&ctx);
3068 	return -EINVAL;
3069 }
3070 
3071 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
3072 			  int off)
3073 {
3074 	const char *dname = blkg_dev_name(pd->blkg);
3075 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
3076 
3077 	if (!dname)
3078 		return 0;
3079 
3080 	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
3081 		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
3082 		   ioc->params.qos[QOS_RPPM] / 10000,
3083 		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
3084 		   ioc->params.qos[QOS_RLAT],
3085 		   ioc->params.qos[QOS_WPPM] / 10000,
3086 		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
3087 		   ioc->params.qos[QOS_WLAT],
3088 		   ioc->params.qos[QOS_MIN] / 10000,
3089 		   ioc->params.qos[QOS_MIN] % 10000 / 100,
3090 		   ioc->params.qos[QOS_MAX] / 10000,
3091 		   ioc->params.qos[QOS_MAX] % 10000 / 100);
3092 	return 0;
3093 }
3094 
3095 static int ioc_qos_show(struct seq_file *sf, void *v)
3096 {
3097 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3098 
3099 	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
3100 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3101 	return 0;
3102 }
3103 
3104 static const match_table_t qos_ctrl_tokens = {
3105 	{ QOS_ENABLE,		"enable=%u"	},
3106 	{ QOS_CTRL,		"ctrl=%s"	},
3107 	{ NR_QOS_CTRL_PARAMS,	NULL		},
3108 };
3109 
3110 static const match_table_t qos_tokens = {
3111 	{ QOS_RPPM,		"rpct=%s"	},
3112 	{ QOS_RLAT,		"rlat=%u"	},
3113 	{ QOS_WPPM,		"wpct=%s"	},
3114 	{ QOS_WLAT,		"wlat=%u"	},
3115 	{ QOS_MIN,		"min=%s"	},
3116 	{ QOS_MAX,		"max=%s"	},
3117 	{ NR_QOS_PARAMS,	NULL		},
3118 };
3119 
3120 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
3121 			     size_t nbytes, loff_t off)
3122 {
3123 	struct gendisk *disk;
3124 	struct ioc *ioc;
3125 	u32 qos[NR_QOS_PARAMS];
3126 	bool enable, user;
3127 	char *p;
3128 	int ret;
3129 
3130 	disk = blkcg_conf_get_disk(&input);
3131 	if (IS_ERR(disk))
3132 		return PTR_ERR(disk);
3133 
3134 	ioc = q_to_ioc(disk->queue);
3135 	if (!ioc) {
3136 		ret = blk_iocost_init(disk->queue);
3137 		if (ret)
3138 			goto err;
3139 		ioc = q_to_ioc(disk->queue);
3140 	}
3141 
3142 	spin_lock_irq(&ioc->lock);
3143 	memcpy(qos, ioc->params.qos, sizeof(qos));
3144 	enable = ioc->enabled;
3145 	user = ioc->user_qos_params;
3146 	spin_unlock_irq(&ioc->lock);
3147 
3148 	while ((p = strsep(&input, " \t\n"))) {
3149 		substring_t args[MAX_OPT_ARGS];
3150 		char buf[32];
3151 		int tok;
3152 		s64 v;
3153 
3154 		if (!*p)
3155 			continue;
3156 
3157 		switch (match_token(p, qos_ctrl_tokens, args)) {
3158 		case QOS_ENABLE:
3159 			match_u64(&args[0], &v);
3160 			enable = v;
3161 			continue;
3162 		case QOS_CTRL:
3163 			match_strlcpy(buf, &args[0], sizeof(buf));
3164 			if (!strcmp(buf, "auto"))
3165 				user = false;
3166 			else if (!strcmp(buf, "user"))
3167 				user = true;
3168 			else
3169 				goto einval;
3170 			continue;
3171 		}
3172 
3173 		tok = match_token(p, qos_tokens, args);
3174 		switch (tok) {
3175 		case QOS_RPPM:
3176 		case QOS_WPPM:
3177 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3178 			    sizeof(buf))
3179 				goto einval;
3180 			if (cgroup_parse_float(buf, 2, &v))
3181 				goto einval;
3182 			if (v < 0 || v > 10000)
3183 				goto einval;
3184 			qos[tok] = v * 100;
3185 			break;
3186 		case QOS_RLAT:
3187 		case QOS_WLAT:
3188 			if (match_u64(&args[0], &v))
3189 				goto einval;
3190 			qos[tok] = v;
3191 			break;
3192 		case QOS_MIN:
3193 		case QOS_MAX:
3194 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
3195 			    sizeof(buf))
3196 				goto einval;
3197 			if (cgroup_parse_float(buf, 2, &v))
3198 				goto einval;
3199 			if (v < 0)
3200 				goto einval;
3201 			qos[tok] = clamp_t(s64, v * 100,
3202 					   VRATE_MIN_PPM, VRATE_MAX_PPM);
3203 			break;
3204 		default:
3205 			goto einval;
3206 		}
3207 		user = true;
3208 	}
3209 
3210 	if (qos[QOS_MIN] > qos[QOS_MAX])
3211 		goto einval;
3212 
3213 	spin_lock_irq(&ioc->lock);
3214 
3215 	if (enable) {
3216 		blk_stat_enable_accounting(ioc->rqos.q);
3217 		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3218 		ioc->enabled = true;
3219 	} else {
3220 		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
3221 		ioc->enabled = false;
3222 	}
3223 
3224 	if (user) {
3225 		memcpy(ioc->params.qos, qos, sizeof(qos));
3226 		ioc->user_qos_params = true;
3227 	} else {
3228 		ioc->user_qos_params = false;
3229 	}
3230 
3231 	ioc_refresh_params(ioc, true);
3232 	spin_unlock_irq(&ioc->lock);
3233 
3234 	put_disk_and_module(disk);
3235 	return nbytes;
3236 einval:
3237 	ret = -EINVAL;
3238 err:
3239 	put_disk_and_module(disk);
3240 	return ret;
3241 }
3242 
3243 static u64 ioc_cost_model_prfill(struct seq_file *sf,
3244 				 struct blkg_policy_data *pd, int off)
3245 {
3246 	const char *dname = blkg_dev_name(pd->blkg);
3247 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
3248 	u64 *u = ioc->params.i_lcoefs;
3249 
3250 	if (!dname)
3251 		return 0;
3252 
3253 	seq_printf(sf, "%s ctrl=%s model=linear "
3254 		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
3255 		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
3256 		   dname, ioc->user_cost_model ? "user" : "auto",
3257 		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
3258 		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
3259 	return 0;
3260 }
3261 
3262 static int ioc_cost_model_show(struct seq_file *sf, void *v)
3263 {
3264 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
3265 
3266 	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
3267 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
3268 	return 0;
3269 }
3270 
3271 static const match_table_t cost_ctrl_tokens = {
3272 	{ COST_CTRL,		"ctrl=%s"	},
3273 	{ COST_MODEL,		"model=%s"	},
3274 	{ NR_COST_CTRL_PARAMS,	NULL		},
3275 };
3276 
3277 static const match_table_t i_lcoef_tokens = {
3278 	{ I_LCOEF_RBPS,		"rbps=%u"	},
3279 	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
3280 	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
3281 	{ I_LCOEF_WBPS,		"wbps=%u"	},
3282 	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
3283 	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
3284 	{ NR_I_LCOEFS,		NULL		},
3285 };
3286 
3287 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
3288 				    size_t nbytes, loff_t off)
3289 {
3290 	struct gendisk *disk;
3291 	struct ioc *ioc;
3292 	u64 u[NR_I_LCOEFS];
3293 	bool user;
3294 	char *p;
3295 	int ret;
3296 
3297 	disk = blkcg_conf_get_disk(&input);
3298 	if (IS_ERR(disk))
3299 		return PTR_ERR(disk);
3300 
3301 	ioc = q_to_ioc(disk->queue);
3302 	if (!ioc) {
3303 		ret = blk_iocost_init(disk->queue);
3304 		if (ret)
3305 			goto err;
3306 		ioc = q_to_ioc(disk->queue);
3307 	}
3308 
3309 	spin_lock_irq(&ioc->lock);
3310 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
3311 	user = ioc->user_cost_model;
3312 	spin_unlock_irq(&ioc->lock);
3313 
3314 	while ((p = strsep(&input, " \t\n"))) {
3315 		substring_t args[MAX_OPT_ARGS];
3316 		char buf[32];
3317 		int tok;
3318 		u64 v;
3319 
3320 		if (!*p)
3321 			continue;
3322 
3323 		switch (match_token(p, cost_ctrl_tokens, args)) {
3324 		case COST_CTRL:
3325 			match_strlcpy(buf, &args[0], sizeof(buf));
3326 			if (!strcmp(buf, "auto"))
3327 				user = false;
3328 			else if (!strcmp(buf, "user"))
3329 				user = true;
3330 			else
3331 				goto einval;
3332 			continue;
3333 		case COST_MODEL:
3334 			match_strlcpy(buf, &args[0], sizeof(buf));
3335 			if (strcmp(buf, "linear"))
3336 				goto einval;
3337 			continue;
3338 		}
3339 
3340 		tok = match_token(p, i_lcoef_tokens, args);
3341 		if (tok == NR_I_LCOEFS)
3342 			goto einval;
3343 		if (match_u64(&args[0], &v))
3344 			goto einval;
3345 		u[tok] = v;
3346 		user = true;
3347 	}
3348 
3349 	spin_lock_irq(&ioc->lock);
3350 	if (user) {
3351 		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
3352 		ioc->user_cost_model = true;
3353 	} else {
3354 		ioc->user_cost_model = false;
3355 	}
3356 	ioc_refresh_params(ioc, true);
3357 	spin_unlock_irq(&ioc->lock);
3358 
3359 	put_disk_and_module(disk);
3360 	return nbytes;
3361 
3362 einval:
3363 	ret = -EINVAL;
3364 err:
3365 	put_disk_and_module(disk);
3366 	return ret;
3367 }
3368 
3369 static struct cftype ioc_files[] = {
3370 	{
3371 		.name = "weight",
3372 		.flags = CFTYPE_NOT_ON_ROOT,
3373 		.seq_show = ioc_weight_show,
3374 		.write = ioc_weight_write,
3375 	},
3376 	{
3377 		.name = "cost.qos",
3378 		.flags = CFTYPE_ONLY_ON_ROOT,
3379 		.seq_show = ioc_qos_show,
3380 		.write = ioc_qos_write,
3381 	},
3382 	{
3383 		.name = "cost.model",
3384 		.flags = CFTYPE_ONLY_ON_ROOT,
3385 		.seq_show = ioc_cost_model_show,
3386 		.write = ioc_cost_model_write,
3387 	},
3388 	{}
3389 };
3390 
3391 static struct blkcg_policy blkcg_policy_iocost = {
3392 	.dfl_cftypes	= ioc_files,
3393 	.cpd_alloc_fn	= ioc_cpd_alloc,
3394 	.cpd_free_fn	= ioc_cpd_free,
3395 	.pd_alloc_fn	= ioc_pd_alloc,
3396 	.pd_init_fn	= ioc_pd_init,
3397 	.pd_free_fn	= ioc_pd_free,
3398 	.pd_stat_fn	= ioc_pd_stat,
3399 };
3400 
3401 static int __init ioc_init(void)
3402 {
3403 	return blkcg_policy_register(&blkcg_policy_iocost);
3404 }
3405 
3406 static void __exit ioc_exit(void)
3407 {
3408 	blkcg_policy_unregister(&blkcg_policy_iocost);
3409 }
3410 
3411 module_init(ioc_init);
3412 module_exit(ioc_exit);
3413