xref: /openbmc/linux/block/blk-iocost.c (revision 3503d56c)
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * IO cost model based controller.
4  *
5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
7  * Copyright (C) 2019 Facebook
8  *
9  * One challenge of controlling IO resources is the lack of trivially
10  * observable cost metric.  This is distinguished from CPU and memory where
11  * wallclock time and the number of bytes can serve as accurate enough
12  * approximations.
13  *
14  * Bandwidth and iops are the most commonly used metrics for IO devices but
15  * depending on the type and specifics of the device, different IO patterns
16  * easily lead to multiple orders of magnitude variations rendering them
17  * useless for the purpose of IO capacity distribution.  While on-device
18  * time, with a lot of clutches, could serve as a useful approximation for
19  * non-queued rotational devices, this is no longer viable with modern
20  * devices, even the rotational ones.
21  *
22  * While there is no cost metric we can trivially observe, it isn't a
23  * complete mystery.  For example, on a rotational device, seek cost
24  * dominates while a contiguous transfer contributes a smaller amount
25  * proportional to the size.  If we can characterize at least the relative
26  * costs of these different types of IOs, it should be possible to
27  * implement a reasonable work-conserving proportional IO resource
28  * distribution.
29  *
30  * 1. IO Cost Model
31  *
32  * IO cost model estimates the cost of an IO given its basic parameters and
33  * history (e.g. the end sector of the last IO).  The cost is measured in
34  * device time.  If a given IO is estimated to cost 10ms, the device should
35  * be able to process ~100 of those IOs in a second.
36  *
37  * Currently, there's only one builtin cost model - linear.  Each IO is
38  * classified as sequential or random and given a base cost accordingly.
39  * On top of that, a size cost proportional to the length of the IO is
40  * added.  While simple, this model captures the operational
41  * characteristics of a wide variety of devices well enough.  Default
42  * parameters for several different classes of devices are provided and the
43  * parameters can be configured from userspace via
44  * /sys/fs/cgroup/io.cost.model.
45  *
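 * As a rough sketch of the linear model (the numbers below are purely
 * illustrative and not taken from any real device):
 *
 *   cost(bio) = (is_seq ? seq_base : rand_base) + nr_4k_pages * page_cost
 *
 * e.g. with rand_base = 100us, seq_base = 10us and page_cost = 10us, a
 * 64k random read costs 100 + 16 * 10 = 260us of device time while the
 * same read issued sequentially costs 10 + 16 * 10 = 170us.
 *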
46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47  * device-specific coefficients.
48  *
49  * 2. Control Strategy
50  *
51  * The device virtual time (vtime) is used as the primary control metric.
52  * The control strategy is composed of the following three parts.
53  *
54  * 2-1. Vtime Distribution
55  *
56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
57  * calculated.  Please consider the following hierarchy where the numbers
58  * inside parentheses denote the configured weights.
59  *
60  *           root
61  *         /       \
62  *      A (w:100)  B (w:300)
63  *      /       \
64  *  A0 (w:100)  A1 (w:100)
65  *
66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67  * of equal weight, each gets 50% share.  If B then starts issuing IOs, B
68  * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
69  * 12.5% each.  The distribution mechanism only cares about these flattened
70  * shares.  They're called hweights (hierarchical weights) and always add
71  * up to 1 (HWEIGHT_WHOLE).
72  *
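 * As an illustration using the hierarchy above, a leaf's hweight is the
 * product of its weight ratios along the path to the root:
 *
 *   hweight(A0) = 100/(100+100) * 100/(100+300) = 1/2 * 1/4 = 12.5%
 *   hweight(B)  = 300/(100+300)                 = 3/4       = 75%
 *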
73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75  * against the device vtime - an IO which takes 10ms on the underlying
76  * device is considered to take 80ms on A0.
77  *
78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
80  * the vtime consumed by past IOs and can issue a new IO iff doing so
81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
82  * suspended until the vtime has progressed enough to cover it.
83  *
84  * 2-2. Vrate Adjustment
85  *
86  * It's unrealistic to expect the cost model to be perfect.  There are too
87  * many devices and even on the same device the overall performance
88  * fluctuates depending on numerous factors such as IO mixture and device
89  * internal garbage collection.  The controller needs to adapt dynamically.
90  *
91  * This is achieved by adjusting the overall IO rate according to how busy
92  * the device is.  If the device becomes overloaded, we're sending down too
93  * many IOs and should generally slow down.  If there are waiting issuers
94  * but the device isn't saturated, we're issuing too few and should
95  * generally speed up.
96  *
97  * To slow down, we lower the vrate - the rate at which the device vtime
98  * passes compared to the wall clock.  For example, if the vtime is running
99  * at the vrate of 75%, all cgroups added up would only be able to issue
100  * 750ms worth of IOs per second, and vice-versa for speeding up.
101  *
102  * Device busyness is determined using two criteria - rq wait and
103  * completion latencies.
104  *
105  * When a device gets saturated, the on-device and then the request queues
106  * fill up and a bio which is ready to be issued has to wait for a request
107  * to become available.  When this delay becomes noticeable, it's a clear
108  * indication that the device is saturated and we lower the vrate.  This
109  * saturation signal is fairly conservative as it only triggers when both
110  * hardware and software queues are filled up, and is used as the default
111  * busy signal.
112  *
113  * As devices can have deep queues and be unfair in how the queued commands
114  * are executed, solely depending on rq wait may not result in satisfactory
115  * control quality.  For a better control quality, completion latency QoS
116  * parameters can be configured so that the device is considered saturated
117  * if N'th percentile completion latency rises above the set point.
118  *
119  * The completion latency requirements are a function of both the
120  * underlying device characteristics and the desired IO latency quality of
121  * service.  There is an inherent trade-off - the tighter the latency QoS,
122  * the higher the bandwidth loss.  Latency QoS is disabled by default
123  * and can be set through /sys/fs/cgroup/io.cost.qos.
124  *
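 * As an illustration (the device number and values here are made up), a
 * qos configuration line looks roughly like:
 *
 *   8:16 enable=1 ctrl=user rpct=95.00 rlat=5000 wpct=95.00 wlat=5000
 *        min=50.00 max=150.00
 *
 * which enables the controller on device 8:16 and treats it as saturated
 * once the read or write p95 completion latency exceeds 5ms.
 *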
125  * 2-3. Work Conservation
126  *
127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
128  * periodically while B is sending out enough parallel IOs to saturate the
129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
130  * cost per second, i.e., 10% of the device capacity.  The naive
131  * distribution of half and half would lead to 60% utilization of the
132  * device, a significant reduction in the total amount of work done
133  * compared to free-for-all competition.  This is too high a cost to pay
134  * for IO control.
135  *
136  * To conserve the total amount of work done, we keep track of how much
137  * each active cgroup is actually using and yield part of its weight if
138  * there are other cgroups which can make use of it.  In the above case,
139  * A's weight will be lowered so that it hovers above the actual usage and
140  * B would be able to use the rest.
141  *
142  * As we don't want to penalize a cgroup for donating its weight, the
143  * surplus weight adjustment factors in a margin and has an immediate
144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
145  *
146  * Note that adjusting down surplus weights has the same effects as
147  * accelerating vtime for other cgroups and work conservation can also be
148  * implemented by adjusting vrate dynamically.  However, working out who
149  * can donate and who should take back how much requires hweight
150  * propagation anyway, which makes it easier to implement and understand
151  * as a separate mechanism.
152  *
153  * 3. Monitoring
154  *
155  * Instead of debugfs or other clumsy monitoring mechanisms, this
156  * controller uses a drgn based monitoring script -
157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
158  * https://github.com/osandov/drgn.  The output looks like the following.
159  *
160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
161  *                 active      weight      hweight% inflt% dbt  delay usages%
162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
164  *
165  * - per	: Timer period
166  * - cur_per	: Internal wall and device vtime clock
167  * - vrate	: Device virtual time rate against wall clock
168  * - weight	: Surplus-adjusted and configured weights
169  * - hweight	: Surplus-adjusted and configured hierarchical weights
170  * - inflt	: The percentage of in-flight IO cost at the end of last period
171  * - dbt/delay	: Deferred issuer delay induction level and duration
172  * - usages	: Usage history
173  */
174 
175 #include <linux/kernel.h>
176 #include <linux/module.h>
177 #include <linux/timer.h>
178 #include <linux/time64.h>
179 #include <linux/parser.h>
180 #include <linux/sched/signal.h>
181 #include <linux/blk-cgroup.h>
182 #include "blk-rq-qos.h"
183 #include "blk-stat.h"
184 #include "blk-wbt.h"
185 
186 #ifdef CONFIG_TRACEPOINTS
187 
188 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
189 #define TRACE_IOCG_PATH_LEN 1024
190 static DEFINE_SPINLOCK(trace_iocg_path_lock);
191 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
192 
193 #define TRACE_IOCG_PATH(type, iocg, ...)					\
194 	do {									\
195 		unsigned long flags;						\
196 		if (trace_iocost_##type##_enabled()) {				\
197 			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
198 			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
199 				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
200 			trace_iocost_##type(iocg, trace_iocg_path,		\
201 					      ##__VA_ARGS__);			\
202 			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
203 		}								\
204 	} while (0)
205 
206 #else	/* CONFIG_TRACEPOINTS */
207 #define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
208 #endif	/* CONFIG_TRACEPOINTS */
209 
210 enum {
211 	MILLION			= 1000000,
212 
213 	/* timer period is calculated from latency requirements, bound it */
214 	MIN_PERIOD		= USEC_PER_MSEC,
215 	MAX_PERIOD		= USEC_PER_SEC,
216 
217 	/*
218 	 * A cgroup's vtime can run 50% behind the device vtime, which
219 	 * serves as its IO credit buffer.  Surplus weight adjustment is
220 	 * immediately canceled if the vtime margin runs below 10%.
221 	 */
222 	MARGIN_PCT		= 50,
223 	INUSE_MARGIN_PCT	= 10,
224 
225 	/* Have some play in waitq timer operations */
226 	WAITQ_TIMER_MARGIN_PCT	= 5,
227 
228 	/*
229 	 * vtime can wrap well within a reasonable uptime when vrate is
230 	 * consistently raised.  Don't trust recorded cgroup vtime if the
231 	 * period counter indicates that it's older than 5mins.
232 	 */
233 	VTIME_VALID_DUR		= 300 * USEC_PER_SEC,
234 
235 	/*
236 	 * Remember the past three non-zero usages and use the max for
237 	 * surplus calculation.  Three slots guarantee that we remember one
238 	 * full period usage from the last active stretch even after
239 	 * partial deactivation and re-activation periods.  Don't start
240 	 * giving away weight before collecting two data points to prevent
241 	 * hweight adjustments based on one partial activation period.
242 	 */
243 	NR_USAGE_SLOTS		= 3,
244 	MIN_VALID_USAGES	= 2,
245 
246 	/* 1/64k is granular enough and can easily be handled w/ u32 */
247 	HWEIGHT_WHOLE		= 1 << 16,
248 
249 	/*
250 	 * As vtime is used to calculate the cost of each IO, it needs to
251 	 * be fairly high precision.  For example, it should be able to
252 	 * represent the cost of a single page worth of discard with
253 	 * sufficient accuracy.  At the same time, it should be able to
254 	 * represent reasonably long enough durations to be useful and
255 	 * convenient during operation.
256 	 *
257 	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
258 	 * granularity and days of wrap-around time even at extreme vrates.
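	 *
	 * Roughly: VTIME_PER_SEC = 2^37 ~= 1.4e11, so VTIME_PER_USEC ~=
	 * 137438 and VTIME_PER_NSEC ~= 137 - i.e. better than 1/100 ns
	 * granularity - while a u64 vtime wraps only after 2^27 seconds
	 * (over four years) at the nominal vrate and still after roughly
	 * two weeks at the 100x maximum vrate.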
259 	 */
260 	VTIME_PER_SEC_SHIFT	= 37,
261 	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
262 	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
263 	VTIME_PER_NSEC		= VTIME_PER_SEC / NSEC_PER_SEC,
264 
265 	/* bound vrate adjustments within two orders of magnitude */
266 	VRATE_MIN_PPM		= 10000,	/* 1% */
267 	VRATE_MAX_PPM		= 100000000,	/* 10000% */
268 
269 	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
270 	VRATE_CLAMP_ADJ_PCT	= 4,
271 
272 	/* if IOs end up waiting for requests, issue less */
273 	RQ_WAIT_BUSY_PCT	= 5,
274 
275 	/* unbusy hysteresis */
276 	UNBUSY_THR_PCT		= 75,
277 
278 	/* don't let cmds which take a very long time pin lagging for too long */
279 	MAX_LAGGING_PERIODS	= 10,
280 
281 	/*
282 	 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
283 	 * donate the surplus.
284 	 */
285 	SURPLUS_SCALE_PCT	= 125,			/* * 125% */
286 	SURPLUS_SCALE_ABS	= HWEIGHT_WHOLE / 50,	/* + 2% */
287 	SURPLUS_MIN_ADJ_DELTA	= HWEIGHT_WHOLE / 33,	/* 3% */
288 
289 	/* switch iff the conditions are met for longer than this */
290 	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
291 
292 	/*
293 	 * Count IO size in 4k pages.  The 12-bit shift keeps the
294 	 * size-proportional components of the cost calculation within a
295 	 * similar number of digits as the per-IO cost components.
296 	 */
297 	IOC_PAGE_SHIFT		= 12,
298 	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
299 	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
300 
301 	/* if further apart than 16M, consider it randio for the linear model */
302 	LCOEF_RANDIO_PAGES	= 4096,
303 };
304 
305 enum ioc_running {
306 	IOC_IDLE,
307 	IOC_RUNNING,
308 	IOC_STOP,
309 };
310 
311 /* io.cost.qos controls including per-dev enable of the whole controller */
312 enum {
313 	QOS_ENABLE,
314 	QOS_CTRL,
315 	NR_QOS_CTRL_PARAMS,
316 };
317 
318 /* io.cost.qos params */
319 enum {
320 	QOS_RPPM,
321 	QOS_RLAT,
322 	QOS_WPPM,
323 	QOS_WLAT,
324 	QOS_MIN,
325 	QOS_MAX,
326 	NR_QOS_PARAMS,
327 };
328 
329 /* io.cost.model controls */
330 enum {
331 	COST_CTRL,
332 	COST_MODEL,
333 	NR_COST_CTRL_PARAMS,
334 };
335 
336 /* builtin linear cost model coefficients */
337 enum {
338 	I_LCOEF_RBPS,
339 	I_LCOEF_RSEQIOPS,
340 	I_LCOEF_RRANDIOPS,
341 	I_LCOEF_WBPS,
342 	I_LCOEF_WSEQIOPS,
343 	I_LCOEF_WRANDIOPS,
344 	NR_I_LCOEFS,
345 };
346 
347 enum {
348 	LCOEF_RPAGE,
349 	LCOEF_RSEQIO,
350 	LCOEF_RRANDIO,
351 	LCOEF_WPAGE,
352 	LCOEF_WSEQIO,
353 	LCOEF_WRANDIO,
354 	NR_LCOEFS,
355 };
356 
357 enum {
358 	AUTOP_INVALID,
359 	AUTOP_HDD,
360 	AUTOP_SSD_QD1,
361 	AUTOP_SSD_DFL,
362 	AUTOP_SSD_FAST,
363 };
364 
365 struct ioc_gq;
366 
367 struct ioc_params {
368 	u32				qos[NR_QOS_PARAMS];
369 	u64				i_lcoefs[NR_I_LCOEFS];
370 	u64				lcoefs[NR_LCOEFS];
371 	u32				too_fast_vrate_pct;
372 	u32				too_slow_vrate_pct;
373 };
374 
375 struct ioc_missed {
376 	u32				nr_met;
377 	u32				nr_missed;
378 	u32				last_met;
379 	u32				last_missed;
380 };
381 
382 struct ioc_pcpu_stat {
383 	struct ioc_missed		missed[2];
384 
385 	u64				rq_wait_ns;
386 	u64				last_rq_wait_ns;
387 };
388 
389 /* per device */
390 struct ioc {
391 	struct rq_qos			rqos;
392 
393 	bool				enabled;
394 
395 	struct ioc_params		params;
396 	u32				period_us;
397 	u32				margin_us;
398 	u64				vrate_min;
399 	u64				vrate_max;
400 
401 	spinlock_t			lock;
402 	struct timer_list		timer;
403 	struct list_head		active_iocgs;	/* active cgroups */
404 	struct ioc_pcpu_stat __percpu	*pcpu_stat;
405 
406 	enum ioc_running		running;
407 	atomic64_t			vtime_rate;
408 
409 	seqcount_t			period_seqcount;
410 	u32				period_at;	/* wallclock starttime */
411 	u64				period_at_vtime; /* vtime starttime */
412 
413 	atomic64_t			cur_period;	/* inc'd each period */
414 	int				busy_level;	/* saturation history */
415 
416 	u64				inuse_margin_vtime;
417 	bool				weights_updated;
418 	atomic_t			hweight_gen;	/* for lazy hweights */
419 
420 	u64				autop_too_fast_at;
421 	u64				autop_too_slow_at;
422 	int				autop_idx;
423 	bool				user_qos_params:1;
424 	bool				user_cost_model:1;
425 };
426 
427 /* per device-cgroup pair */
428 struct ioc_gq {
429 	struct blkg_policy_data		pd;
430 	struct ioc			*ioc;
431 
432 	/*
433 	 * An iocg can get its weight from two sources - an explicit
434 	 * per-device-cgroup configuration or the default weight of the
435 	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
436 	 * configuration.  `weight` is the effective weight considering both
437 	 * sources.
438 	 *
439 	 * When an idle cgroup becomes active its `active` goes from 0 to
440 	 * `weight`.  `inuse` is the surplus adjusted active weight.
441 	 * `active` and `inuse` are used to calculate `hweight_active` and
442 	 * `hweight_inuse`.
443 	 *
444 	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
445 	 * surplus adjustments.
446 	 */
447 	u32				cfg_weight;
448 	u32				weight;
449 	u32				active;
450 	u32				inuse;
451 	u32				last_inuse;
452 
453 	sector_t			cursor;		/* to detect randio */
454 
455 	/*
456 	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
457 	 * issued.  If lagging behind device vtime, the delta represents
458 	 * the currently available IO budget.  If running ahead, the
459 	 * overage.
460 	 *
461 	 * `vtime_done` is the same but progressed on completion rather
462 	 * than issue.  The delta behind `vtime` represents the cost of
463 	 * currently in-flight IOs.
464 	 *
465 	 * `last_vtime` is used to remember `vtime` at the end of the last
466 	 * period to calculate utilization.
467 	 */
468 	atomic64_t			vtime;
469 	atomic64_t			done_vtime;
470 	u64				abs_vdebt;
471 	u64				last_vtime;
472 
473 	/*
474 	 * The period this iocg was last active in.  Used for deactivation
475 	 * and invalidating `vtime`.
476 	 */
477 	atomic64_t			active_period;
478 	struct list_head		active_list;
479 
480 	/* see __propagate_active_weight() and current_hweight() for details */
481 	u64				child_active_sum;
482 	u64				child_inuse_sum;
483 	int				hweight_gen;
484 	u32				hweight_active;
485 	u32				hweight_inuse;
486 	bool				has_surplus;
487 
488 	struct wait_queue_head		waitq;
489 	struct hrtimer			waitq_timer;
490 	struct hrtimer			delay_timer;
491 
492 	/* usage is recorded as fractions of HWEIGHT_WHOLE */
493 	int				usage_idx;
494 	u32				usages[NR_USAGE_SLOTS];
495 
496 	/* this iocg's depth in the hierarchy and ancestors including self */
497 	int				level;
498 	struct ioc_gq			*ancestors[];
499 };
500 
501 /* per cgroup */
502 struct ioc_cgrp {
503 	struct blkcg_policy_data	cpd;
504 	unsigned int			dfl_weight;
505 };
506 
507 struct ioc_now {
508 	u64				now_ns;
509 	u32				now;
510 	u64				vnow;
511 	u64				vrate;
512 };
513 
514 struct iocg_wait {
515 	struct wait_queue_entry		wait;
516 	struct bio			*bio;
517 	u64				abs_cost;
518 	bool				committed;
519 };
520 
521 struct iocg_wake_ctx {
522 	struct ioc_gq			*iocg;
523 	u32				hw_inuse;
524 	s64				vbudget;
525 };
526 
527 static const struct ioc_params autop[] = {
528 	[AUTOP_HDD] = {
529 		.qos				= {
530 			[QOS_RLAT]		=        250000, /* 250ms */
531 			[QOS_WLAT]		=        250000,
532 			[QOS_MIN]		= VRATE_MIN_PPM,
533 			[QOS_MAX]		= VRATE_MAX_PPM,
534 		},
535 		.i_lcoefs			= {
536 			[I_LCOEF_RBPS]		=     174019176,
537 			[I_LCOEF_RSEQIOPS]	=         41708,
538 			[I_LCOEF_RRANDIOPS]	=           370,
539 			[I_LCOEF_WBPS]		=     178075866,
540 			[I_LCOEF_WSEQIOPS]	=         42705,
541 			[I_LCOEF_WRANDIOPS]	=           378,
542 		},
543 	},
544 	[AUTOP_SSD_QD1] = {
545 		.qos				= {
546 			[QOS_RLAT]		=         25000, /* 25ms */
547 			[QOS_WLAT]		=         25000,
548 			[QOS_MIN]		= VRATE_MIN_PPM,
549 			[QOS_MAX]		= VRATE_MAX_PPM,
550 		},
551 		.i_lcoefs			= {
552 			[I_LCOEF_RBPS]		=     245855193,
553 			[I_LCOEF_RSEQIOPS]	=         61575,
554 			[I_LCOEF_RRANDIOPS]	=          6946,
555 			[I_LCOEF_WBPS]		=     141365009,
556 			[I_LCOEF_WSEQIOPS]	=         33716,
557 			[I_LCOEF_WRANDIOPS]	=         26796,
558 		},
559 	},
560 	[AUTOP_SSD_DFL] = {
561 		.qos				= {
562 			[QOS_RLAT]		=         25000, /* 25ms */
563 			[QOS_WLAT]		=         25000,
564 			[QOS_MIN]		= VRATE_MIN_PPM,
565 			[QOS_MAX]		= VRATE_MAX_PPM,
566 		},
567 		.i_lcoefs			= {
568 			[I_LCOEF_RBPS]		=     488636629,
569 			[I_LCOEF_RSEQIOPS]	=          8932,
570 			[I_LCOEF_RRANDIOPS]	=          8518,
571 			[I_LCOEF_WBPS]		=     427891549,
572 			[I_LCOEF_WSEQIOPS]	=         28755,
573 			[I_LCOEF_WRANDIOPS]	=         21940,
574 		},
575 		.too_fast_vrate_pct		=           500,
576 	},
577 	[AUTOP_SSD_FAST] = {
578 		.qos				= {
579 			[QOS_RLAT]		=          5000, /* 5ms */
580 			[QOS_WLAT]		=          5000,
581 			[QOS_MIN]		= VRATE_MIN_PPM,
582 			[QOS_MAX]		= VRATE_MAX_PPM,
583 		},
584 		.i_lcoefs			= {
585 			[I_LCOEF_RBPS]		=    3102524156LLU,
586 			[I_LCOEF_RSEQIOPS]	=        724816,
587 			[I_LCOEF_RRANDIOPS]	=        778122,
588 			[I_LCOEF_WBPS]		=    1742780862LLU,
589 			[I_LCOEF_WSEQIOPS]	=        425702,
590 			[I_LCOEF_WRANDIOPS]	=	 443193,
591 		},
592 		.too_slow_vrate_pct		=            10,
593 	},
594 };
595 
596 /*
597  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
598  * vtime credit shortage and down on device saturation.
599  */
600 static u32 vrate_adj_pct[] =
601 	{ 0, 0, 0, 0,
602 	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
603 	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
604 	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
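
/*
 * For example (see the busy_level handling in ioc_timer_fn()), a
 * busy_level of +8 indexes vrate_adj_pct[8] == 1 and scales vrate down
 * to 99% each period, while a busy_level of -40 indexes 4 and scales it
 * up to 104%, subject to the vrate_min/vrate_max clamping done there.
 */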
605 
606 static struct blkcg_policy blkcg_policy_iocost;
607 
608 /* accessors and helpers */
609 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
610 {
611 	return container_of(rqos, struct ioc, rqos);
612 }
613 
614 static struct ioc *q_to_ioc(struct request_queue *q)
615 {
616 	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
617 }
618 
619 static const char *q_name(struct request_queue *q)
620 {
621 	if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
622 		return kobject_name(q->kobj.parent);
623 	else
624 		return "<unknown>";
625 }
626 
627 static const char __maybe_unused *ioc_name(struct ioc *ioc)
628 {
629 	return q_name(ioc->rqos.q);
630 }
631 
632 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
633 {
634 	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
635 }
636 
637 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
638 {
639 	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
640 }
641 
642 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
643 {
644 	return pd_to_blkg(&iocg->pd);
645 }
646 
647 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
648 {
649 	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
650 			    struct ioc_cgrp, cpd);
651 }
652 
653 /*
654  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
655  * weight, the more expensive each IO.  Must round up.
656  */
657 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
658 {
659 	return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
660 }
661 
662 /*
663  * The inverse of abs_cost_to_cost().  Must round up.
664  */
665 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
666 {
667 	return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
668 }
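
/*
 * A small worked example: with hw_inuse at half of HWEIGHT_WHOLE (a 50%
 * hierarchical share), abs_cost_to_cost() doubles the cost - an IO worth
 * 10ms of absolute device time is charged as 20ms against the cgroup's
 * vtime - and cost_to_abs_cost() maps it back the other way.
 */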
669 
670 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
671 {
672 	bio->bi_iocost_cost = cost;
673 	atomic64_add(cost, &iocg->vtime);
674 }
675 
676 #define CREATE_TRACE_POINTS
677 #include <trace/events/iocost.h>
678 
679 /* latency QoS params changed, update period_us and all the dependent params */
680 static void ioc_refresh_period_us(struct ioc *ioc)
681 {
682 	u32 ppm, lat, multi, period_us;
683 
684 	lockdep_assert_held(&ioc->lock);
685 
686 	/* pick the higher latency target */
687 	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
688 		ppm = ioc->params.qos[QOS_RPPM];
689 		lat = ioc->params.qos[QOS_RLAT];
690 	} else {
691 		ppm = ioc->params.qos[QOS_WPPM];
692 		lat = ioc->params.qos[QOS_WLAT];
693 	}
694 
695 	/*
696 	 * We want the period to be long enough to contain a healthy number
697 	 * of IOs while short enough for granular control.  Define it as a
698 	 * multiple of the latency target.  Ideally, the multiplier should
699 	 * be scaled according to the percentile so that it would nominally
700 	 * contain a certain number of requests.  Let's be simpler and
701 	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
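	 *
	 * E.g. with a 25ms latency target at pct(90) (ppm == 900000),
	 * multi == max((1000000 - 900000) / 50000, 2) == 2 and the period
	 * becomes 50ms; at pct(50) multi == 10 and the period becomes
	 * 250ms, still within the MIN_PERIOD/MAX_PERIOD clamp.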
702 	 */
703 	if (ppm)
704 		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
705 	else
706 		multi = 2;
707 	period_us = multi * lat;
708 	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
709 
710 	/* calculate dependent params */
711 	ioc->period_us = period_us;
712 	ioc->margin_us = period_us * MARGIN_PCT / 100;
713 	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
714 			period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
715 }
716 
717 static int ioc_autop_idx(struct ioc *ioc)
718 {
719 	int idx = ioc->autop_idx;
720 	const struct ioc_params *p = &autop[idx];
721 	u32 vrate_pct;
722 	u64 now_ns;
723 
724 	/* rotational? */
725 	if (!blk_queue_nonrot(ioc->rqos.q))
726 		return AUTOP_HDD;
727 
728 	/* handle SATA SSDs w/ broken NCQ */
729 	if (blk_queue_depth(ioc->rqos.q) == 1)
730 		return AUTOP_SSD_QD1;
731 
732 	/* use one of the normal ssd sets */
733 	if (idx < AUTOP_SSD_DFL)
734 		return AUTOP_SSD_DFL;
735 
736 	/* if user is overriding anything, maintain what was there */
737 	if (ioc->user_qos_params || ioc->user_cost_model)
738 		return idx;
739 
740 	/* step up/down based on the vrate */
741 	vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
742 			      VTIME_PER_USEC);
743 	now_ns = ktime_get_ns();
744 
745 	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
746 		if (!ioc->autop_too_fast_at)
747 			ioc->autop_too_fast_at = now_ns;
748 		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
749 			return idx + 1;
750 	} else {
751 		ioc->autop_too_fast_at = 0;
752 	}
753 
754 	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
755 		if (!ioc->autop_too_slow_at)
756 			ioc->autop_too_slow_at = now_ns;
757 		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
758 			return idx - 1;
759 	} else {
760 		ioc->autop_too_slow_at = 0;
761 	}
762 
763 	return idx;
764 }
765 
766 /*
767  * Take the following as input
768  *
769  *  @bps	maximum sequential throughput
770  *  @seqiops	maximum sequential 4k iops
771  *  @randiops	maximum random 4k iops
772  *
773  * and calculate the linear model cost coefficients.
774  *
775  *  *@page	per-page cost		1s / (@bps / 4096)
776  *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
777  *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
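 *
 * A rough illustration with made-up device numbers: @bps = 400MB/s gives
 * *@page ~= 10.2us worth of vtime per 4k page, while @seqiops = 50000 and
 * @randiops = 10000 correspond to 20us and 100us per IO, leaving *@seqio
 * ~= 9.8us and *@randio ~= 89.8us after subtracting the page component.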
778  */
779 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
780 			u64 *page, u64 *seqio, u64 *randio)
781 {
782 	u64 v;
783 
784 	*page = *seqio = *randio = 0;
785 
786 	if (bps)
787 		*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
788 					   DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
789 
790 	if (seqiops) {
791 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
792 		if (v > *page)
793 			*seqio = v - *page;
794 	}
795 
796 	if (randiops) {
797 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
798 		if (v > *page)
799 			*randio = v - *page;
800 	}
801 }
802 
803 static void ioc_refresh_lcoefs(struct ioc *ioc)
804 {
805 	u64 *u = ioc->params.i_lcoefs;
806 	u64 *c = ioc->params.lcoefs;
807 
808 	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
809 		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
810 	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
811 		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
812 }
813 
814 static bool ioc_refresh_params(struct ioc *ioc, bool force)
815 {
816 	const struct ioc_params *p;
817 	int idx;
818 
819 	lockdep_assert_held(&ioc->lock);
820 
821 	idx = ioc_autop_idx(ioc);
822 	p = &autop[idx];
823 
824 	if (idx == ioc->autop_idx && !force)
825 		return false;
826 
827 	if (idx != ioc->autop_idx)
828 		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
829 
830 	ioc->autop_idx = idx;
831 	ioc->autop_too_fast_at = 0;
832 	ioc->autop_too_slow_at = 0;
833 
834 	if (!ioc->user_qos_params)
835 		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
836 	if (!ioc->user_cost_model)
837 		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
838 
839 	ioc_refresh_period_us(ioc);
840 	ioc_refresh_lcoefs(ioc);
841 
842 	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
843 					    VTIME_PER_USEC, MILLION);
844 	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
845 				   VTIME_PER_USEC, MILLION);
846 
847 	return true;
848 }
849 
850 /* take a snapshot of the current [v]time and vrate */
851 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
852 {
853 	unsigned seq;
854 
855 	now->now_ns = ktime_get();
856 	now->now = ktime_to_us(now->now_ns);
857 	now->vrate = atomic64_read(&ioc->vtime_rate);
858 
859 	/*
860 	 * The current vtime is
861 	 *
862 	 *   vtime at period start + (wallclock time since the start) * vrate
863 	 *
864 	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
865 	 * needed, they're seqcount protected.
866 	 */
867 	do {
868 		seq = read_seqcount_begin(&ioc->period_seqcount);
869 		now->vnow = ioc->period_at_vtime +
870 			(now->now - ioc->period_at) * now->vrate;
871 	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
872 }
873 
874 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
875 {
876 	lockdep_assert_held(&ioc->lock);
877 	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
878 
879 	write_seqcount_begin(&ioc->period_seqcount);
880 	ioc->period_at = now->now;
881 	ioc->period_at_vtime = now->vnow;
882 	write_seqcount_end(&ioc->period_seqcount);
883 
884 	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
885 	add_timer(&ioc->timer);
886 }
887 
888 /*
889  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
890  * weight sums and propagate upwards accordingly.
891  */
892 static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
893 {
894 	struct ioc *ioc = iocg->ioc;
895 	int lvl;
896 
897 	lockdep_assert_held(&ioc->lock);
898 
899 	inuse = min(active, inuse);
900 
901 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
902 		struct ioc_gq *parent = iocg->ancestors[lvl];
903 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
904 		u32 parent_active = 0, parent_inuse = 0;
905 
906 		/* update the level sums */
907 		parent->child_active_sum += (s32)(active - child->active);
908 		parent->child_inuse_sum += (s32)(inuse - child->inuse);
909 		/* apply the updates */
910 		child->active = active;
911 		child->inuse = inuse;
912 
913 		/*
914 		 * The delta between the inuse and active sums indicates how much
915 		 * weight is being given away.  Parent's inuse and active should
916 		 * reflect the ratio.
917 		 */
918 		if (parent->child_active_sum) {
919 			parent_active = parent->weight;
920 			parent_inuse = DIV64_U64_ROUND_UP(
921 				parent_active * parent->child_inuse_sum,
922 				parent->child_active_sum);
923 		}
924 
925 		/* do we need to keep walking up? */
926 		if (parent_active == parent->active &&
927 		    parent_inuse == parent->inuse)
928 			break;
929 
930 		active = parent_active;
931 		inuse = parent_inuse;
932 	}
933 
934 	ioc->weights_updated = true;
935 }
936 
937 static void commit_active_weights(struct ioc *ioc)
938 {
939 	lockdep_assert_held(&ioc->lock);
940 
941 	if (ioc->weights_updated) {
942 		/* paired with rmb in current_hweight(), see there */
943 		smp_wmb();
944 		atomic_inc(&ioc->hweight_gen);
945 		ioc->weights_updated = false;
946 	}
947 }
948 
949 static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
950 {
951 	__propagate_active_weight(iocg, active, inuse);
952 	commit_active_weights(iocg->ioc);
953 }
954 
955 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
956 {
957 	struct ioc *ioc = iocg->ioc;
958 	int lvl;
959 	u32 hwa, hwi;
960 	int ioc_gen;
961 
962 	/* hot path - if uptodate, use cached */
963 	ioc_gen = atomic_read(&ioc->hweight_gen);
964 	if (ioc_gen == iocg->hweight_gen)
965 		goto out;
966 
967 	/*
968 	 * Paired with wmb in commit_active_weights().  If we saw the
969 	 * updated hweight_gen, all the weight updates from
970 	 * __propagate_active_weight() are visible too.
971 	 *
972 	 * We can race with weight updates during calculation and get it
973 	 * wrong.  However, hweight_gen would have changed and a future
974 	 * reader will recalculate and we're guaranteed to discard the
975 	 * wrong result soon.
976 	 */
977 	smp_rmb();
978 
979 	hwa = hwi = HWEIGHT_WHOLE;
980 	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
981 		struct ioc_gq *parent = iocg->ancestors[lvl];
982 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
983 		u32 active_sum = READ_ONCE(parent->child_active_sum);
984 		u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
985 		u32 active = READ_ONCE(child->active);
986 		u32 inuse = READ_ONCE(child->inuse);
987 
988 		/* we can race with deactivations and either may read as zero */
989 		if (!active_sum || !inuse_sum)
990 			continue;
991 
992 		active_sum = max(active, active_sum);
993 		hwa = hwa * active / active_sum;	/* max 16bits * 10000 */
994 
995 		inuse_sum = max(inuse, inuse_sum);
996 		hwi = hwi * inuse / inuse_sum;		/* max 16bits * 10000 */
997 	}
998 
999 	iocg->hweight_active = max_t(u32, hwa, 1);
1000 	iocg->hweight_inuse = max_t(u32, hwi, 1);
1001 	iocg->hweight_gen = ioc_gen;
1002 out:
1003 	if (hw_activep)
1004 		*hw_activep = iocg->hweight_active;
1005 	if (hw_inusep)
1006 		*hw_inusep = iocg->hweight_inuse;
1007 }
1008 
1009 static void weight_updated(struct ioc_gq *iocg)
1010 {
1011 	struct ioc *ioc = iocg->ioc;
1012 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1013 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1014 	u32 weight;
1015 
1016 	lockdep_assert_held(&ioc->lock);
1017 
1018 	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1019 	if (weight != iocg->weight && iocg->active)
1020 		propagate_active_weight(iocg, weight,
1021 			DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1022 	iocg->weight = weight;
1023 }
1024 
1025 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1026 {
1027 	struct ioc *ioc = iocg->ioc;
1028 	u64 last_period, cur_period, max_period_delta;
1029 	u64 vtime, vmargin, vmin;
1030 	int i;
1031 
1032 	/*
1033 	 * If we seem to be already active, just update the stamp to tell the
1034 	 * timer that we're still active.  We don't mind occasional races.
1035 	 */
1036 	if (!list_empty(&iocg->active_list)) {
1037 		ioc_now(ioc, now);
1038 		cur_period = atomic64_read(&ioc->cur_period);
1039 		if (atomic64_read(&iocg->active_period) != cur_period)
1040 			atomic64_set(&iocg->active_period, cur_period);
1041 		return true;
1042 	}
1043 
1044 	/* racy check on internal node IOs, treat as root level IOs */
1045 	if (iocg->child_active_sum)
1046 		return false;
1047 
1048 	spin_lock_irq(&ioc->lock);
1049 
1050 	ioc_now(ioc, now);
1051 
1052 	/* update period */
1053 	cur_period = atomic64_read(&ioc->cur_period);
1054 	last_period = atomic64_read(&iocg->active_period);
1055 	atomic64_set(&iocg->active_period, cur_period);
1056 
1057 	/* already activated or breaking leaf-only constraint? */
1058 	if (!list_empty(&iocg->active_list))
1059 		goto succeed_unlock;
1060 	for (i = iocg->level - 1; i > 0; i--)
1061 		if (!list_empty(&iocg->ancestors[i]->active_list))
1062 			goto fail_unlock;
1063 
1064 	if (iocg->child_active_sum)
1065 		goto fail_unlock;
1066 
1067 	/*
1068 	 * vtime may wrap when vrate is raised substantially due to
1069 	 * underestimated IO costs.  Look at the period and ignore its
1070 	 * vtime if the iocg has been idle for too long.  Also, cap the
1071 	 * budget it can start with to the margin.
1072 	 */
1073 	max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1074 	vtime = atomic64_read(&iocg->vtime);
1075 	vmargin = ioc->margin_us * now->vrate;
1076 	vmin = now->vnow - vmargin;
1077 
1078 	if (last_period + max_period_delta < cur_period ||
1079 	    time_before64(vtime, vmin)) {
1080 		atomic64_add(vmin - vtime, &iocg->vtime);
1081 		atomic64_add(vmin - vtime, &iocg->done_vtime);
1082 		vtime = vmin;
1083 	}
1084 
1085 	/*
1086 	 * Activate, propagate weight and start period timer if not
1087 	 * running.  Reset hweight_gen to avoid accidental match from
1088 	 * wrapping.
1089 	 */
1090 	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1091 	list_add(&iocg->active_list, &ioc->active_iocgs);
1092 	propagate_active_weight(iocg, iocg->weight,
1093 				iocg->last_inuse ?: iocg->weight);
1094 
1095 	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1096 			last_period, cur_period, vtime);
1097 
1098 	iocg->last_vtime = vtime;
1099 
1100 	if (ioc->running == IOC_IDLE) {
1101 		ioc->running = IOC_RUNNING;
1102 		ioc_start_period(ioc, now);
1103 	}
1104 
1105 succeed_unlock:
1106 	spin_unlock_irq(&ioc->lock);
1107 	return true;
1108 
1109 fail_unlock:
1110 	spin_unlock_irq(&ioc->lock);
1111 	return false;
1112 }
1113 
1114 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1115 			int flags, void *key)
1116 {
1117 	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1118 	struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1119 	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1120 
1121 	ctx->vbudget -= cost;
1122 
1123 	if (ctx->vbudget < 0)
1124 		return -1;
1125 
1126 	iocg_commit_bio(ctx->iocg, wait->bio, cost);
1127 
1128 	/*
1129 	 * autoremove_wake_function() removes the wait entry only when it
1130 	 * actually changed the task state.  We want the wait always
1131 	 * removed.  Remove explicitly and use default_wake_function().
1132 	 */
1133 	list_del_init(&wq_entry->entry);
1134 	wait->committed = true;
1135 
1136 	default_wake_function(wq_entry, mode, flags, key);
1137 	return 0;
1138 }
1139 
1140 static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1141 {
1142 	struct ioc *ioc = iocg->ioc;
1143 	struct iocg_wake_ctx ctx = { .iocg = iocg };
1144 	u64 margin_ns = (u64)(ioc->period_us *
1145 			      WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1146 	u64 vdebt, vshortage, expires, oexpires;
1147 	s64 vbudget;
1148 	u32 hw_inuse;
1149 
1150 	lockdep_assert_held(&iocg->waitq.lock);
1151 
1152 	current_hweight(iocg, NULL, &hw_inuse);
1153 	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1154 
1155 	/* pay off debt */
1156 	vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1157 	if (vdebt && vbudget > 0) {
1158 		u64 delta = min_t(u64, vbudget, vdebt);
1159 		u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1160 				    iocg->abs_vdebt);
1161 
1162 		atomic64_add(delta, &iocg->vtime);
1163 		atomic64_add(delta, &iocg->done_vtime);
1164 		iocg->abs_vdebt -= abs_delta;
1165 	}
1166 
1167 	/*
1168 	 * Wake up the ones which are due and see how much vtime we'll need
1169 	 * for the next one.
1170 	 */
1171 	ctx.hw_inuse = hw_inuse;
1172 	ctx.vbudget = vbudget - vdebt;
1173 	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1174 	if (!waitqueue_active(&iocg->waitq))
1175 		return;
1176 	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1177 		return;
1178 
1179 	/* determine next wakeup, add a quarter margin to guarantee chunking */
1180 	vshortage = -ctx.vbudget;
1181 	expires = now->now_ns +
1182 		DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1183 	expires += margin_ns / 4;
1184 
1185 	/* if already active and close enough, don't bother */
1186 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1187 	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1188 	    abs(oexpires - expires) <= margin_ns / 4)
1189 		return;
1190 
1191 	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1192 			       margin_ns / 4, HRTIMER_MODE_ABS);
1193 }
1194 
1195 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1196 {
1197 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1198 	struct ioc_now now;
1199 	unsigned long flags;
1200 
1201 	ioc_now(iocg->ioc, &now);
1202 
1203 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1204 	iocg_kick_waitq(iocg, &now);
1205 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1206 
1207 	return HRTIMER_NORESTART;
1208 }
1209 
1210 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1211 {
1212 	struct ioc *ioc = iocg->ioc;
1213 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1214 	u64 vtime = atomic64_read(&iocg->vtime);
1215 	u64 vmargin = ioc->margin_us * now->vrate;
1216 	u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1217 	u64 delta_ns, expires, oexpires;
1218 	u32 hw_inuse;
1219 
1220 	lockdep_assert_held(&iocg->waitq.lock);
1221 
1222 	/* debt-adjust vtime */
1223 	current_hweight(iocg, NULL, &hw_inuse);
1224 	vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1225 
1226 	/*
1227 	 * Clear or maintain depending on the overage. Non-zero vdebt is what
1228 	 * guarantees that @iocg is online and future iocg_kick_delay() will
1229 	 * clear use_delay. Don't leave it on when there's no vdebt.
1230 	 */
1231 	if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1232 		blkcg_clear_delay(blkg);
1233 		return false;
1234 	}
1235 	if (!atomic_read(&blkg->use_delay) &&
1236 	    time_before_eq64(vtime, now->vnow + vmargin))
1237 		return false;
1238 
1239 	/* use delay */
1240 	delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
1241 				      now->vrate) * NSEC_PER_USEC;
1242 	blkcg_set_delay(blkg, delta_ns);
1243 	expires = now->now_ns + delta_ns;
1244 
1245 	/* if already active and close enough, don't bother */
1246 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1247 	if (hrtimer_is_queued(&iocg->delay_timer) &&
1248 	    abs(oexpires - expires) <= margin_ns / 4)
1249 		return true;
1250 
1251 	hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1252 			       margin_ns / 4, HRTIMER_MODE_ABS);
1253 	return true;
1254 }
1255 
1256 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1257 {
1258 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1259 	struct ioc_now now;
1260 	unsigned long flags;
1261 
1262 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1263 	ioc_now(iocg->ioc, &now);
1264 	iocg_kick_delay(iocg, &now);
1265 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1266 
1267 	return HRTIMER_NORESTART;
1268 }
1269 
1270 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1271 {
1272 	u32 nr_met[2] = { };
1273 	u32 nr_missed[2] = { };
1274 	u64 rq_wait_ns = 0;
1275 	int cpu, rw;
1276 
1277 	for_each_online_cpu(cpu) {
1278 		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1279 		u64 this_rq_wait_ns;
1280 
1281 		for (rw = READ; rw <= WRITE; rw++) {
1282 			u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1283 			u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1284 
1285 			nr_met[rw] += this_met - stat->missed[rw].last_met;
1286 			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1287 			stat->missed[rw].last_met = this_met;
1288 			stat->missed[rw].last_missed = this_missed;
1289 		}
1290 
1291 		this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1292 		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1293 		stat->last_rq_wait_ns = this_rq_wait_ns;
1294 	}
1295 
1296 	for (rw = READ; rw <= WRITE; rw++) {
1297 		if (nr_met[rw] + nr_missed[rw])
1298 			missed_ppm_ar[rw] =
1299 				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1300 						   nr_met[rw] + nr_missed[rw]);
1301 		else
1302 			missed_ppm_ar[rw] = 0;
1303 	}
1304 
1305 	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1306 				   ioc->period_us * NSEC_PER_USEC);
1307 }
1308 
1309 /* was iocg idle this period? */
1310 static bool iocg_is_idle(struct ioc_gq *iocg)
1311 {
1312 	struct ioc *ioc = iocg->ioc;
1313 
1314 	/* did something get issued this period? */
1315 	if (atomic64_read(&iocg->active_period) ==
1316 	    atomic64_read(&ioc->cur_period))
1317 		return false;
1318 
1319 	/* is something in flight? */
1320 	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1321 		return false;
1322 
1323 	return true;
1324 }
1325 
1326 /* returns usage with margin added if surplus is large enough */
1327 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1328 {
1329 	/* add margin */
1330 	usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1331 	usage += SURPLUS_SCALE_ABS;
1332 
1333 	/* don't bother if the surplus is too small */
1334 	if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1335 		return 0;
1336 
1337 	return usage;
1338 }
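
/*
 * For instance, an iocg whose max recent usage was 20% while its
 * hweight_inuse is 40% gets 20% * 1.25 + 2% = 27% back from this
 * function (the 13% gap clears SURPLUS_MIN_ADJ_DELTA) and becomes a
 * donation candidate down to that level; at 30% usage the adjusted
 * 39.5% is within 3% of 40% and 0 is returned instead.
 */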
1339 
1340 static void ioc_timer_fn(struct timer_list *timer)
1341 {
1342 	struct ioc *ioc = container_of(timer, struct ioc, timer);
1343 	struct ioc_gq *iocg, *tiocg;
1344 	struct ioc_now now;
1345 	int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1346 	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1347 	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1348 	u32 missed_ppm[2], rq_wait_pct;
1349 	u64 period_vtime;
1350 	int prev_busy_level, i;
1351 
1352 	/* how were the latencies during the period? */
1353 	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1354 
1355 	/* take care of active iocgs */
1356 	spin_lock_irq(&ioc->lock);
1357 
1358 	ioc_now(ioc, &now);
1359 
1360 	period_vtime = now.vnow - ioc->period_at_vtime;
1361 	if (WARN_ON_ONCE(!period_vtime)) {
1362 		spin_unlock_irq(&ioc->lock);
1363 		return;
1364 	}
1365 
1366 	/*
1367 	 * Waiters determine the sleep durations based on the vrate they
1368 	 * saw at the time of sleep.  If vrate has increased, some waiters
1369 	 * could be sleeping for too long.  Wake up tardy waiters which
1370 	 * should have woken up in the last period and expire idle iocgs.
1371 	 */
1372 	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1373 		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1374 		    !iocg_is_idle(iocg))
1375 			continue;
1376 
1377 		spin_lock(&iocg->waitq.lock);
1378 
1379 		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1380 			/* might be oversleeping vtime / hweight changes, kick */
1381 			iocg_kick_waitq(iocg, &now);
1382 			iocg_kick_delay(iocg, &now);
1383 		} else if (iocg_is_idle(iocg)) {
1384 			/* no waiter and idle, deactivate */
1385 			iocg->last_inuse = iocg->inuse;
1386 			__propagate_active_weight(iocg, 0, 0);
1387 			list_del_init(&iocg->active_list);
1388 		}
1389 
1390 		spin_unlock(&iocg->waitq.lock);
1391 	}
1392 	commit_active_weights(ioc);
1393 
1394 	/* calc usages and see whether some weights need to be moved around */
1395 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1396 		u64 vdone, vtime, vusage, vmargin, vmin;
1397 		u32 hw_active, hw_inuse, usage;
1398 
1399 		/*
1400 		 * Collect unused and wind vtime closer to vnow to prevent
1401 		 * iocgs from accumulating a large amount of budget.
1402 		 */
1403 		vdone = atomic64_read(&iocg->done_vtime);
1404 		vtime = atomic64_read(&iocg->vtime);
1405 		current_hweight(iocg, &hw_active, &hw_inuse);
1406 
1407 		/*
1408 		 * Latency QoS detection doesn't account for IOs which are
1409 		 * in-flight for longer than a period.  Detect them by
1410 		 * comparing vdone against period start.  If lagging behind
1411 		 * IOs from past periods, don't increase vrate.
1412 		 */
1413 		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1414 		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1415 		    time_after64(vtime, vdone) &&
1416 		    time_after64(vtime, now.vnow -
1417 				 MAX_LAGGING_PERIODS * period_vtime) &&
1418 		    time_before64(vdone, now.vnow - period_vtime))
1419 			nr_lagging++;
1420 
1421 		if (waitqueue_active(&iocg->waitq))
1422 			vusage = now.vnow - iocg->last_vtime;
1423 		else if (time_before64(iocg->last_vtime, vtime))
1424 			vusage = vtime - iocg->last_vtime;
1425 		else
1426 			vusage = 0;
1427 
1428 		iocg->last_vtime += vusage;
1429 		/*
1430 		 * Factor in in-flight vtime into vusage to avoid
1431 		 * high-latency completions appearing as idle.  This should
1432 		 * be done after the above ->last_vtime adjustment.
1433 		 */
1434 		vusage = max(vusage, vtime - vdone);
1435 
1436 		/* calculate hweight based usage ratio and record */
1437 		if (vusage) {
1438 			usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1439 						   period_vtime);
1440 			iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1441 			iocg->usages[iocg->usage_idx] = usage;
1442 		} else {
1443 			usage = 0;
1444 		}
1445 
1446 		/* see whether there's surplus vtime */
1447 		vmargin = ioc->margin_us * now.vrate;
1448 		vmin = now.vnow - vmargin;
1449 
1450 		iocg->has_surplus = false;
1451 
1452 		if (!waitqueue_active(&iocg->waitq) &&
1453 		    time_before64(vtime, vmin)) {
1454 			u64 delta = vmin - vtime;
1455 
1456 			/* throw away surplus vtime */
1457 			atomic64_add(delta, &iocg->vtime);
1458 			atomic64_add(delta, &iocg->done_vtime);
1459 			iocg->last_vtime += delta;
1460 			/* if usage is sufficiently low, maybe it can donate */
1461 			if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1462 				iocg->has_surplus = true;
1463 				nr_surpluses++;
1464 			}
1465 		} else if (hw_inuse < hw_active) {
1466 			u32 new_hwi, new_inuse;
1467 
1468 			/* was donating but might need to take back some */
1469 			if (waitqueue_active(&iocg->waitq)) {
1470 				new_hwi = hw_active;
1471 			} else {
1472 				new_hwi = max(hw_inuse,
1473 					      usage * SURPLUS_SCALE_PCT / 100 +
1474 					      SURPLUS_SCALE_ABS);
1475 			}
1476 
1477 			new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1478 					      hw_inuse);
1479 			new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1480 
1481 			if (new_inuse > iocg->inuse) {
1482 				TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1483 						iocg->inuse, new_inuse,
1484 						hw_inuse, new_hwi);
1485 				__propagate_active_weight(iocg, iocg->weight,
1486 							  new_inuse);
1487 			}
1488 		} else {
1489 			/* genuinely out of vtime */
1490 			nr_shortages++;
1491 		}
1492 	}
1493 
1494 	if (!nr_shortages || !nr_surpluses)
1495 		goto skip_surplus_transfers;
1496 
1497 	/* there are both shortages and surpluses, transfer surpluses */
1498 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1499 		u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1500 		int nr_valid = 0;
1501 
1502 		if (!iocg->has_surplus)
1503 			continue;
1504 
1505 		/* base the decision on max historical usage */
1506 		for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1507 			if (iocg->usages[i]) {
1508 				usage = max(usage, iocg->usages[i]);
1509 				nr_valid++;
1510 			}
1511 		}
1512 		if (nr_valid < MIN_VALID_USAGES)
1513 			continue;
1514 
1515 		current_hweight(iocg, &hw_active, &hw_inuse);
1516 		new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1517 		if (!new_hwi)
1518 			continue;
1519 
1520 		new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1521 					       hw_inuse);
1522 		if (new_inuse < iocg->inuse) {
1523 			TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1524 					iocg->inuse, new_inuse,
1525 					hw_inuse, new_hwi);
1526 			__propagate_active_weight(iocg, iocg->weight, new_inuse);
1527 		}
1528 	}
1529 skip_surplus_transfers:
1530 	commit_active_weights(ioc);
1531 
1532 	/*
1533 	 * If q is getting clogged or we're missing too much, we're issuing
1534 	 * too much IO and should lower vtime rate.  If we're not missing
1535 	 * and experiencing shortages but not surpluses, we're too stingy
1536 	 * and should increase vtime rate.
1537 	 */
1538 	prev_busy_level = ioc->busy_level;
1539 	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1540 	    missed_ppm[READ] > ppm_rthr ||
1541 	    missed_ppm[WRITE] > ppm_wthr) {
1542 		/* clearly missing QoS targets, slow down vrate */
1543 		ioc->busy_level = max(ioc->busy_level, 0);
1544 		ioc->busy_level++;
1545 	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1546 		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1547 		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1548 		/* QoS targets are being met with >25% margin */
1549 		if (nr_shortages) {
1550 			/*
1551 			 * We're throttling while the device has spare
1552 			 * capacity.  If vrate was being slowed down, stop.
1553 			 */
1554 			ioc->busy_level = min(ioc->busy_level, 0);
1555 
1556 			/*
1557 			 * If there are IOs spanning multiple periods, wait
1558 			 * them out before pushing the device harder.  If
1559 			 * there are surpluses, let redistribution work it
1560 			 * out first.
1561 			 */
1562 			if (!nr_lagging && !nr_surpluses)
1563 				ioc->busy_level--;
1564 		} else {
1565 			/*
1566 			 * Nobody is being throttled and the users aren't
1567 			 * issuing enough IOs to saturate the device.  We
1568 			 * simply don't know how close the device is to
1569 			 * saturation.  Coast.
1570 			 */
1571 			ioc->busy_level = 0;
1572 		}
1573 	} else {
1574 		/* inside the hysteresis margin, we're good */
1575 		ioc->busy_level = 0;
1576 	}
1577 
1578 	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1579 
1580 	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1581 		u64 vrate = atomic64_read(&ioc->vtime_rate);
1582 		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1583 
1584 		/* rq_wait signal is always reliable, ignore user vrate_min */
1585 		if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1586 			vrate_min = VRATE_MIN;
1587 
1588 		/*
1589 		 * If vrate is out of bounds, apply clamp gradually as the
1590 		 * bounds can change abruptly.  Otherwise, apply busy_level
1591 		 * based adjustment.
1592 		 */
1593 		if (vrate < vrate_min) {
1594 			vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1595 					  100);
1596 			vrate = min(vrate, vrate_min);
1597 		} else if (vrate > vrate_max) {
1598 			vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1599 					  100);
1600 			vrate = max(vrate, vrate_max);
1601 		} else {
1602 			int idx = min_t(int, abs(ioc->busy_level),
1603 					ARRAY_SIZE(vrate_adj_pct) - 1);
1604 			u32 adj_pct = vrate_adj_pct[idx];
1605 
1606 			if (ioc->busy_level > 0)
1607 				adj_pct = 100 - adj_pct;
1608 			else
1609 				adj_pct = 100 + adj_pct;
1610 
1611 			vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1612 				      vrate_min, vrate_max);
1613 		}
1614 
1615 		trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1616 					   nr_lagging, nr_shortages,
1617 					   nr_surpluses);
1618 
1619 		atomic64_set(&ioc->vtime_rate, vrate);
1620 		ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1621 			ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1622 	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1623 		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1624 					   missed_ppm, rq_wait_pct, nr_lagging,
1625 					   nr_shortages, nr_surpluses);
1626 	}
1627 
1628 	ioc_refresh_params(ioc, false);
1629 
1630 	/*
1631 	 * This period is done.  Move onto the next one.  If nothing's
1632 	 * going on with the device, stop the timer.
1633 	 */
1634 	atomic64_inc(&ioc->cur_period);
1635 
1636 	if (ioc->running != IOC_STOP) {
1637 		if (!list_empty(&ioc->active_iocgs)) {
1638 			ioc_start_period(ioc, &now);
1639 		} else {
1640 			ioc->busy_level = 0;
1641 			ioc->running = IOC_IDLE;
1642 		}
1643 	}
1644 
1645 	spin_unlock_irq(&ioc->lock);
1646 }
1647 
1648 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1649 				    bool is_merge, u64 *costp)
1650 {
1651 	struct ioc *ioc = iocg->ioc;
1652 	u64 coef_seqio, coef_randio, coef_page;
1653 	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1654 	u64 seek_pages = 0;
1655 	u64 cost = 0;
1656 
1657 	switch (bio_op(bio)) {
1658 	case REQ_OP_READ:
1659 		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
1660 		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
1661 		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
1662 		break;
1663 	case REQ_OP_WRITE:
1664 		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
1665 		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
1666 		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
1667 		break;
1668 	default:
1669 		goto out;
1670 	}
1671 
1672 	if (iocg->cursor) {
1673 		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1674 		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1675 	}
1676 
1677 	if (!is_merge) {
1678 		if (seek_pages > LCOEF_RANDIO_PAGES) {
1679 			cost += coef_randio;
1680 		} else {
1681 			cost += coef_seqio;
1682 		}
1683 	}
1684 	cost += pages * coef_page;
1685 out:
1686 	*costp = cost;
1687 }
1688 
1689 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1690 {
1691 	u64 cost;
1692 
1693 	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1694 	return cost;
1695 }
1696 
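/*
 * Size-only cost of a completed request based on the number of sectors it
 * actually carried (blk_rq_stats_sectors()).  ioc_rqos_done() uses this to
 * subtract the expected transfer time from the measured completion latency
 * when checking the QoS latency targets.
 */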
1697 static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
1698 					 u64 *costp)
1699 {
1700 	unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
1701 
1702 	switch (req_op(rq)) {
1703 	case REQ_OP_READ:
1704 		*costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
1705 		break;
1706 	case REQ_OP_WRITE:
1707 		*costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
1708 		break;
1709 	default:
1710 		*costp = 0;
1711 	}
1712 }
1713 
1714 static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
1715 {
1716 	u64 cost;
1717 
1718 	calc_size_vtime_cost_builtin(rq, ioc, &cost);
1719 	return cost;
1720 }
1721 
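/*
 * Main throttling path.  Activate the cgroup, estimate the bio's absolute
 * vtime cost and scale it according to the cgroup's current hweight_inuse.
 * If the cgroup is within budget with no waiters or debt, the bio is issued
 * immediately.  Bios which must be issued regardless (root blkg or pending
 * fatal signal) are accounted as debt instead.  Everyone else waits on
 * iocg->waitq until iocg_kick_waitq() commits their cost.
 */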
1722 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1723 {
1724 	struct blkcg_gq *blkg = bio->bi_blkg;
1725 	struct ioc *ioc = rqos_to_ioc(rqos);
1726 	struct ioc_gq *iocg = blkg_to_iocg(blkg);
1727 	struct ioc_now now;
1728 	struct iocg_wait wait;
1729 	u32 hw_active, hw_inuse;
1730 	u64 abs_cost, cost, vtime;
1731 
1732 	/* bypass IOs if disabled or for root cgroup */
1733 	if (!ioc->enabled || !iocg->level)
1734 		return;
1735 
1736 	/* always activate so that even 0 cost IOs get protected to some level */
1737 	if (!iocg_activate(iocg, &now))
1738 		return;
1739 
1740 	/* calculate the absolute vtime cost */
1741 	abs_cost = calc_vtime_cost(bio, iocg, false);
1742 	if (!abs_cost)
1743 		return;
1744 
1745 	iocg->cursor = bio_end_sector(bio);
1746 
1747 	vtime = atomic64_read(&iocg->vtime);
1748 	current_hweight(iocg, &hw_active, &hw_inuse);
1749 
1750 	if (hw_inuse < hw_active &&
1751 	    time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1752 		TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1753 				iocg->inuse, iocg->weight, hw_inuse, hw_active);
1754 		spin_lock_irq(&ioc->lock);
1755 		propagate_active_weight(iocg, iocg->weight, iocg->weight);
1756 		spin_unlock_irq(&ioc->lock);
1757 		current_hweight(iocg, &hw_active, &hw_inuse);
1758 	}
1759 
1760 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1761 
1762 	/*
1763 	 * If no one's waiting and within budget, issue right away.  The
1764 	 * tests are racy but the races aren't systemic - we only miss once
1765 	 * in a while which is fine.
1766 	 */
1767 	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1768 	    time_before_eq64(vtime + cost, now.vnow)) {
1769 		iocg_commit_bio(iocg, bio, cost);
1770 		return;
1771 	}
1772 
1773 	/*
1774 	 * We activated above but w/o any synchronization. Deactivation is
1775 	 * synchronized with waitq.lock and we won't get deactivated as long
1776 	 * as we're waiting or have debt, so we're good if we're activated
1777 	 * here. In the unlikely case that we aren't, just issue the IO.
1778 	 */
1779 	spin_lock_irq(&iocg->waitq.lock);
1780 
1781 	if (unlikely(list_empty(&iocg->active_list))) {
1782 		spin_unlock_irq(&iocg->waitq.lock);
1783 		iocg_commit_bio(iocg, bio, cost);
1784 		return;
1785 	}
1786 
1787 	/*
1788 	 * We're over budget. If @bio has to be issued regardless, remember
1789 	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1790 	 * off the debt before waking more IOs.
1791 	 *
1792 	 * This way, the debt is continuously paid off each period with the
1793 	 * actual budget available to the cgroup. If we just wound vtime, we
1794 	 * would incorrectly use the current hw_inuse for the entire amount
1795 	 * which, for example, can lead to the cgroup staying blocked for a
1796 	 * long time even with substantially raised hw_inuse.
1797 	 *
1798 	 * An iocg with vdebt should stay online so that the timer can keep
1799 	 * deducting its vdebt and [de]activate the use_delay mechanism
1800 	 * accordingly. We don't want to race against the timer trying to
1801 	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
1802 	 * penalizing the cgroup and its descendants.
1803 	 */
1804 	if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1805 		iocg->abs_vdebt += abs_cost;
1806 		if (iocg_kick_delay(iocg, &now))
1807 			blkcg_schedule_throttle(rqos->q,
1808 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1809 		spin_unlock_irq(&iocg->waitq.lock);
1810 		return;
1811 	}
1812 
1813 	/*
1814 	 * Append self to the waitq and schedule the wakeup timer if we're
1815 	 * the first waiter.  The timer duration is calculated based on the
1816 	 * current vrate.  vtime and hweight changes can make it too short
1817 	 * or too long.  Each wait entry records the absolute cost it's
1818 	 * waiting for to allow re-evaluation using a custom wait entry.
1819 	 *
1820 	 * If too short, the timer simply reschedules itself.  If too long,
1821 	 * the period timer will notice and trigger wakeups.
1822 	 *
1823 	 * All waiters are on iocg->waitq and the wait states are
1824 	 * synchronized using waitq.lock.
1825 	 */
1826 	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1827 	wait.wait.private = current;
1828 	wait.bio = bio;
1829 	wait.abs_cost = abs_cost;
1830 	wait.committed = false;	/* will be set true by waker */
1831 
1832 	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1833 	iocg_kick_waitq(iocg, &now);
1834 
1835 	spin_unlock_irq(&iocg->waitq.lock);
1836 
1837 	while (true) {
1838 		set_current_state(TASK_UNINTERRUPTIBLE);
1839 		if (wait.committed)
1840 			break;
1841 		io_schedule();
1842 	}
1843 
1844 	/* waker already committed us, proceed */
1845 	finish_wait(&iocg->waitq, &wait.wait);
1846 }
1847 
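/*
 * A bio is being merged into an existing request.  Charge the additional
 * cost directly if there's enough vtime budget and the request already has
 * a cost assigned; otherwise account it as debt as in ioc_rqos_throttle().
 */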
1848 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1849 			   struct bio *bio)
1850 {
1851 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1852 	struct ioc *ioc = iocg->ioc;
1853 	sector_t bio_end = bio_end_sector(bio);
1854 	struct ioc_now now;
1855 	u32 hw_inuse;
1856 	u64 abs_cost, cost;
1857 	unsigned long flags;
1858 
1859 	/* bypass if disabled or for root cgroup */
1860 	if (!ioc->enabled || !iocg->level)
1861 		return;
1862 
1863 	abs_cost = calc_vtime_cost(bio, iocg, true);
1864 	if (!abs_cost)
1865 		return;
1866 
1867 	ioc_now(ioc, &now);
1868 	current_hweight(iocg, NULL, &hw_inuse);
1869 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1870 
1871 	/* update cursor if backmerging into the request at the cursor */
1872 	if (blk_rq_pos(rq) < bio_end &&
1873 	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1874 		iocg->cursor = bio_end;
1875 
1876 	/*
1877 	 * Charge if there's enough vtime budget and the existing request has
1878 	 * cost assigned.
1879 	 */
1880 	if (rq->bio && rq->bio->bi_iocost_cost &&
1881 	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
1882 		iocg_commit_bio(iocg, bio, cost);
1883 		return;
1884 	}
1885 
1886 	/*
1887 	 * Otherwise, account it as debt if @iocg is online, which it should
1888 	 * be for the vast majority of cases. See debt handling in
1889 	 * ioc_rqos_throttle() for details.
1890 	 */
1891 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1892 	if (likely(!list_empty(&iocg->active_list))) {
1893 		iocg->abs_vdebt += abs_cost;
1894 		iocg_kick_delay(iocg, &now);
1895 	} else {
1896 		iocg_commit_bio(iocg, bio, cost);
1897 	}
1898 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1899 }
1900 
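/* credit the charged cost back to done_vtime once the bio completes */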
1901 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1902 {
1903 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1904 
1905 	if (iocg && bio->bi_iocost_cost)
1906 		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1907 }
1908 
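/*
 * Completion bookkeeping for the QoS params.  The completion latency minus
 * the expected transfer time (size cost) is compared against the
 * per-direction latency target to count met/missed IOs, and the
 * allocation-to-start wait time is accumulated into the per-cpu rq_wait_ns.
 * Both feed the busy_level calculation in the next timer period.
 */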
1909 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1910 {
1911 	struct ioc *ioc = rqos_to_ioc(rqos);
1912 	u64 on_q_ns, rq_wait_ns, size_nsec;
1913 	int pidx, rw;
1914 
1915 	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1916 		return;
1917 
1918 	switch (req_op(rq) & REQ_OP_MASK) {
1919 	case REQ_OP_READ:
1920 		pidx = QOS_RLAT;
1921 		rw = READ;
1922 		break;
1923 	case REQ_OP_WRITE:
1924 		pidx = QOS_WLAT;
1925 		rw = WRITE;
1926 		break;
1927 	default:
1928 		return;
1929 	}
1930 
1931 	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1932 	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1933 	size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
1934 
1935 	if (on_q_ns <= size_nsec ||
1936 	    on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1937 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1938 	else
1939 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1940 
1941 	this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1942 }
1943 
1944 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1945 {
1946 	struct ioc *ioc = rqos_to_ioc(rqos);
1947 
1948 	spin_lock_irq(&ioc->lock);
1949 	ioc_refresh_params(ioc, false);
1950 	spin_unlock_irq(&ioc->lock);
1951 }
1952 
1953 static void ioc_rqos_exit(struct rq_qos *rqos)
1954 {
1955 	struct ioc *ioc = rqos_to_ioc(rqos);
1956 
1957 	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1958 
1959 	spin_lock_irq(&ioc->lock);
1960 	ioc->running = IOC_STOP;
1961 	spin_unlock_irq(&ioc->lock);
1962 
1963 	del_timer_sync(&ioc->timer);
1964 	free_percpu(ioc->pcpu_stat);
1965 	kfree(ioc);
1966 }
1967 
1968 static struct rq_qos_ops ioc_rqos_ops = {
1969 	.throttle = ioc_rqos_throttle,
1970 	.merge = ioc_rqos_merge,
1971 	.done_bio = ioc_rqos_done_bio,
1972 	.done = ioc_rqos_done,
1973 	.queue_depth_changed = ioc_rqos_queue_depth_changed,
1974 	.exit = ioc_rqos_exit,
1975 };
1976 
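/*
 * Set up iocost for @q: allocate the ioc, register it as an rq_qos policy
 * and activate the iocost blkcg policy.  Called lazily from the cgroup
 * interface writers when the queue doesn't have an ioc yet.
 */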
1977 static int blk_iocost_init(struct request_queue *q)
1978 {
1979 	struct ioc *ioc;
1980 	struct rq_qos *rqos;
1981 	int ret;
1982 
1983 	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1984 	if (!ioc)
1985 		return -ENOMEM;
1986 
1987 	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1988 	if (!ioc->pcpu_stat) {
1989 		kfree(ioc);
1990 		return -ENOMEM;
1991 	}
1992 
1993 	rqos = &ioc->rqos;
1994 	rqos->id = RQ_QOS_COST;
1995 	rqos->ops = &ioc_rqos_ops;
1996 	rqos->q = q;
1997 
1998 	spin_lock_init(&ioc->lock);
1999 	timer_setup(&ioc->timer, ioc_timer_fn, 0);
2000 	INIT_LIST_HEAD(&ioc->active_iocgs);
2001 
2002 	ioc->running = IOC_IDLE;
2003 	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2004 	seqcount_init(&ioc->period_seqcount);
2005 	ioc->period_at = ktime_to_us(ktime_get());
2006 	atomic64_set(&ioc->cur_period, 0);
2007 	atomic_set(&ioc->hweight_gen, 0);
2008 
2009 	spin_lock_irq(&ioc->lock);
2010 	ioc->autop_idx = AUTOP_INVALID;
2011 	ioc_refresh_params(ioc, true);
2012 	spin_unlock_irq(&ioc->lock);
2013 
2014 	rq_qos_add(q, rqos);
2015 	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2016 	if (ret) {
2017 		rq_qos_del(q, rqos);
2018 		free_percpu(ioc->pcpu_stat);
2019 		kfree(ioc);
2020 		return ret;
2021 	}
2022 	return 0;
2023 }
2024 
2025 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2026 {
2027 	struct ioc_cgrp *iocc;
2028 
2029 	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2030 	if (!iocc)
2031 		return NULL;
2032 
2033 	iocc->dfl_weight = CGROUP_WEIGHT_DFL;
2034 	return &iocc->cpd;
2035 }
2036 
2037 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2038 {
2039 	kfree(container_of(cpd, struct ioc_cgrp, cpd));
2040 }
2041 
2042 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2043 					     struct blkcg *blkcg)
2044 {
2045 	int levels = blkcg->css.cgroup->level + 1;
2046 	struct ioc_gq *iocg;
2047 
2048 	iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
2049 			    gfp, q->node);
2050 	if (!iocg)
2051 		return NULL;
2052 
2053 	return &iocg->pd;
2054 }
2055 
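/*
 * Initialize a newly created iocg: vtime and done_vtime start at the
 * current vnow, hweights default to HWEIGHT_WHOLE, the ancestors[] table is
 * filled for hierarchical weight walks and the configured weight is
 * propagated under ioc->lock.
 */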
2056 static void ioc_pd_init(struct blkg_policy_data *pd)
2057 {
2058 	struct ioc_gq *iocg = pd_to_iocg(pd);
2059 	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2060 	struct ioc *ioc = q_to_ioc(blkg->q);
2061 	struct ioc_now now;
2062 	struct blkcg_gq *tblkg;
2063 	unsigned long flags;
2064 
2065 	ioc_now(ioc, &now);
2066 
2067 	iocg->ioc = ioc;
2068 	atomic64_set(&iocg->vtime, now.vnow);
2069 	atomic64_set(&iocg->done_vtime, now.vnow);
2070 	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2071 	INIT_LIST_HEAD(&iocg->active_list);
2072 	iocg->hweight_active = HWEIGHT_WHOLE;
2073 	iocg->hweight_inuse = HWEIGHT_WHOLE;
2074 
2075 	init_waitqueue_head(&iocg->waitq);
2076 	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2077 	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2078 	hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2079 	iocg->delay_timer.function = iocg_delay_timer_fn;
2080 
2081 	iocg->level = blkg->blkcg->css.cgroup->level;
2082 
2083 	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2084 		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2085 		iocg->ancestors[tiocg->level] = tiocg;
2086 	}
2087 
2088 	spin_lock_irqsave(&ioc->lock, flags);
2089 	weight_updated(iocg);
2090 	spin_unlock_irqrestore(&ioc->lock, flags);
2091 }
2092 
2093 static void ioc_pd_free(struct blkg_policy_data *pd)
2094 {
2095 	struct ioc_gq *iocg = pd_to_iocg(pd);
2096 	struct ioc *ioc = iocg->ioc;
2097 
2098 	if (ioc) {
2099 		spin_lock(&ioc->lock);
2100 		if (!list_empty(&iocg->active_list)) {
2101 			propagate_active_weight(iocg, 0, 0);
2102 			list_del_init(&iocg->active_list);
2103 		}
2104 		spin_unlock(&ioc->lock);
2105 
2106 		hrtimer_cancel(&iocg->waitq_timer);
2107 		hrtimer_cancel(&iocg->delay_timer);
2108 	}
2109 	kfree(iocg);
2110 }
2111 
2112 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2113 			     int off)
2114 {
2115 	const char *dname = blkg_dev_name(pd->blkg);
2116 	struct ioc_gq *iocg = pd_to_iocg(pd);
2117 
2118 	if (dname && iocg->cfg_weight)
2119 		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2120 	return 0;
2121 }
2122 
2124 static int ioc_weight_show(struct seq_file *sf, void *v)
2125 {
2126 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2127 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2128 
2129 	seq_printf(sf, "default %u\n", iocc->dfl_weight);
2130 	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2131 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2132 	return 0;
2133 }
2134 
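/*
 * io.weight writes take either "default $WEIGHT" (or a bare "$WEIGHT") to
 * set the cgroup-wide default, or "$MAJ:$MIN $WEIGHT" to set a per-device
 * weight, with "$MAJ:$MIN default" clearing it.  Weights must be within
 * [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX].
 */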
2135 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2136 				size_t nbytes, loff_t off)
2137 {
2138 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
2139 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2140 	struct blkg_conf_ctx ctx;
2141 	struct ioc_gq *iocg;
2142 	u32 v;
2143 	int ret;
2144 
2145 	if (!strchr(buf, ':')) {
2146 		struct blkcg_gq *blkg;
2147 
2148 		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2149 			return -EINVAL;
2150 
2151 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2152 			return -EINVAL;
2153 
2154 		spin_lock(&blkcg->lock);
2155 		iocc->dfl_weight = v;
2156 		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2157 			struct ioc_gq *iocg = blkg_to_iocg(blkg);
2158 
2159 			if (iocg) {
2160 				spin_lock_irq(&iocg->ioc->lock);
2161 				weight_updated(iocg);
2162 				spin_unlock_irq(&iocg->ioc->lock);
2163 			}
2164 		}
2165 		spin_unlock(&blkcg->lock);
2166 
2167 		return nbytes;
2168 	}
2169 
2170 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2171 	if (ret)
2172 		return ret;
2173 
2174 	iocg = blkg_to_iocg(ctx.blkg);
2175 
2176 	if (!strncmp(ctx.body, "default", 7)) {
2177 		v = 0;
2178 	} else {
2179 		if (!sscanf(ctx.body, "%u", &v))
2180 			goto einval;
2181 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2182 			goto einval;
2183 	}
2184 
2185 	spin_lock(&iocg->ioc->lock);
2186 	iocg->cfg_weight = v;
2187 	weight_updated(iocg);
2188 	spin_unlock(&iocg->ioc->lock);
2189 
2190 	blkg_conf_finish(&ctx);
2191 	return nbytes;
2192 
2193 einval:
2194 	blkg_conf_finish(&ctx);
2195 	return -EINVAL;
2196 }
2197 
2198 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2199 			  int off)
2200 {
2201 	const char *dname = blkg_dev_name(pd->blkg);
2202 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2203 
2204 	if (!dname)
2205 		return 0;
2206 
2207 	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2208 		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2209 		   ioc->params.qos[QOS_RPPM] / 10000,
2210 		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
2211 		   ioc->params.qos[QOS_RLAT],
2212 		   ioc->params.qos[QOS_WPPM] / 10000,
2213 		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
2214 		   ioc->params.qos[QOS_WLAT],
2215 		   ioc->params.qos[QOS_MIN] / 10000,
2216 		   ioc->params.qos[QOS_MIN] % 10000 / 100,
2217 		   ioc->params.qos[QOS_MAX] / 10000,
2218 		   ioc->params.qos[QOS_MAX] % 10000 / 100);
2219 	return 0;
2220 }
2221 
2222 static int ioc_qos_show(struct seq_file *sf, void *v)
2223 {
2224 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2225 
2226 	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2227 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2228 	return 0;
2229 }
2230 
2231 static const match_table_t qos_ctrl_tokens = {
2232 	{ QOS_ENABLE,		"enable=%u"	},
2233 	{ QOS_CTRL,		"ctrl=%s"	},
2234 	{ NR_QOS_CTRL_PARAMS,	NULL		},
2235 };
2236 
2237 static const match_table_t qos_tokens = {
2238 	{ QOS_RPPM,		"rpct=%s"	},
2239 	{ QOS_RLAT,		"rlat=%u"	},
2240 	{ QOS_WPPM,		"wpct=%s"	},
2241 	{ QOS_WLAT,		"wlat=%u"	},
2242 	{ QOS_MIN,		"min=%s"	},
2243 	{ QOS_MAX,		"max=%s"	},
2244 	{ NR_QOS_PARAMS,	NULL		},
2245 };
2246 
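/*
 * io.cost.qos writes take "MAJ:MIN" followed by key=value pairs from the
 * tables above: enable=0|1, ctrl=auto|user, rpct/wpct (target percentiles
 * for meeting rlat/wlat, two decimals), rlat/wlat (latency targets in
 * usecs) and min/max (vrate bounds in pct, two decimals).  Setting any QoS
 * parameter switches ctrl to user, and min must not exceed max.
 */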
2247 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2248 			     size_t nbytes, loff_t off)
2249 {
2250 	struct gendisk *disk;
2251 	struct ioc *ioc;
2252 	u32 qos[NR_QOS_PARAMS];
2253 	bool enable, user;
2254 	char *p;
2255 	int ret;
2256 
2257 	disk = blkcg_conf_get_disk(&input);
2258 	if (IS_ERR(disk))
2259 		return PTR_ERR(disk);
2260 
2261 	ioc = q_to_ioc(disk->queue);
2262 	if (!ioc) {
2263 		ret = blk_iocost_init(disk->queue);
2264 		if (ret)
2265 			goto err;
2266 		ioc = q_to_ioc(disk->queue);
2267 	}
2268 
2269 	spin_lock_irq(&ioc->lock);
2270 	memcpy(qos, ioc->params.qos, sizeof(qos));
2271 	enable = ioc->enabled;
2272 	user = ioc->user_qos_params;
2273 	spin_unlock_irq(&ioc->lock);
2274 
2275 	while ((p = strsep(&input, " \t\n"))) {
2276 		substring_t args[MAX_OPT_ARGS];
2277 		char buf[32];
2278 		int tok;
2279 		s64 v;
2280 
2281 		if (!*p)
2282 			continue;
2283 
2284 		switch (match_token(p, qos_ctrl_tokens, args)) {
2285 		case QOS_ENABLE:
2286 			match_u64(&args[0], &v);
2287 			enable = v;
2288 			continue;
2289 		case QOS_CTRL:
2290 			match_strlcpy(buf, &args[0], sizeof(buf));
2291 			if (!strcmp(buf, "auto"))
2292 				user = false;
2293 			else if (!strcmp(buf, "user"))
2294 				user = true;
2295 			else
2296 				goto einval;
2297 			continue;
2298 		}
2299 
2300 		tok = match_token(p, qos_tokens, args);
2301 		switch (tok) {
2302 		case QOS_RPPM:
2303 		case QOS_WPPM:
2304 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2305 			    sizeof(buf))
2306 				goto einval;
2307 			if (cgroup_parse_float(buf, 2, &v))
2308 				goto einval;
2309 			if (v < 0 || v > 10000)
2310 				goto einval;
2311 			qos[tok] = v * 100;
2312 			break;
2313 		case QOS_RLAT:
2314 		case QOS_WLAT:
2315 			if (match_u64(&args[0], &v))
2316 				goto einval;
2317 			qos[tok] = v;
2318 			break;
2319 		case QOS_MIN:
2320 		case QOS_MAX:
2321 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2322 			    sizeof(buf))
2323 				goto einval;
2324 			if (cgroup_parse_float(buf, 2, &v))
2325 				goto einval;
2326 			if (v < 0)
2327 				goto einval;
2328 			qos[tok] = clamp_t(s64, v * 100,
2329 					   VRATE_MIN_PPM, VRATE_MAX_PPM);
2330 			break;
2331 		default:
2332 			goto einval;
2333 		}
2334 		user = true;
2335 	}
2336 
2337 	if (qos[QOS_MIN] > qos[QOS_MAX])
2338 		goto einval;
2339 
2340 	spin_lock_irq(&ioc->lock);
2341 
2342 	if (enable) {
2343 		blk_stat_enable_accounting(ioc->rqos.q);
2344 		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2345 		ioc->enabled = true;
2346 	} else {
2347 		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2348 		ioc->enabled = false;
2349 	}
2350 
2351 	if (user) {
2352 		memcpy(ioc->params.qos, qos, sizeof(qos));
2353 		ioc->user_qos_params = true;
2354 	} else {
2355 		ioc->user_qos_params = false;
2356 	}
2357 
2358 	ioc_refresh_params(ioc, true);
2359 	spin_unlock_irq(&ioc->lock);
2360 
2361 	put_disk_and_module(disk);
2362 	return nbytes;
2363 einval:
2364 	ret = -EINVAL;
2365 err:
2366 	put_disk_and_module(disk);
2367 	return ret;
2368 }
2369 
2370 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2371 				 struct blkg_policy_data *pd, int off)
2372 {
2373 	const char *dname = blkg_dev_name(pd->blkg);
2374 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2375 	u64 *u = ioc->params.i_lcoefs;
2376 
2377 	if (!dname)
2378 		return 0;
2379 
2380 	seq_printf(sf, "%s ctrl=%s model=linear "
2381 		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
2382 		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2383 		   dname, ioc->user_cost_model ? "user" : "auto",
2384 		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2385 		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2386 	return 0;
2387 }
2388 
2389 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2390 {
2391 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2392 
2393 	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2394 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2395 	return 0;
2396 }
2397 
2398 static const match_table_t cost_ctrl_tokens = {
2399 	{ COST_CTRL,		"ctrl=%s"	},
2400 	{ COST_MODEL,		"model=%s"	},
2401 	{ NR_COST_CTRL_PARAMS,	NULL		},
2402 };
2403 
2404 static const match_table_t i_lcoef_tokens = {
2405 	{ I_LCOEF_RBPS,		"rbps=%u"	},
2406 	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
2407 	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
2408 	{ I_LCOEF_WBPS,		"wbps=%u"	},
2409 	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
2410 	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
2411 	{ NR_I_LCOEFS,		NULL		},
2412 };
2413 
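/*
 * io.cost.model writes take "MAJ:MIN" followed by ctrl=auto|user,
 * model=linear (the only builtin model) and the six linear coefficients
 * rbps, rseqiops, rrandiops, wbps, wseqiops and wrandiops.  Setting any
 * coefficient switches ctrl to user.
 */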
2414 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2415 				    size_t nbytes, loff_t off)
2416 {
2417 	struct gendisk *disk;
2418 	struct ioc *ioc;
2419 	u64 u[NR_I_LCOEFS];
2420 	bool user;
2421 	char *p;
2422 	int ret;
2423 
2424 	disk = blkcg_conf_get_disk(&input);
2425 	if (IS_ERR(disk))
2426 		return PTR_ERR(disk);
2427 
2428 	ioc = q_to_ioc(disk->queue);
2429 	if (!ioc) {
2430 		ret = blk_iocost_init(disk->queue);
2431 		if (ret)
2432 			goto err;
2433 		ioc = q_to_ioc(disk->queue);
2434 	}
2435 
2436 	spin_lock_irq(&ioc->lock);
2437 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2438 	user = ioc->user_cost_model;
2439 	spin_unlock_irq(&ioc->lock);
2440 
2441 	while ((p = strsep(&input, " \t\n"))) {
2442 		substring_t args[MAX_OPT_ARGS];
2443 		char buf[32];
2444 		int tok;
2445 		u64 v;
2446 
2447 		if (!*p)
2448 			continue;
2449 
2450 		switch (match_token(p, cost_ctrl_tokens, args)) {
2451 		case COST_CTRL:
2452 			match_strlcpy(buf, &args[0], sizeof(buf));
2453 			if (!strcmp(buf, "auto"))
2454 				user = false;
2455 			else if (!strcmp(buf, "user"))
2456 				user = true;
2457 			else
2458 				goto einval;
2459 			continue;
2460 		case COST_MODEL:
2461 			match_strlcpy(buf, &args[0], sizeof(buf));
2462 			if (strcmp(buf, "linear"))
2463 				goto einval;
2464 			continue;
2465 		}
2466 
2467 		tok = match_token(p, i_lcoef_tokens, args);
2468 		if (tok == NR_I_LCOEFS)
2469 			goto einval;
2470 		if (match_u64(&args[0], &v))
2471 			goto einval;
2472 		u[tok] = v;
2473 		user = true;
2474 	}
2475 
2476 	spin_lock_irq(&ioc->lock);
2477 	if (user) {
2478 		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2479 		ioc->user_cost_model = true;
2480 	} else {
2481 		ioc->user_cost_model = false;
2482 	}
2483 	ioc_refresh_params(ioc, true);
2484 	spin_unlock_irq(&ioc->lock);
2485 
2486 	put_disk_and_module(disk);
2487 	return nbytes;
2488 
2489 einval:
2490 	ret = -EINVAL;
2491 err:
2492 	put_disk_and_module(disk);
2493 	return ret;
2494 }
2495 
2496 static struct cftype ioc_files[] = {
2497 	{
2498 		.name = "weight",
2499 		.flags = CFTYPE_NOT_ON_ROOT,
2500 		.seq_show = ioc_weight_show,
2501 		.write = ioc_weight_write,
2502 	},
2503 	{
2504 		.name = "cost.qos",
2505 		.flags = CFTYPE_ONLY_ON_ROOT,
2506 		.seq_show = ioc_qos_show,
2507 		.write = ioc_qos_write,
2508 	},
2509 	{
2510 		.name = "cost.model",
2511 		.flags = CFTYPE_ONLY_ON_ROOT,
2512 		.seq_show = ioc_cost_model_show,
2513 		.write = ioc_cost_model_write,
2514 	},
2515 	{}
2516 };
2517 
2518 static struct blkcg_policy blkcg_policy_iocost = {
2519 	.dfl_cftypes	= ioc_files,
2520 	.cpd_alloc_fn	= ioc_cpd_alloc,
2521 	.cpd_free_fn	= ioc_cpd_free,
2522 	.pd_alloc_fn	= ioc_pd_alloc,
2523 	.pd_init_fn	= ioc_pd_init,
2524 	.pd_free_fn	= ioc_pd_free,
2525 };
2526 
2527 static int __init ioc_init(void)
2528 {
2529 	return blkcg_policy_register(&blkcg_policy_iocost);
2530 }
2531 
2532 static void __exit ioc_exit(void)
2533 {
2534 	return blkcg_policy_unregister(&blkcg_policy_iocost);
2535 }
2536 
2537 module_init(ioc_init);
2538 module_exit(ioc_exit);
2539