xref: /openbmc/linux/block/blk-iocost.c (revision 0760aad038b5a032c31ea124feed63d88627d2f1)
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * IO cost model based controller.
4  *
5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
7  * Copyright (C) 2019 Facebook
8  *
9  * One challenge of controlling IO resources is the lack of trivially
10  * observable cost metric.  This is distinguished from CPU and memory where
11  * wallclock time and the number of bytes can serve as accurate enough
12  * approximations.
13  *
14  * Bandwidth and iops are the most commonly used metrics for IO devices but
15  * depending on the type and specifics of the device, different IO patterns
16  * easily lead to multiple orders of magnitude variations rendering them
17  * useless for the purpose of IO capacity distribution.  While on-device
 * time, with a lot of crutches, could serve as a useful approximation for
19  * non-queued rotational devices, this is no longer viable with modern
20  * devices, even the rotational ones.
21  *
22  * While there is no cost metric we can trivially observe, it isn't a
23  * complete mystery.  For example, on a rotational device, seek cost
24  * dominates while a contiguous transfer contributes a smaller amount
25  * proportional to the size.  If we can characterize at least the relative
26  * costs of these different types of IOs, it should be possible to
27  * implement a reasonable work-conserving proportional IO resource
28  * distribution.
29  *
30  * 1. IO Cost Model
31  *
32  * IO cost model estimates the cost of an IO given its basic parameters and
33  * history (e.g. the end sector of the last IO).  The cost is measured in
34  * device time.  If a given IO is estimated to cost 10ms, the device should
35  * be able to process ~100 of those IOs in a second.
36  *
37  * Currently, there's only one builtin cost model - linear.  Each IO is
38  * classified as sequential or random and given a base cost accordingly.
39  * On top of that, a size cost proportional to the length of the IO is
40  * added.  While simple, this model captures the operational
 * characteristics of a wide variety of devices well enough.  Default
 * parameters for several different classes of devices are provided and the
43  * parameters can be configured from userspace via
44  * /sys/fs/cgroup/io.cost.model.
45  *
46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47  * device-specific coefficients.
48  *
49  * 2. Control Strategy
50  *
51  * The device virtual time (vtime) is used as the primary control metric.
52  * The control strategy is composed of the following three parts.
53  *
54  * 2-1. Vtime Distribution
55  *
56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
57  * calculated.  Please consider the following hierarchy where the numbers
58  * inside parentheses denote the configured weights.
59  *
60  *           root
61  *         /       \
62  *      A (w:100)  B (w:300)
63  *      /       \
64  *  A0 (w:100)  A1 (w:100)
65  *
66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
 * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
 * 12.5% each.  The distribution mechanism only cares about these flattened
 * shares.  They're called hweights (hierarchical weights) and always add
 * up to 1 (HWEIGHT_WHOLE).
72  *
73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75  * against the device vtime - an IO which takes 10ms on the underlying
76  * device is considered to take 80ms on A0.
77  *
78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
80  * the vtime consumed by past IOs and can issue a new IO iff doing so
81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
82  * suspended until the vtime has progressed enough to cover it.
83  *
84  * 2-2. Vrate Adjustment
85  *
86  * It's unrealistic to expect the cost model to be perfect.  There are too
87  * many devices and even on the same device the overall performance
88  * fluctuates depending on numerous factors such as IO mixture and device
89  * internal garbage collection.  The controller needs to adapt dynamically.
90  *
91  * This is achieved by adjusting the overall IO rate according to how busy
92  * the device is.  If the device becomes overloaded, we're sending down too
93  * many IOs and should generally slow down.  If there are waiting issuers
94  * but the device isn't saturated, we're issuing too few and should
95  * generally speed up.
96  *
97  * To slow down, we lower the vrate - the rate at which the device vtime
98  * passes compared to the wall clock.  For example, if the vtime is running
99  * at the vrate of 75%, all cgroups added up would only be able to issue
100  * 750ms worth of IOs per second, and vice-versa for speeding up.
101  *
 * Device busyness is determined using two criteria - rq wait and
103  * completion latencies.
104  *
105  * When a device gets saturated, the on-device and then the request queues
106  * fill up and a bio which is ready to be issued has to wait for a request
107  * to become available.  When this delay becomes noticeable, it's a clear
108  * indication that the device is saturated and we lower the vrate.  This
109  * saturation signal is fairly conservative as it only triggers when both
110  * hardware and software queues are filled up, and is used as the default
111  * busy signal.
112  *
113  * As devices can have deep queues and be unfair in how the queued commands
 * are executed, solely depending on rq wait may not result in satisfactory
115  * control quality.  For a better control quality, completion latency QoS
116  * parameters can be configured so that the device is considered saturated
117  * if N'th percentile completion latency rises above the set point.
118  *
119  * The completion latency requirements are a function of both the
120  * underlying device characteristics and the desired IO latency quality of
121  * service.  There is an inherent trade-off - the tighter the latency QoS,
122  * the higher the bandwidth lossage.  Latency QoS is disabled by default
123  * and can be set through /sys/fs/cgroup/io.cost.qos.
124  *
125  * 2-3. Work Conservation
126  *
127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
128  * periodically while B is sending out enough parallel IOs to saturate the
129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
130  * cost per second, i.e., 10% of the device capacity.  The naive
131  * distribution of half and half would lead to 60% utilization of the
132  * device, a significant reduction in the total amount of work done
133  * compared to free-for-all competition.  This is too high a cost to pay
134  * for IO control.
135  *
136  * To conserve the total amount of work done, we keep track of how much
137  * each active cgroup is actually using and yield part of its weight if
138  * there are other cgroups which can make use of it.  In the above case,
139  * A's weight will be lowered so that it hovers above the actual usage and
140  * B would be able to use the rest.
141  *
142  * As we don't want to penalize a cgroup for donating its weight, the
143  * surplus weight adjustment factors in a margin and has an immediate
144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
145  *
146  * Note that adjusting down surplus weights has the same effects as
147  * accelerating vtime for other cgroups and work conservation can also be
148  * implemented by adjusting vrate dynamically.  However, squaring who can
149  * donate and should take back how much requires hweight propagations
150  * anyway making it easier to implement and understand as a separate
151  * mechanism.
152  *
153  * 3. Monitoring
154  *
155  * Instead of debugfs or other clumsy monitoring mechanisms, this
156  * controller uses a drgn based monitoring script -
157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
 * https://github.com/osandov/drgn.  The output looks like the following.
159  *
160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
161  *                 active      weight      hweight% inflt% dbt  delay usages%
162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
164  *
165  * - per	: Timer period
166  * - cur_per	: Internal wall and device vtime clock
167  * - vrate	: Device virtual time rate against wall clock
168  * - weight	: Surplus-adjusted and configured weights
169  * - hweight	: Surplus-adjusted and configured hierarchical weights
170  * - inflt	: The percentage of in-flight IO cost at the end of last period
171  * - del_ms	: Deferred issuer delay induction level and duration
172  * - usages	: Usage history
173  */
174 
175 #include <linux/kernel.h>
176 #include <linux/module.h>
177 #include <linux/timer.h>
178 #include <linux/time64.h>
179 #include <linux/parser.h>
180 #include <linux/sched/signal.h>
181 #include <linux/blk-cgroup.h>
182 #include "blk-rq-qos.h"
183 #include "blk-stat.h"
184 #include "blk-wbt.h"
185 
#ifdef CONFIG_TRACEPOINTS

/* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
#define TRACE_IOCG_PATH_LEN 1024
/* serializes use of the single shared path buffer below */
static DEFINE_SPINLOCK(trace_iocg_path_lock);
/* scratch buffer the cgroup path is formatted into before tracing */
static char trace_iocg_path[TRACE_IOCG_PATH_LEN];

/*
 * Emit the iocost_@type tracepoint for @iocg, passing the path of the
 * cgroup @iocg belongs to followed by any extra arguments.
 *
 * Nothing is done unless the tracepoint is enabled.  When it is, the
 * path is formatted into the shared trace_iocg_path buffer and the
 * tracepoint is fired, both while holding trace_iocg_path_lock with
 * IRQs disabled so that concurrent users can't clobber the buffer.
 */
#define TRACE_IOCG_PATH(type, iocg, ...)					\
	do {									\
		unsigned long flags;						\
		if (trace_iocost_##type##_enabled()) {				\
			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
			trace_iocost_##type(iocg, trace_iocg_path,		\
					      ##__VA_ARGS__);			\
			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
		}								\
	} while (0)

#else	/* CONFIG_TRACEPOINTS */
/* tracepoints compiled out - expands to an empty statement */
#define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
#endif	/* CONFIG_TRACEPOINTS */
209 
/* tunables and fixed-point scaling constants used throughout the controller */
enum {
	MILLION			= 1000000,

	/* timer period is calculated from latency requirements, bound it */
	MIN_PERIOD		= USEC_PER_MSEC,
	MAX_PERIOD		= USEC_PER_SEC,

	/*
	 * A cgroup's vtime can run 50% behind the device vtime, which
	 * serves as its IO credit buffer.  Surplus weight adjustment is
	 * immediately canceled if the vtime margin runs below 10%.
	 */
	MARGIN_PCT		= 50,
	INUSE_MARGIN_PCT	= 10,

	/* Have some play in waitq timer operations */
	WAITQ_TIMER_MARGIN_PCT	= 5,

	/*
	 * vtime can wrap well within a reasonable uptime when vrate is
	 * consistently raised.  Don't trust recorded cgroup vtime if the
	 * period counter indicates that it's older than 5mins.
	 */
	VTIME_VALID_DUR		= 300 * USEC_PER_SEC,

	/*
	 * Remember the past three non-zero usages and use the max for
	 * surplus calculation.  Three slots guarantee that we remember one
	 * full period usage from the last active stretch even after
	 * partial deactivation and re-activation periods.  Don't start
	 * giving away weight before collecting two data points to prevent
	 * hweight adjustments based on one partial activation period.
	 */
	NR_USAGE_SLOTS		= 3,
	MIN_VALID_USAGES	= 2,

	/* 1/64k is granular enough and can easily be handled w/ u32 */
	HWEIGHT_WHOLE		= 1 << 16,

	/*
	 * As vtime is used to calculate the cost of each IO, it needs to
	 * be fairly high precision.  For example, it should be able to
	 * represent the cost of a single page worth of discard with
	 * sufficient accuracy.  At the same time, it should be able to
	 * represent reasonably long enough durations to be useful and
	 * convenient during operation.
	 *
	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
	 * granularity and days of wrap-around time even at extreme vrates.
	 */
	VTIME_PER_SEC_SHIFT	= 37,
	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
	VTIME_PER_NSEC		= VTIME_PER_SEC / NSEC_PER_SEC,

	/* bound vrate adjustments within two orders of magnitude */
	VRATE_MIN_PPM		= 10000,	/* 1% */
	VRATE_MAX_PPM		= 100000000,	/* 10000% */

	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
	VRATE_CLAMP_ADJ_PCT	= 4,

	/* if IOs end up waiting for requests, issue less */
	RQ_WAIT_BUSY_PCT	= 5,

	/* unbusy hysteresis */
	UNBUSY_THR_PCT		= 75,

	/* don't let cmds which take a very long time pin lagging for too long */
	MAX_LAGGING_PERIODS	= 10,

	/*
	 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
	 * donate the surplus.
	 */
	SURPLUS_SCALE_PCT	= 125,			/* * 125% */
	SURPLUS_SCALE_ABS	= HWEIGHT_WHOLE / 50,	/* + 2% */
	SURPLUS_MIN_ADJ_DELTA	= HWEIGHT_WHOLE / 33,	/* 3% */

	/* switch iff the conditions are met for longer than this */
	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,

	/*
	 * Count IO size in 4k pages.  The 12bit shift helps keeping
	 * size-proportional components of cost calculation in closer
	 * numbers of digits to per-IO cost components.
	 */
	IOC_PAGE_SHIFT		= 12,
	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,

	/* if apart further than 16M, consider randio for linear model */
	LCOEF_RANDIO_PAGES	= 4096,
};
304 
/*
 * Run state of an ioc, stored in ioc->running.  ioc_start_period() warns
 * if it's called in any state other than IOC_RUNNING.
 */
enum ioc_running {
	IOC_IDLE,
	IOC_RUNNING,
	IOC_STOP,
};
310 
/*
 * io.cost.qos controls including per-dev enable of the whole controller.
 * NR_QOS_CTRL_PARAMS is the number of entries, not a control itself.
 */
enum {
	QOS_ENABLE,
	QOS_CTRL,
	NR_QOS_CTRL_PARAMS,
};
317 
/*
 * io.cost.qos params.  These index ioc_params.qos[].
 *
 * The *PPM/*LAT pairs are the read and write latency percentile and
 * target as consumed by ioc_refresh_period_us().  QOS_MIN and QOS_MAX are
 * in ppm and bound the vrate, see ioc_refresh_params().
 */
enum {
	QOS_RPPM,
	QOS_RLAT,
	QOS_WPPM,
	QOS_WLAT,
	QOS_MIN,
	QOS_MAX,
	NR_QOS_PARAMS,
};
328 
/*
 * io.cost.model controls.  NR_COST_CTRL_PARAMS is the number of entries,
 * not a control itself.
 */
enum {
	COST_CTRL,
	COST_MODEL,
	NR_COST_CTRL_PARAMS,
};
335 
/*
 * builtin linear cost model coefficients
 *
 * These are the input-side coefficients and index ioc_params.i_lcoefs[]:
 * throughput, sequential iops and random iops, for reads (R*) and
 * writes (W*).  ioc_refresh_lcoefs() feeds them to calc_lcoefs() to
 * produce the LCOEF_* values.
 */
enum {
	I_LCOEF_RBPS,
	I_LCOEF_RSEQIOPS,
	I_LCOEF_RRANDIOPS,
	I_LCOEF_WBPS,
	I_LCOEF_WSEQIOPS,
	I_LCOEF_WRANDIOPS,
	NR_I_LCOEFS,
};
346 
/*
 * Derived linear model cost coefficients, indexing ioc_params.lcoefs[].
 * For each direction there's a per-page cost and the base costs of a
 * sequential and a random IO; see calc_lcoefs() for how they're computed
 * from the I_LCOEF_* inputs.
 */
enum {
	LCOEF_RPAGE,
	LCOEF_RSEQIO,
	LCOEF_RRANDIO,
	LCOEF_WPAGE,
	LCOEF_WSEQIO,
	LCOEF_WRANDIO,
	NR_LCOEFS,
};
356 
/*
 * Indices into the builtin autop[] parameter presets.  The order matters:
 * ioc_autop_idx() compares against AUTOP_SSD_DFL and steps between
 * adjacent SSD presets with idx +/- 1.
 */
enum {
	AUTOP_INVALID,
	AUTOP_HDD,
	AUTOP_SSD_QD1,
	AUTOP_SSD_DFL,
	AUTOP_SSD_FAST,
};
364 
struct ioc_gq;

/*
 * A set of controller parameters.  Used both for the live per-device
 * configuration (ioc->params) and for the builtin autop[] presets.
 */
struct ioc_params {
	/* QoS parameters, indexed by QOS_* */
	u32				qos[NR_QOS_PARAMS];
	/* cost model inputs, indexed by I_LCOEF_* */
	u64				i_lcoefs[NR_I_LCOEFS];
	/* derived cost coefficients, indexed by LCOEF_*, see calc_lcoefs() */
	u64				lcoefs[NR_LCOEFS];
	/*
	 * vrate percentage thresholds used by ioc_autop_idx() to step to
	 * the next / previous preset once the condition has persisted for
	 * AUTOP_CYCLE_NSEC.  Zero disables the respective check.
	 */
	u32				too_fast_vrate_pct;
	u32				too_slow_vrate_pct;
};
374 
/*
 * Met/missed counters embedded in ioc_pcpu_stat.
 * NOTE(review): the users are outside this chunk; presumably nr_* count
 * completions which met / missed the latency target and last_* remember
 * the values seen at the previous period - confirm against the readers.
 */
struct ioc_missed {
	u32				nr_met;
	u32				nr_missed;
	u32				last_met;
	u32				last_missed;
};
381 
/* per-cpu statistics, hung off ioc->pcpu_stat */
struct ioc_pcpu_stat {
	/* NOTE(review): two slots, presumably one per IO direction - confirm */
	struct ioc_missed		missed[2];

	/* rq wait time in nsecs; last_* presumably is the previous snapshot */
	u64				rq_wait_ns;
	u64				last_rq_wait_ns;
};
388 
/* per device */
struct ioc {
	/* embedded rq_qos, see rqos_to_ioc() for the reverse mapping */
	struct rq_qos			rqos;

	bool				enabled;

	/* effective parameters, see ioc_refresh_params() */
	struct ioc_params		params;
	/* timer period and dependent margin, see ioc_refresh_period_us() */
	u32				period_us;
	u32				margin_us;
	/* vrate bounds derived from qos[QOS_MIN] and qos[QOS_MAX] */
	u64				vrate_min;
	u64				vrate_max;

	/* the parameter refresh and weight propagation paths assert this held */
	spinlock_t			lock;
	/* period timer, armed by ioc_start_period() */
	struct timer_list		timer;
	struct list_head		active_iocgs;	/* active cgroups */
	struct ioc_pcpu_stat __percpu	*pcpu_stat;

	enum ioc_running		running;
	/* current vrate; VTIME_PER_USEC corresponds to 100% */
	atomic64_t			vtime_rate;

	/* protects consistent reads of the two period_at* fields below */
	seqcount_spinlock_t		period_seqcount;
	u32				period_at;	/* wallclock starttime */
	u64				period_at_vtime; /* vtime starttime */

	atomic64_t			cur_period;	/* inc'd each period */
	int				busy_level;	/* saturation history */

	/* INUSE_MARGIN_PCT of a period in vtime, see ioc_refresh_period_us() */
	u64				inuse_margin_vtime;
	/* set by __propagate_active_weight(), see commit_active_weights() */
	bool				weights_updated;
	atomic_t			hweight_gen;	/* for lazy hweights */

	/* when the too fast / slow conditions started, see ioc_autop_idx() */
	u64				autop_too_fast_at;
	u64				autop_too_slow_at;
	/* index of the autop[] preset currently in use */
	int				autop_idx;
	/* set when the user overrides the respective parameters */
	bool				user_qos_params:1;
	bool				user_cost_model:1;
};
426 
/* per device-cgroup pair */
struct ioc_gq {
	/* embedded policy data, see pd_to_iocg() for the reverse mapping */
	struct blkg_policy_data		pd;
	/* the device this iocg belongs to */
	struct ioc			*ioc;

	/*
	 * An iocg can get its weight from two sources - an explicit
	 * per-device-cgroup configuration or the default weight of the
	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
	 * configuration.  `weight` is the effective considering both
	 * sources.
	 *
	 * When an idle cgroup becomes active its `active` goes from 0 to
	 * `weight`.  `inuse` is the surplus adjusted active weight.
	 * `active` and `inuse` are used to calculate `hweight_active` and
	 * `hweight_inuse`.
	 *
	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
	 * surplus adjustments.
	 */
	u32				cfg_weight;
	u32				weight;
	u32				active;
	u32				inuse;
	u32				last_inuse;

	sector_t			cursor;		/* to detect randio */

	/*
	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
	 * issued.  If lagging behind device vtime, the delta represents
	 * the currently available IO budget.  If running ahead, the
	 * overage.
	 *
	 * `done_vtime` is the same but progressed on completion rather
	 * than issue.  The delta behind `vtime` represents the cost of
	 * currently in-flight IOs.
	 *
	 * `last_vtime` is used to remember `vtime` at the end of the last
	 * period to calculate utilization.
	 */
	atomic64_t			vtime;
	atomic64_t			done_vtime;
	u64				abs_vdebt;
	u64				last_vtime;

	/*
	 * The period this iocg was last active in.  Used for deactivation
	 * and invalidating `vtime`.
	 */
	atomic64_t			active_period;
	struct list_head		active_list;

	/* see __propagate_active_weight() and current_hweight() for details */
	u64				child_active_sum;
	u64				child_inuse_sum;
	int				hweight_gen;
	u32				hweight_active;
	u32				hweight_inuse;
	bool				has_surplus;

	struct wait_queue_head		waitq;
	struct hrtimer			waitq_timer;
	struct hrtimer			delay_timer;

	/* usage is recorded as fractions of HWEIGHT_WHOLE */
	int				usage_idx;
	u32				usages[NR_USAGE_SLOTS];

	/* this iocg's depth in the hierarchy and ancestors including self */
	int				level;
	struct ioc_gq			*ancestors[];
};
500 
/* per cgroup */
struct ioc_cgrp {
	/* embedded policy data, see blkcg_to_iocc() for the reverse mapping */
	struct blkcg_policy_data	cpd;
	/* the cgroup's default weight, see the weight comment in ioc_gq */
	unsigned int			dfl_weight;
};
506 
/* snapshot of the current [v]time and vrate, filled in by ioc_now() */
struct ioc_now {
	u64				now_ns;		/* ktime_get() result */
	u32				now;		/* the same in usecs, truncated to 32bit */
	u64				vnow;		/* device vtime at @now */
	u64				vrate;		/* ioc->vtime_rate when sampled */
};
513 
/*
 * Wait queue entry wrapper carrying a bio and its absolute cost.
 * NOTE(review): the users are outside this chunk; presumably one of these
 * is queued on iocg->waitq for each bio waiting for budget and
 * `committed` is set once the cost has been charged - confirm.
 */
struct iocg_wait {
	struct wait_queue_entry		wait;
	struct bio			*bio;
	u64				abs_cost;
	bool				committed;
};
520 
/*
 * Context bundle: an iocg, an in-use hierarchical weight and a signed
 * vtime budget.
 * NOTE(review): the users are outside this chunk; presumably this is
 * passed to the waitq wake function - confirm.
 */
struct iocg_wake_ctx {
	struct ioc_gq			*iocg;
	u32				hw_inuse;
	s64				vbudget;
};
526 
/*
 * Builtin parameter presets indexed by AUTOP_*.  ioc_refresh_params()
 * copies .qos and .i_lcoefs from the selected entry unless the user has
 * overridden them.  The too_{fast|slow}_vrate_pct thresholds let
 * ioc_autop_idx() step between AUTOP_SSD_DFL and AUTOP_SSD_FAST.
 *
 * AUTOP_INVALID has no initializer and is thus all zeroes.  Latency
 * targets are in usecs as noted on each entry.
 */
static const struct ioc_params autop[] = {
	[AUTOP_HDD] = {
		.qos				= {
			[QOS_RLAT]		=        250000, /* 250ms */
			[QOS_WLAT]		=        250000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=     174019176,
			[I_LCOEF_RSEQIOPS]	=         41708,
			[I_LCOEF_RRANDIOPS]	=           370,
			[I_LCOEF_WBPS]		=     178075866,
			[I_LCOEF_WSEQIOPS]	=         42705,
			[I_LCOEF_WRANDIOPS]	=           378,
		},
	},
	[AUTOP_SSD_QD1] = {
		.qos				= {
			[QOS_RLAT]		=         25000, /* 25ms */
			[QOS_WLAT]		=         25000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=     245855193,
			[I_LCOEF_RSEQIOPS]	=         61575,
			[I_LCOEF_RRANDIOPS]	=          6946,
			[I_LCOEF_WBPS]		=     141365009,
			[I_LCOEF_WSEQIOPS]	=         33716,
			[I_LCOEF_WRANDIOPS]	=         26796,
		},
	},
	[AUTOP_SSD_DFL] = {
		.qos				= {
			[QOS_RLAT]		=         25000, /* 25ms */
			[QOS_WLAT]		=         25000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=     488636629,
			[I_LCOEF_RSEQIOPS]	=          8932,
			[I_LCOEF_RRANDIOPS]	=          8518,
			[I_LCOEF_WBPS]		=     427891549,
			[I_LCOEF_WSEQIOPS]	=         28755,
			[I_LCOEF_WRANDIOPS]	=         21940,
		},
		/* sustained vrate >= 500% moves on to AUTOP_SSD_FAST */
		.too_fast_vrate_pct		=           500,
	},
	[AUTOP_SSD_FAST] = {
		.qos				= {
			[QOS_RLAT]		=          5000, /* 5ms */
			[QOS_WLAT]		=          5000,
			[QOS_MIN]		= VRATE_MIN_PPM,
			[QOS_MAX]		= VRATE_MAX_PPM,
		},
		.i_lcoefs			= {
			[I_LCOEF_RBPS]		=    3102524156LLU,
			[I_LCOEF_RSEQIOPS]	=        724816,
			[I_LCOEF_RRANDIOPS]	=        778122,
			[I_LCOEF_WBPS]		=    1742780862LLU,
			[I_LCOEF_WSEQIOPS]	=        425702,
			[I_LCOEF_WRANDIOPS]	=	 443193,
		},
		/* sustained vrate <= 10% falls back to AUTOP_SSD_DFL */
		.too_slow_vrate_pct		=            10,
	},
};
595 
/*
 * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
 * vtime credit shortage and down on device saturation.
 *
 * The table is non-decreasing: the first four levels make no adjustment
 * and the step then grows from 1% up to 16% for the final entry.
 */
static u32 vrate_adj_pct[] =
	{ 0, 0, 0, 0,
	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
605 
606 static struct blkcg_policy blkcg_policy_iocost;
607 
608 /* accessors and helpers */
609 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
610 {
611 	return container_of(rqos, struct ioc, rqos);
612 }
613 
614 static struct ioc *q_to_ioc(struct request_queue *q)
615 {
616 	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
617 }
618 
619 static const char *q_name(struct request_queue *q)
620 {
621 	if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
622 		return kobject_name(q->kobj.parent);
623 	else
624 		return "<unknown>";
625 }
626 
627 static const char __maybe_unused *ioc_name(struct ioc *ioc)
628 {
629 	return q_name(ioc->rqos.q);
630 }
631 
632 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
633 {
634 	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
635 }
636 
637 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
638 {
639 	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
640 }
641 
642 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
643 {
644 	return pd_to_blkg(&iocg->pd);
645 }
646 
647 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
648 {
649 	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
650 			    struct ioc_cgrp, cpd);
651 }
652 
653 /*
654  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
655  * weight, the more expensive each IO.  Must round up.
656  */
657 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
658 {
659 	return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
660 }
661 
662 /*
663  * The inverse of abs_cost_to_cost().  Must round up.
664  */
665 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
666 {
667 	return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
668 }
669 
670 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
671 {
672 	bio->bi_iocost_cost = cost;
673 	atomic64_add(cost, &iocg->vtime);
674 }
675 
676 #define CREATE_TRACE_POINTS
677 #include <trace/events/iocost.h>
678 
679 /* latency Qos params changed, update period_us and all the dependent params */
680 static void ioc_refresh_period_us(struct ioc *ioc)
681 {
682 	u32 ppm, lat, multi, period_us;
683 
684 	lockdep_assert_held(&ioc->lock);
685 
686 	/* pick the higher latency target */
687 	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
688 		ppm = ioc->params.qos[QOS_RPPM];
689 		lat = ioc->params.qos[QOS_RLAT];
690 	} else {
691 		ppm = ioc->params.qos[QOS_WPPM];
692 		lat = ioc->params.qos[QOS_WLAT];
693 	}
694 
695 	/*
696 	 * We want the period to be long enough to contain a healthy number
697 	 * of IOs while short enough for granular control.  Define it as a
698 	 * multiple of the latency target.  Ideally, the multiplier should
699 	 * be scaled according to the percentile so that it would nominally
700 	 * contain a certain number of requests.  Let's be simpler and
701 	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
702 	 */
703 	if (ppm)
704 		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
705 	else
706 		multi = 2;
707 	period_us = multi * lat;
708 	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
709 
710 	/* calculate dependent params */
711 	ioc->period_us = period_us;
712 	ioc->margin_us = period_us * MARGIN_PCT / 100;
713 	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
714 			period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
715 }
716 
717 static int ioc_autop_idx(struct ioc *ioc)
718 {
719 	int idx = ioc->autop_idx;
720 	const struct ioc_params *p = &autop[idx];
721 	u32 vrate_pct;
722 	u64 now_ns;
723 
724 	/* rotational? */
725 	if (!blk_queue_nonrot(ioc->rqos.q))
726 		return AUTOP_HDD;
727 
728 	/* handle SATA SSDs w/ broken NCQ */
729 	if (blk_queue_depth(ioc->rqos.q) == 1)
730 		return AUTOP_SSD_QD1;
731 
732 	/* use one of the normal ssd sets */
733 	if (idx < AUTOP_SSD_DFL)
734 		return AUTOP_SSD_DFL;
735 
736 	/* if user is overriding anything, maintain what was there */
737 	if (ioc->user_qos_params || ioc->user_cost_model)
738 		return idx;
739 
740 	/* step up/down based on the vrate */
741 	vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
742 			      VTIME_PER_USEC);
743 	now_ns = ktime_get_ns();
744 
745 	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
746 		if (!ioc->autop_too_fast_at)
747 			ioc->autop_too_fast_at = now_ns;
748 		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
749 			return idx + 1;
750 	} else {
751 		ioc->autop_too_fast_at = 0;
752 	}
753 
754 	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
755 		if (!ioc->autop_too_slow_at)
756 			ioc->autop_too_slow_at = now_ns;
757 		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
758 			return idx - 1;
759 	} else {
760 		ioc->autop_too_slow_at = 0;
761 	}
762 
763 	return idx;
764 }
765 
766 /*
767  * Take the followings as input
768  *
769  *  @bps	maximum sequential throughput
770  *  @seqiops	maximum sequential 4k iops
771  *  @randiops	maximum random 4k iops
772  *
773  * and calculate the linear model cost coefficients.
774  *
775  *  *@page	per-page cost		1s / (@bps / 4096)
776  *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
777  *  @randiops	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
778  */
779 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
780 			u64 *page, u64 *seqio, u64 *randio)
781 {
782 	u64 v;
783 
784 	*page = *seqio = *randio = 0;
785 
786 	if (bps)
787 		*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
788 					   DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
789 
790 	if (seqiops) {
791 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
792 		if (v > *page)
793 			*seqio = v - *page;
794 	}
795 
796 	if (randiops) {
797 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
798 		if (v > *page)
799 			*randio = v - *page;
800 	}
801 }
802 
803 static void ioc_refresh_lcoefs(struct ioc *ioc)
804 {
805 	u64 *u = ioc->params.i_lcoefs;
806 	u64 *c = ioc->params.lcoefs;
807 
808 	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
809 		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
810 	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
811 		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
812 }
813 
814 static bool ioc_refresh_params(struct ioc *ioc, bool force)
815 {
816 	const struct ioc_params *p;
817 	int idx;
818 
819 	lockdep_assert_held(&ioc->lock);
820 
821 	idx = ioc_autop_idx(ioc);
822 	p = &autop[idx];
823 
824 	if (idx == ioc->autop_idx && !force)
825 		return false;
826 
827 	if (idx != ioc->autop_idx)
828 		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
829 
830 	ioc->autop_idx = idx;
831 	ioc->autop_too_fast_at = 0;
832 	ioc->autop_too_slow_at = 0;
833 
834 	if (!ioc->user_qos_params)
835 		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
836 	if (!ioc->user_cost_model)
837 		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
838 
839 	ioc_refresh_period_us(ioc);
840 	ioc_refresh_lcoefs(ioc);
841 
842 	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
843 					    VTIME_PER_USEC, MILLION);
844 	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
845 				   VTIME_PER_USEC, MILLION);
846 
847 	return true;
848 }
849 
/*
 * Take a snapshot of the current [v]time and vrate of @ioc into @now.
 *
 * All four fields of @now are filled in: the wallclock in nsecs and
 * usecs, the vrate sampled from ioc->vtime_rate and the device vtime
 * derived from them.  No lock is taken; consistency of the period start
 * values is ensured by retrying on ioc->period_seqcount.
 */
static void ioc_now(struct ioc *ioc, struct ioc_now *now)
{
	unsigned seq;

	now->now_ns = ktime_get();
	now->now = ktime_to_us(now->now_ns);
	now->vrate = atomic64_read(&ioc->vtime_rate);

	/*
	 * The current vtime is
	 *
	 *   vtime at period start + (wallclock time since the start) * vrate
	 *
	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
	 * needed, they're seqcount protected.
	 */
	do {
		seq = read_seqcount_begin(&ioc->period_seqcount);
		/* the u32 usec delta is widened by the u64 vrate multiplication */
		now->vnow = ioc->period_at_vtime +
			(now->now - ioc->period_at) * now->vrate;
	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
}
873 
/*
 * Start a new period at @now: record @now's wallclock and vtime as the
 * period start and arm the period timer to fire ioc->period_us later.
 *
 * Warns once if @ioc isn't in the IOC_RUNNING state.
 */
static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
{
	WARN_ON_ONCE(ioc->running != IOC_RUNNING);

	/* update both start values in one write section, see ioc_now() */
	write_seqcount_begin(&ioc->period_seqcount);
	ioc->period_at = now->now;
	ioc->period_at_vtime = now->vnow;
	write_seqcount_end(&ioc->period_seqcount);

	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
	add_timer(&ioc->timer);
}
886 
/*
 * Update @iocg's `active` and `inuse` to @active and @inuse, update level
 * weight sums and propagate upwards accordingly.
 *
 * @inuse is capped at @active.  The walk up the ancestors stops early at
 * the first parent whose `active` and `inuse` would remain unchanged.
 * ioc->weights_updated is set so that commit_active_weights() publishes
 * the changes.
 *
 * Must be called with ioc->lock held.
 */
static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
{
	struct ioc *ioc = iocg->ioc;
	int lvl;

	lockdep_assert_held(&ioc->lock);

	inuse = min(active, inuse);

	/* walk from @iocg's own level towards the root */
	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
		struct ioc_gq *parent = iocg->ancestors[lvl];
		struct ioc_gq *child = iocg->ancestors[lvl + 1];
		u32 parent_active = 0, parent_inuse = 0;

		/* update the level sums */
		/* the u32 deltas are cast to s32 so that decreases subtract */
		parent->child_active_sum += (s32)(active - child->active);
		parent->child_inuse_sum += (s32)(inuse - child->inuse);
		/* apply the updates */
		child->active = active;
		child->inuse = inuse;

		/*
		 * The delta between inuse and active sums indicates that
		 * much of weight is being given away.  Parent's inuse
		 * and active should reflect the ratio.
		 */
		if (parent->child_active_sum) {
			parent_active = parent->weight;
			parent_inuse = DIV64_U64_ROUND_UP(
				parent_active * parent->child_inuse_sum,
				parent->child_active_sum);
		}

		/* do we need to keep walking up? */
		if (parent_active == parent->active &&
		    parent_inuse == parent->inuse)
			break;

		/* the parent becomes the child of the next iteration */
		active = parent_active;
		inuse = parent_inuse;
	}

	ioc->weights_updated = true;
}
935 
936 static void commit_active_weights(struct ioc *ioc)
937 {
938 	lockdep_assert_held(&ioc->lock);
939 
940 	if (ioc->weights_updated) {
941 		/* paired with rmb in current_hweight(), see there */
942 		smp_wmb();
943 		atomic_inc(&ioc->hweight_gen);
944 		ioc->weights_updated = false;
945 	}
946 }
947 
948 static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
949 {
950 	__propagate_active_weight(iocg, active, inuse);
951 	commit_active_weights(iocg->ioc);
952 }
953 
954 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
955 {
956 	struct ioc *ioc = iocg->ioc;
957 	int lvl;
958 	u32 hwa, hwi;
959 	int ioc_gen;
960 
961 	/* hot path - if uptodate, use cached */
962 	ioc_gen = atomic_read(&ioc->hweight_gen);
963 	if (ioc_gen == iocg->hweight_gen)
964 		goto out;
965 
966 	/*
967 	 * Paired with wmb in commit_active_weights().  If we saw the
968 	 * updated hweight_gen, all the weight updates from
969 	 * __propagate_active_weight() are visible too.
970 	 *
971 	 * We can race with weight updates during calculation and get it
972 	 * wrong.  However, hweight_gen would have changed and a future
973 	 * reader will recalculate and we're guaranteed to discard the
974 	 * wrong result soon.
975 	 */
976 	smp_rmb();
977 
978 	hwa = hwi = HWEIGHT_WHOLE;
979 	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
980 		struct ioc_gq *parent = iocg->ancestors[lvl];
981 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
982 		u32 active_sum = READ_ONCE(parent->child_active_sum);
983 		u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
984 		u32 active = READ_ONCE(child->active);
985 		u32 inuse = READ_ONCE(child->inuse);
986 
987 		/* we can race with deactivations and either may read as zero */
988 		if (!active_sum || !inuse_sum)
989 			continue;
990 
991 		active_sum = max(active, active_sum);
992 		hwa = hwa * active / active_sum;	/* max 16bits * 10000 */
993 
994 		inuse_sum = max(inuse, inuse_sum);
995 		hwi = hwi * inuse / inuse_sum;		/* max 16bits * 10000 */
996 	}
997 
998 	iocg->hweight_active = max_t(u32, hwa, 1);
999 	iocg->hweight_inuse = max_t(u32, hwi, 1);
1000 	iocg->hweight_gen = ioc_gen;
1001 out:
1002 	if (hw_activep)
1003 		*hw_activep = iocg->hweight_active;
1004 	if (hw_inusep)
1005 		*hw_inusep = iocg->hweight_inuse;
1006 }
1007 
1008 static void weight_updated(struct ioc_gq *iocg)
1009 {
1010 	struct ioc *ioc = iocg->ioc;
1011 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1012 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1013 	u32 weight;
1014 
1015 	lockdep_assert_held(&ioc->lock);
1016 
1017 	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1018 	if (weight != iocg->weight && iocg->active)
1019 		propagate_active_weight(iocg, weight,
1020 			DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1021 	iocg->weight = weight;
1022 }
1023 
1024 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1025 {
1026 	struct ioc *ioc = iocg->ioc;
1027 	u64 last_period, cur_period, max_period_delta;
1028 	u64 vtime, vmargin, vmin;
1029 	int i;
1030 
1031 	/*
1032 	 * If seem to be already active, just update the stamp to tell the
1033 	 * timer that we're still active.  We don't mind occassional races.
1034 	 */
1035 	if (!list_empty(&iocg->active_list)) {
1036 		ioc_now(ioc, now);
1037 		cur_period = atomic64_read(&ioc->cur_period);
1038 		if (atomic64_read(&iocg->active_period) != cur_period)
1039 			atomic64_set(&iocg->active_period, cur_period);
1040 		return true;
1041 	}
1042 
1043 	/* racy check on internal node IOs, treat as root level IOs */
1044 	if (iocg->child_active_sum)
1045 		return false;
1046 
1047 	spin_lock_irq(&ioc->lock);
1048 
1049 	ioc_now(ioc, now);
1050 
1051 	/* update period */
1052 	cur_period = atomic64_read(&ioc->cur_period);
1053 	last_period = atomic64_read(&iocg->active_period);
1054 	atomic64_set(&iocg->active_period, cur_period);
1055 
1056 	/* already activated or breaking leaf-only constraint? */
1057 	if (!list_empty(&iocg->active_list))
1058 		goto succeed_unlock;
1059 	for (i = iocg->level - 1; i > 0; i--)
1060 		if (!list_empty(&iocg->ancestors[i]->active_list))
1061 			goto fail_unlock;
1062 
1063 	if (iocg->child_active_sum)
1064 		goto fail_unlock;
1065 
1066 	/*
1067 	 * vtime may wrap when vrate is raised substantially due to
1068 	 * underestimated IO costs.  Look at the period and ignore its
1069 	 * vtime if the iocg has been idle for too long.  Also, cap the
1070 	 * budget it can start with to the margin.
1071 	 */
1072 	max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1073 	vtime = atomic64_read(&iocg->vtime);
1074 	vmargin = ioc->margin_us * now->vrate;
1075 	vmin = now->vnow - vmargin;
1076 
1077 	if (last_period + max_period_delta < cur_period ||
1078 	    time_before64(vtime, vmin)) {
1079 		atomic64_add(vmin - vtime, &iocg->vtime);
1080 		atomic64_add(vmin - vtime, &iocg->done_vtime);
1081 		vtime = vmin;
1082 	}
1083 
1084 	/*
1085 	 * Activate, propagate weight and start period timer if not
1086 	 * running.  Reset hweight_gen to avoid accidental match from
1087 	 * wrapping.
1088 	 */
1089 	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1090 	list_add(&iocg->active_list, &ioc->active_iocgs);
1091 	propagate_active_weight(iocg, iocg->weight,
1092 				iocg->last_inuse ?: iocg->weight);
1093 
1094 	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1095 			last_period, cur_period, vtime);
1096 
1097 	iocg->last_vtime = vtime;
1098 
1099 	if (ioc->running == IOC_IDLE) {
1100 		ioc->running = IOC_RUNNING;
1101 		ioc_start_period(ioc, now);
1102 	}
1103 
1104 succeed_unlock:
1105 	spin_unlock_irq(&ioc->lock);
1106 	return true;
1107 
1108 fail_unlock:
1109 	spin_unlock_irq(&ioc->lock);
1110 	return false;
1111 }
1112 
1113 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1114 			int flags, void *key)
1115 {
1116 	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1117 	struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1118 	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1119 
1120 	ctx->vbudget -= cost;
1121 
1122 	if (ctx->vbudget < 0)
1123 		return -1;
1124 
1125 	iocg_commit_bio(ctx->iocg, wait->bio, cost);
1126 
1127 	/*
1128 	 * autoremove_wake_function() removes the wait entry only when it
1129 	 * actually changed the task state.  We want the wait always
1130 	 * removed.  Remove explicitly and use default_wake_function().
1131 	 */
1132 	list_del_init(&wq_entry->entry);
1133 	wait->committed = true;
1134 
1135 	default_wake_function(wq_entry, mode, flags, key);
1136 	return 0;
1137 }
1138 
1139 static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1140 {
1141 	struct ioc *ioc = iocg->ioc;
1142 	struct iocg_wake_ctx ctx = { .iocg = iocg };
1143 	u64 margin_ns = (u64)(ioc->period_us *
1144 			      WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1145 	u64 vdebt, vshortage, expires, oexpires;
1146 	s64 vbudget;
1147 	u32 hw_inuse;
1148 
1149 	lockdep_assert_held(&iocg->waitq.lock);
1150 
1151 	current_hweight(iocg, NULL, &hw_inuse);
1152 	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1153 
1154 	/* pay off debt */
1155 	vdebt = abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1156 	if (vdebt && vbudget > 0) {
1157 		u64 delta = min_t(u64, vbudget, vdebt);
1158 		u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1159 				    iocg->abs_vdebt);
1160 
1161 		atomic64_add(delta, &iocg->vtime);
1162 		atomic64_add(delta, &iocg->done_vtime);
1163 		iocg->abs_vdebt -= abs_delta;
1164 	}
1165 
1166 	/*
1167 	 * Wake up the ones which are due and see how much vtime we'll need
1168 	 * for the next one.
1169 	 */
1170 	ctx.hw_inuse = hw_inuse;
1171 	ctx.vbudget = vbudget - vdebt;
1172 	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1173 	if (!waitqueue_active(&iocg->waitq))
1174 		return;
1175 	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1176 		return;
1177 
1178 	/* determine next wakeup, add a quarter margin to guarantee chunking */
1179 	vshortage = -ctx.vbudget;
1180 	expires = now->now_ns +
1181 		DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1182 	expires += margin_ns / 4;
1183 
1184 	/* if already active and close enough, don't bother */
1185 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1186 	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1187 	    abs(oexpires - expires) <= margin_ns / 4)
1188 		return;
1189 
1190 	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1191 			       margin_ns / 4, HRTIMER_MODE_ABS);
1192 }
1193 
1194 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1195 {
1196 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1197 	struct ioc_now now;
1198 	unsigned long flags;
1199 
1200 	ioc_now(iocg->ioc, &now);
1201 
1202 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1203 	iocg_kick_waitq(iocg, &now);
1204 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1205 
1206 	return HRTIMER_NORESTART;
1207 }
1208 
1209 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now)
1210 {
1211 	struct ioc *ioc = iocg->ioc;
1212 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1213 	u64 vtime = atomic64_read(&iocg->vtime);
1214 	u64 vmargin = ioc->margin_us * now->vrate;
1215 	u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1216 	u64 delta_ns, expires, oexpires;
1217 	u32 hw_inuse;
1218 
1219 	lockdep_assert_held(&iocg->waitq.lock);
1220 
1221 	/* debt-adjust vtime */
1222 	current_hweight(iocg, NULL, &hw_inuse);
1223 	vtime += abs_cost_to_cost(iocg->abs_vdebt, hw_inuse);
1224 
1225 	/*
1226 	 * Clear or maintain depending on the overage. Non-zero vdebt is what
1227 	 * guarantees that @iocg is online and future iocg_kick_delay() will
1228 	 * clear use_delay. Don't leave it on when there's no vdebt.
1229 	 */
1230 	if (!iocg->abs_vdebt || time_before_eq64(vtime, now->vnow)) {
1231 		blkcg_clear_delay(blkg);
1232 		return false;
1233 	}
1234 	if (!atomic_read(&blkg->use_delay) &&
1235 	    time_before_eq64(vtime, now->vnow + vmargin))
1236 		return false;
1237 
1238 	/* use delay */
1239 	delta_ns = DIV64_U64_ROUND_UP(vtime - now->vnow,
1240 				      now->vrate) * NSEC_PER_USEC;
1241 	blkcg_set_delay(blkg, delta_ns);
1242 	expires = now->now_ns + delta_ns;
1243 
1244 	/* if already active and close enough, don't bother */
1245 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1246 	if (hrtimer_is_queued(&iocg->delay_timer) &&
1247 	    abs(oexpires - expires) <= margin_ns / 4)
1248 		return true;
1249 
1250 	hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1251 			       margin_ns / 4, HRTIMER_MODE_ABS);
1252 	return true;
1253 }
1254 
1255 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1256 {
1257 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1258 	struct ioc_now now;
1259 	unsigned long flags;
1260 
1261 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1262 	ioc_now(iocg->ioc, &now);
1263 	iocg_kick_delay(iocg, &now);
1264 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1265 
1266 	return HRTIMER_NORESTART;
1267 }
1268 
1269 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1270 {
1271 	u32 nr_met[2] = { };
1272 	u32 nr_missed[2] = { };
1273 	u64 rq_wait_ns = 0;
1274 	int cpu, rw;
1275 
1276 	for_each_online_cpu(cpu) {
1277 		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1278 		u64 this_rq_wait_ns;
1279 
1280 		for (rw = READ; rw <= WRITE; rw++) {
1281 			u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1282 			u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1283 
1284 			nr_met[rw] += this_met - stat->missed[rw].last_met;
1285 			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1286 			stat->missed[rw].last_met = this_met;
1287 			stat->missed[rw].last_missed = this_missed;
1288 		}
1289 
1290 		this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1291 		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1292 		stat->last_rq_wait_ns = this_rq_wait_ns;
1293 	}
1294 
1295 	for (rw = READ; rw <= WRITE; rw++) {
1296 		if (nr_met[rw] + nr_missed[rw])
1297 			missed_ppm_ar[rw] =
1298 				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1299 						   nr_met[rw] + nr_missed[rw]);
1300 		else
1301 			missed_ppm_ar[rw] = 0;
1302 	}
1303 
1304 	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1305 				   ioc->period_us * NSEC_PER_USEC);
1306 }
1307 
1308 /* was iocg idle this period? */
1309 static bool iocg_is_idle(struct ioc_gq *iocg)
1310 {
1311 	struct ioc *ioc = iocg->ioc;
1312 
1313 	/* did something get issued this period? */
1314 	if (atomic64_read(&iocg->active_period) ==
1315 	    atomic64_read(&ioc->cur_period))
1316 		return false;
1317 
1318 	/* is something in flight? */
1319 	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1320 		return false;
1321 
1322 	return true;
1323 }
1324 
1325 /* returns usage with margin added if surplus is large enough */
1326 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1327 {
1328 	/* add margin */
1329 	usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1330 	usage += SURPLUS_SCALE_ABS;
1331 
1332 	/* don't bother if the surplus is too small */
1333 	if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1334 		return 0;
1335 
1336 	return usage;
1337 }
1338 
/*
 * Timer callback for ioc->timer.  With ioc->lock held for everything but
 * the initial latency stat collection, it does the following in order:
 *
 * - kicks waiters / debtors of the active iocgs and deactivates the ones
 *   which are idle
 * - records each active iocg's usage, throws away surplus vtime and takes
 *   back or gives away inuse weight
 * - updates ioc->busy_level from the latency stats and, depending on it,
 *   adjusts ioc->vtime_rate
 * - advances ioc->cur_period and, unless the ioc is stopping, either
 *   starts the next period or goes idle when no iocg is active
 */
1339 static void ioc_timer_fn(struct timer_list *timer)
1340 {
1341 	struct ioc *ioc = container_of(timer, struct ioc, timer);
1342 	struct ioc_gq *iocg, *tiocg;
1343 	struct ioc_now now;
1344 	int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1345 	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1346 	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1347 	u32 missed_ppm[2], rq_wait_pct;
1348 	u64 period_vtime;
1349 	int prev_busy_level, i;
1350 
1351 	/* how were the latencies during the period? */
1352 	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1353 
1354 	/* take care of active iocgs */
1355 	spin_lock_irq(&ioc->lock);
1356 
1357 	ioc_now(ioc, &now);
1358 
	/* vtime elapsed since ioc_start_period() recorded period_at_vtime */
1359 	period_vtime = now.vnow - ioc->period_at_vtime;
1360 	if (WARN_ON_ONCE(!period_vtime)) {
1361 		spin_unlock_irq(&ioc->lock);
1362 		return;
1363 	}
1364 
1365 	/*
1366 	 * Waiters determine the sleep durations based on the vrate they
1367 	 * saw at the time of sleep.  If vrate has increased, some waiters
1368 	 * could be sleeping for too long.  Wake up tardy waiters which
1369 	 * should have woken up in the last period and expire idle iocgs.
1370 	 */
1371 	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
		/*
		 * Unlocked pre-check: skip iocgs which have nothing to
		 * kick and aren't idle.  The conditions are tested again
		 * below with waitq.lock held.
		 */
1372 		if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1373 		    !iocg_is_idle(iocg))
1374 			continue;
1375 
1376 		spin_lock(&iocg->waitq.lock);
1377 
1378 		if (waitqueue_active(&iocg->waitq) || iocg->abs_vdebt) {
1379 			/* might be oversleeping vtime / hweight changes, kick */
1380 			iocg_kick_waitq(iocg, &now);
1381 			iocg_kick_delay(iocg, &now);
1382 		} else if (iocg_is_idle(iocg)) {
1383 			/* no waiter and idle, deactivate */
			/* last_inuse is what iocg_activate() starts from next time */
1384 			iocg->last_inuse = iocg->inuse;
1385 			__propagate_active_weight(iocg, 0, 0);
1386 			list_del_init(&iocg->active_list);
1387 		}
1388 
1389 		spin_unlock(&iocg->waitq.lock);
1390 	}
	/* publish the deactivations done with __propagate_active_weight() */
1391 	commit_active_weights(ioc);
1392 
1393 	/* calc usages and see whether some weights need to be moved around */
1394 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1395 		u64 vdone, vtime, vusage, vmargin, vmin;
1396 		u32 hw_active, hw_inuse, usage;
1397 
1398 		/*
1399 		 * Collect unused and wind vtime closer to vnow to prevent
1400 		 * iocgs from accumulating a large amount of budget.
1401 		 */
1402 		vdone = atomic64_read(&iocg->done_vtime);
1403 		vtime = atomic64_read(&iocg->vtime);
1404 		current_hweight(iocg, &hw_active, &hw_inuse);
1405 
1406 		/*
1407 		 * Latency QoS detection doesn't account for IOs which are
1408 		 * in-flight for longer than a period.  Detect them by
1409 		 * comparing vdone against period start.  If lagging behind
1410 		 * IOs from past periods, don't increase vrate.
1411 		 */
1412 		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1413 		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1414 		    time_after64(vtime, vdone) &&
1415 		    time_after64(vtime, now.vnow -
1416 				 MAX_LAGGING_PERIODS * period_vtime) &&
1417 		    time_before64(vdone, now.vnow - period_vtime))
1418 			nr_lagging++;
1419 
		/*
		 * With waiters, everything up to vnow counts as used.
		 * Otherwise, it's how far vtime advanced past last_vtime.
		 */
1420 		if (waitqueue_active(&iocg->waitq))
1421 			vusage = now.vnow - iocg->last_vtime;
1422 		else if (time_before64(iocg->last_vtime, vtime))
1423 			vusage = vtime - iocg->last_vtime;
1424 		else
1425 			vusage = 0;
1426 
1427 		iocg->last_vtime += vusage;
1428 		/*
1429 		 * Factor in in-flight vtime into vusage to avoid
1430 		 * high-latency completions appearing as idle.  This should
1431 		 * be done after the above ->last_vtime adjustment.
1432 		 */
1433 		vusage = max(vusage, vtime - vdone);
1434 
1435 		/* calculate hweight based usage ratio and record */
1436 		if (vusage) {
1437 			usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1438 						   period_vtime);
1439 			iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1440 			iocg->usages[iocg->usage_idx] = usage;
1441 		} else {
1442 			usage = 0;
1443 		}
1444 
1445 		/* see whether there's surplus vtime */
1446 		vmargin = ioc->margin_us * now.vrate;
1447 		vmin = now.vnow - vmargin;
1448 
1449 		iocg->has_surplus = false;
1450 
1451 		if (!waitqueue_active(&iocg->waitq) &&
1452 		    time_before64(vtime, vmin)) {
1453 			u64 delta = vmin - vtime;
1454 
1455 			/* throw away surplus vtime */
1456 			atomic64_add(delta, &iocg->vtime);
1457 			atomic64_add(delta, &iocg->done_vtime);
1458 			iocg->last_vtime += delta;
1459 			/* if usage is sufficiently low, maybe it can donate */
1460 			if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1461 				iocg->has_surplus = true;
1462 				nr_surpluses++;
1463 			}
1464 		} else if (hw_inuse < hw_active) {
1465 			u32 new_hwi, new_inuse;
1466 
1467 			/* was donating but might need to take back some */
1468 			if (waitqueue_active(&iocg->waitq)) {
1469 				new_hwi = hw_active;
1470 			} else {
1471 				new_hwi = max(hw_inuse,
1472 					      usage * SURPLUS_SCALE_PCT / 100 +
1473 					      SURPLUS_SCALE_ABS);
1474 			}
1475 
			/* scale inuse by new_hwi / hw_inuse, within [1, active] */
1476 			new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1477 					      hw_inuse);
1478 			new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1479 
1480 			if (new_inuse > iocg->inuse) {
1481 				TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1482 						iocg->inuse, new_inuse,
1483 						hw_inuse, new_hwi);
1484 				__propagate_active_weight(iocg, iocg->weight,
1485 							  new_inuse);
1486 			}
1487 		} else {
1488 			/* genuinely out of vtime */
1489 			nr_shortages++;
1490 		}
1491 	}
1492 
1493 	if (!nr_shortages || !nr_surpluses)
1494 		goto skip_surplus_transfers;
1495 
1496 	/* there are both shortages and surpluses, transfer surpluses */
1497 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1498 		u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1499 		int nr_valid = 0;
1500 
1501 		if (!iocg->has_surplus)
1502 			continue;
1503 
1504 		/* base the decision on max historical usage */
1505 		for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1506 			if (iocg->usages[i]) {
1507 				usage = max(usage, iocg->usages[i]);
1508 				nr_valid++;
1509 			}
1510 		}
		/* not enough non-zero usage samples to base a decision on */
1511 		if (nr_valid < MIN_VALID_USAGES)
1512 			continue;
1513 
1514 		current_hweight(iocg, &hw_active, &hw_inuse);
1515 		new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1516 		if (!new_hwi)
1517 			continue;
1518 
1519 		new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1520 					       hw_inuse);
1521 		if (new_inuse < iocg->inuse) {
1522 			TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1523 					iocg->inuse, new_inuse,
1524 					hw_inuse, new_hwi);
1525 			__propagate_active_weight(iocg, iocg->weight, new_inuse);
1526 		}
1527 	}
1528 skip_surplus_transfers:
	/* publish the takebacks and giveaways from the loops above */
1529 	commit_active_weights(ioc);
1530 
1531 	/*
1532 	 * If q is getting clogged or we're missing too much, we're issuing
1533 	 * too much IO and should lower vtime rate.  If we're not missing
1534 	 * and experiencing shortages but not surpluses, we're too stingy
1535 	 * and should increase vtime rate.
1536 	 */
1537 	prev_busy_level = ioc->busy_level;
1538 	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1539 	    missed_ppm[READ] > ppm_rthr ||
1540 	    missed_ppm[WRITE] > ppm_wthr) {
1541 		/* clearly missing QoS targets, slow down vrate */
1542 		ioc->busy_level = max(ioc->busy_level, 0);
1543 		ioc->busy_level++;
1544 	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1545 		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1546 		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1547 		/* QoS targets are being met with >25% margin */
1548 		if (nr_shortages) {
1549 			/*
1550 			 * We're throttling while the device has spare
1551 			 * capacity.  If vrate was being slowed down, stop.
1552 			 */
1553 			ioc->busy_level = min(ioc->busy_level, 0);
1554 
1555 			/*
1556 			 * If there are IOs spanning multiple periods, wait
1557 			 * them out before pushing the device harder.  If
1558 			 * there are surpluses, let redistribution work it
1559 			 * out first.
1560 			 */
1561 			if (!nr_lagging && !nr_surpluses)
1562 				ioc->busy_level--;
1563 		} else {
1564 			/*
1565 			 * Nobody is being throttled and the users aren't
1566 			 * issuing enough IOs to saturate the device.  We
1567 			 * simply don't know how close the device is to
1568 			 * saturation.  Coast.
1569 			 */
1570 			ioc->busy_level = 0;
1571 		}
1572 	} else {
1573 		/* inside the hysteresis margin, we're good */
1574 		ioc->busy_level = 0;
1575 	}
1576 
1577 	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1578 
	/*
	 * Positive busy_level lowers vrate, negative raises it.  Raising
	 * is skipped while there are lagging iocgs.
	 */
1579 	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1580 		u64 vrate = atomic64_read(&ioc->vtime_rate);
1581 		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1582 
1583 		/* rq_wait signal is always reliable, ignore user vrate_min */
1584 		if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1585 			vrate_min = VRATE_MIN;
1586 
1587 		/*
1588 		 * If vrate is out of bounds, apply clamp gradually as the
1589 		 * bounds can change abruptly.  Otherwise, apply busy_level
1590 		 * based adjustment.
1591 		 */
1592 		if (vrate < vrate_min) {
1593 			vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1594 					  100);
1595 			vrate = min(vrate, vrate_min);
1596 		} else if (vrate > vrate_max) {
1597 			vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1598 					  100);
1599 			vrate = max(vrate, vrate_max);
1600 		} else {
			/* look up the adjustment by |busy_level|, capped to the table */
1601 			int idx = min_t(int, abs(ioc->busy_level),
1602 					ARRAY_SIZE(vrate_adj_pct) - 1);
1603 			u32 adj_pct = vrate_adj_pct[idx];
1604 
1605 			if (ioc->busy_level > 0)
1606 				adj_pct = 100 - adj_pct;
1607 			else
1608 				adj_pct = 100 + adj_pct;
1609 
1610 			vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1611 				      vrate_min, vrate_max);
1612 		}
1613 
1614 		trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1615 					   nr_lagging, nr_shortages,
1616 					   nr_surpluses);
1617 
1618 		atomic64_set(&ioc->vtime_rate, vrate);
1619 		ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1620 			ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1621 	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
		/* vrate is left alone, only emit the trace event */
1622 		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1623 					   missed_ppm, rq_wait_pct, nr_lagging,
1624 					   nr_shortages, nr_surpluses);
1625 	}
1626 
1627 	ioc_refresh_params(ioc, false);
1628 
1629 	/*
1630 	 * This period is done.  Move onto the next one.  If nothing's
1631 	 * going on with the device, stop the timer.
1632 	 */
1633 	atomic64_inc(&ioc->cur_period);
1634 
1635 	if (ioc->running != IOC_STOP) {
1636 		if (!list_empty(&ioc->active_iocgs)) {
1637 			ioc_start_period(ioc, &now);
1638 		} else {
1639 			ioc->busy_level = 0;
1640 			ioc->running = IOC_IDLE;
1641 		}
1642 	}
1643 
1644 	spin_unlock_irq(&ioc->lock);
1645 }
1646 
1647 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1648 				    bool is_merge, u64 *costp)
1649 {
1650 	struct ioc *ioc = iocg->ioc;
1651 	u64 coef_seqio, coef_randio, coef_page;
1652 	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1653 	u64 seek_pages = 0;
1654 	u64 cost = 0;
1655 
1656 	switch (bio_op(bio)) {
1657 	case REQ_OP_READ:
1658 		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
1659 		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
1660 		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
1661 		break;
1662 	case REQ_OP_WRITE:
1663 		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
1664 		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
1665 		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
1666 		break;
1667 	default:
1668 		goto out;
1669 	}
1670 
1671 	if (iocg->cursor) {
1672 		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1673 		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1674 	}
1675 
1676 	if (!is_merge) {
1677 		if (seek_pages > LCOEF_RANDIO_PAGES) {
1678 			cost += coef_randio;
1679 		} else {
1680 			cost += coef_seqio;
1681 		}
1682 	}
1683 	cost += pages * coef_page;
1684 out:
1685 	*costp = cost;
1686 }
1687 
1688 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1689 {
1690 	u64 cost;
1691 
1692 	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1693 	return cost;
1694 }
1695 
1696 static void calc_size_vtime_cost_builtin(struct request *rq, struct ioc *ioc,
1697 					 u64 *costp)
1698 {
1699 	unsigned int pages = blk_rq_stats_sectors(rq) >> IOC_SECT_TO_PAGE_SHIFT;
1700 
1701 	switch (req_op(rq)) {
1702 	case REQ_OP_READ:
1703 		*costp = pages * ioc->params.lcoefs[LCOEF_RPAGE];
1704 		break;
1705 	case REQ_OP_WRITE:
1706 		*costp = pages * ioc->params.lcoefs[LCOEF_WPAGE];
1707 		break;
1708 	default:
1709 		*costp = 0;
1710 	}
1711 }
1712 
1713 static u64 calc_size_vtime_cost(struct request *rq, struct ioc *ioc)
1714 {
1715 	u64 cost;
1716 
1717 	calc_size_vtime_cost_builtin(rq, ioc, &cost);
1718 	return cost;
1719 }
1720 
1721 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1722 {
1723 	struct blkcg_gq *blkg = bio->bi_blkg;
1724 	struct ioc *ioc = rqos_to_ioc(rqos);
1725 	struct ioc_gq *iocg = blkg_to_iocg(blkg);
1726 	struct ioc_now now;
1727 	struct iocg_wait wait;
1728 	u32 hw_active, hw_inuse;
1729 	u64 abs_cost, cost, vtime;
1730 
1731 	/* bypass IOs if disabled or for root cgroup */
1732 	if (!ioc->enabled || !iocg->level)
1733 		return;
1734 
1735 	/* always activate so that even 0 cost IOs get protected to some level */
1736 	if (!iocg_activate(iocg, &now))
1737 		return;
1738 
1739 	/* calculate the absolute vtime cost */
1740 	abs_cost = calc_vtime_cost(bio, iocg, false);
1741 	if (!abs_cost)
1742 		return;
1743 
1744 	iocg->cursor = bio_end_sector(bio);
1745 
1746 	vtime = atomic64_read(&iocg->vtime);
1747 	current_hweight(iocg, &hw_active, &hw_inuse);
1748 
1749 	if (hw_inuse < hw_active &&
1750 	    time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1751 		TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1752 				iocg->inuse, iocg->weight, hw_inuse, hw_active);
1753 		spin_lock_irq(&ioc->lock);
1754 		propagate_active_weight(iocg, iocg->weight, iocg->weight);
1755 		spin_unlock_irq(&ioc->lock);
1756 		current_hweight(iocg, &hw_active, &hw_inuse);
1757 	}
1758 
1759 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1760 
1761 	/*
1762 	 * If no one's waiting and within budget, issue right away.  The
1763 	 * tests are racy but the races aren't systemic - we only miss once
1764 	 * in a while which is fine.
1765 	 */
1766 	if (!waitqueue_active(&iocg->waitq) && !iocg->abs_vdebt &&
1767 	    time_before_eq64(vtime + cost, now.vnow)) {
1768 		iocg_commit_bio(iocg, bio, cost);
1769 		return;
1770 	}
1771 
1772 	/*
1773 	 * We activated above but w/o any synchronization. Deactivation is
1774 	 * synchronized with waitq.lock and we won't get deactivated as long
1775 	 * as we're waiting or has debt, so we're good if we're activated
1776 	 * here. In the unlikely case that we aren't, just issue the IO.
1777 	 */
1778 	spin_lock_irq(&iocg->waitq.lock);
1779 
1780 	if (unlikely(list_empty(&iocg->active_list))) {
1781 		spin_unlock_irq(&iocg->waitq.lock);
1782 		iocg_commit_bio(iocg, bio, cost);
1783 		return;
1784 	}
1785 
1786 	/*
1787 	 * We're over budget. If @bio has to be issued regardless, remember
1788 	 * the abs_cost instead of advancing vtime. iocg_kick_waitq() will pay
1789 	 * off the debt before waking more IOs.
1790 	 *
1791 	 * This way, the debt is continuously paid off each period with the
1792 	 * actual budget available to the cgroup. If we just wound vtime, we
1793 	 * would incorrectly use the current hw_inuse for the entire amount
1794 	 * which, for example, can lead to the cgroup staying blocked for a
1795 	 * long time even with substantially raised hw_inuse.
1796 	 *
1797 	 * An iocg with vdebt should stay online so that the timer can keep
1798 	 * deducting its vdebt and [de]activate use_delay mechanism
1799 	 * accordingly. We don't want to race against the timer trying to
1800 	 * clear them and leave @iocg inactive w/ dangling use_delay heavily
1801 	 * penalizing the cgroup and its descendants.
1802 	 */
1803 	if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1804 		iocg->abs_vdebt += abs_cost;
1805 		if (iocg_kick_delay(iocg, &now))
1806 			blkcg_schedule_throttle(rqos->q,
1807 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1808 		spin_unlock_irq(&iocg->waitq.lock);
1809 		return;
1810 	}
1811 
1812 	/*
1813 	 * Append self to the waitq and schedule the wakeup timer if we're
1814 	 * the first waiter.  The timer duration is calculated based on the
1815 	 * current vrate.  vtime and hweight changes can make it too short
1816 	 * or too long.  Each wait entry records the absolute cost it's
1817 	 * waiting for to allow re-evaluation using a custom wait entry.
1818 	 *
1819 	 * If too short, the timer simply reschedules itself.  If too long,
1820 	 * the period timer will notice and trigger wakeups.
1821 	 *
1822 	 * All waiters are on iocg->waitq and the wait states are
1823 	 * synchronized using waitq.lock.
1824 	 */
1825 	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1826 	wait.wait.private = current;
1827 	wait.bio = bio;
1828 	wait.abs_cost = abs_cost;
1829 	wait.committed = false;	/* will be set true by waker */
1830 
1831 	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1832 	iocg_kick_waitq(iocg, &now);
1833 
1834 	spin_unlock_irq(&iocg->waitq.lock);
1835 
1836 	while (true) {
1837 		set_current_state(TASK_UNINTERRUPTIBLE);
1838 		if (wait.committed)
1839 			break;
1840 		io_schedule();
1841 	}
1842 
1843 	/* waker already committed us, proceed */
1844 	finish_wait(&iocg->waitq, &wait.wait);
1845 }
1846 
1847 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1848 			   struct bio *bio)
1849 {
1850 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1851 	struct ioc *ioc = iocg->ioc;
1852 	sector_t bio_end = bio_end_sector(bio);
1853 	struct ioc_now now;
1854 	u32 hw_inuse;
1855 	u64 abs_cost, cost;
1856 	unsigned long flags;
1857 
1858 	/* bypass if disabled or for root cgroup */
1859 	if (!ioc->enabled || !iocg->level)
1860 		return;
1861 
1862 	abs_cost = calc_vtime_cost(bio, iocg, true);
1863 	if (!abs_cost)
1864 		return;
1865 
1866 	ioc_now(ioc, &now);
1867 	current_hweight(iocg, NULL, &hw_inuse);
1868 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1869 
1870 	/* update cursor if backmerging into the request at the cursor */
1871 	if (blk_rq_pos(rq) < bio_end &&
1872 	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1873 		iocg->cursor = bio_end;
1874 
1875 	/*
1876 	 * Charge if there's enough vtime budget and the existing request has
1877 	 * cost assigned.
1878 	 */
1879 	if (rq->bio && rq->bio->bi_iocost_cost &&
1880 	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow)) {
1881 		iocg_commit_bio(iocg, bio, cost);
1882 		return;
1883 	}
1884 
1885 	/*
1886 	 * Otherwise, account it as debt if @iocg is online, which it should
1887 	 * be for the vast majority of cases. See debt handling in
1888 	 * ioc_rqos_throttle() for details.
1889 	 */
1890 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1891 	if (likely(!list_empty(&iocg->active_list))) {
1892 		iocg->abs_vdebt += abs_cost;
1893 		iocg_kick_delay(iocg, &now);
1894 	} else {
1895 		iocg_commit_bio(iocg, bio, cost);
1896 	}
1897 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1898 }
1899 
1900 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1901 {
1902 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1903 
1904 	if (iocg && bio->bi_iocost_cost)
1905 		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1906 }
1907 
1908 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1909 {
1910 	struct ioc *ioc = rqos_to_ioc(rqos);
1911 	u64 on_q_ns, rq_wait_ns, size_nsec;
1912 	int pidx, rw;
1913 
1914 	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1915 		return;
1916 
1917 	switch (req_op(rq) & REQ_OP_MASK) {
1918 	case REQ_OP_READ:
1919 		pidx = QOS_RLAT;
1920 		rw = READ;
1921 		break;
1922 	case REQ_OP_WRITE:
1923 		pidx = QOS_WLAT;
1924 		rw = WRITE;
1925 		break;
1926 	default:
1927 		return;
1928 	}
1929 
1930 	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1931 	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1932 	size_nsec = div64_u64(calc_size_vtime_cost(rq, ioc), VTIME_PER_NSEC);
1933 
1934 	if (on_q_ns <= size_nsec ||
1935 	    on_q_ns - size_nsec <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1936 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1937 	else
1938 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1939 
1940 	this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1941 }
1942 
1943 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1944 {
1945 	struct ioc *ioc = rqos_to_ioc(rqos);
1946 
1947 	spin_lock_irq(&ioc->lock);
1948 	ioc_refresh_params(ioc, false);
1949 	spin_unlock_irq(&ioc->lock);
1950 }
1951 
1952 static void ioc_rqos_exit(struct rq_qos *rqos)
1953 {
1954 	struct ioc *ioc = rqos_to_ioc(rqos);
1955 
1956 	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1957 
1958 	spin_lock_irq(&ioc->lock);
1959 	ioc->running = IOC_STOP;
1960 	spin_unlock_irq(&ioc->lock);
1961 
1962 	del_timer_sync(&ioc->timer);
1963 	free_percpu(ioc->pcpu_stat);
1964 	kfree(ioc);
1965 }
1966 
1967 static struct rq_qos_ops ioc_rqos_ops = {
1968 	.throttle = ioc_rqos_throttle,
1969 	.merge = ioc_rqos_merge,
1970 	.done_bio = ioc_rqos_done_bio,
1971 	.done = ioc_rqos_done,
1972 	.queue_depth_changed = ioc_rqos_queue_depth_changed,
1973 	.exit = ioc_rqos_exit,
1974 };
1975 
1976 static int blk_iocost_init(struct request_queue *q)
1977 {
1978 	struct ioc *ioc;
1979 	struct rq_qos *rqos;
1980 	int ret;
1981 
1982 	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1983 	if (!ioc)
1984 		return -ENOMEM;
1985 
1986 	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1987 	if (!ioc->pcpu_stat) {
1988 		kfree(ioc);
1989 		return -ENOMEM;
1990 	}
1991 
1992 	rqos = &ioc->rqos;
1993 	rqos->id = RQ_QOS_COST;
1994 	rqos->ops = &ioc_rqos_ops;
1995 	rqos->q = q;
1996 
1997 	spin_lock_init(&ioc->lock);
1998 	timer_setup(&ioc->timer, ioc_timer_fn, 0);
1999 	INIT_LIST_HEAD(&ioc->active_iocgs);
2000 
2001 	ioc->running = IOC_IDLE;
2002 	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
2003 	seqcount_spinlock_init(&ioc->period_seqcount, &ioc->lock);
2004 	ioc->period_at = ktime_to_us(ktime_get());
2005 	atomic64_set(&ioc->cur_period, 0);
2006 	atomic_set(&ioc->hweight_gen, 0);
2007 
2008 	spin_lock_irq(&ioc->lock);
2009 	ioc->autop_idx = AUTOP_INVALID;
2010 	ioc_refresh_params(ioc, true);
2011 	spin_unlock_irq(&ioc->lock);
2012 
2013 	rq_qos_add(q, rqos);
2014 	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
2015 	if (ret) {
2016 		rq_qos_del(q, rqos);
2017 		free_percpu(ioc->pcpu_stat);
2018 		kfree(ioc);
2019 		return ret;
2020 	}
2021 	return 0;
2022 }
2023 
2024 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
2025 {
2026 	struct ioc_cgrp *iocc;
2027 
2028 	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
2029 	if (!iocc)
2030 		return NULL;
2031 
2032 	iocc->dfl_weight = CGROUP_WEIGHT_DFL;
2033 	return &iocc->cpd;
2034 }
2035 
2036 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
2037 {
2038 	kfree(container_of(cpd, struct ioc_cgrp, cpd));
2039 }
2040 
2041 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
2042 					     struct blkcg *blkcg)
2043 {
2044 	int levels = blkcg->css.cgroup->level + 1;
2045 	struct ioc_gq *iocg;
2046 
2047 	iocg = kzalloc_node(struct_size(iocg, ancestors, levels), gfp, q->node);
2048 	if (!iocg)
2049 		return NULL;
2050 
2051 	return &iocg->pd;
2052 }
2053 
2054 static void ioc_pd_init(struct blkg_policy_data *pd)
2055 {
2056 	struct ioc_gq *iocg = pd_to_iocg(pd);
2057 	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
2058 	struct ioc *ioc = q_to_ioc(blkg->q);
2059 	struct ioc_now now;
2060 	struct blkcg_gq *tblkg;
2061 	unsigned long flags;
2062 
2063 	ioc_now(ioc, &now);
2064 
2065 	iocg->ioc = ioc;
2066 	atomic64_set(&iocg->vtime, now.vnow);
2067 	atomic64_set(&iocg->done_vtime, now.vnow);
2068 	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2069 	INIT_LIST_HEAD(&iocg->active_list);
2070 	iocg->hweight_active = HWEIGHT_WHOLE;
2071 	iocg->hweight_inuse = HWEIGHT_WHOLE;
2072 
2073 	init_waitqueue_head(&iocg->waitq);
2074 	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2075 	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2076 	hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2077 	iocg->delay_timer.function = iocg_delay_timer_fn;
2078 
2079 	iocg->level = blkg->blkcg->css.cgroup->level;
2080 
2081 	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2082 		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2083 		iocg->ancestors[tiocg->level] = tiocg;
2084 	}
2085 
2086 	spin_lock_irqsave(&ioc->lock, flags);
2087 	weight_updated(iocg);
2088 	spin_unlock_irqrestore(&ioc->lock, flags);
2089 }
2090 
2091 static void ioc_pd_free(struct blkg_policy_data *pd)
2092 {
2093 	struct ioc_gq *iocg = pd_to_iocg(pd);
2094 	struct ioc *ioc = iocg->ioc;
2095 	unsigned long flags;
2096 
2097 	if (ioc) {
2098 		spin_lock_irqsave(&ioc->lock, flags);
2099 		if (!list_empty(&iocg->active_list)) {
2100 			propagate_active_weight(iocg, 0, 0);
2101 			list_del_init(&iocg->active_list);
2102 		}
2103 		spin_unlock_irqrestore(&ioc->lock, flags);
2104 
2105 		hrtimer_cancel(&iocg->waitq_timer);
2106 		hrtimer_cancel(&iocg->delay_timer);
2107 	}
2108 	kfree(iocg);
2109 }
2110 
2111 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2112 			     int off)
2113 {
2114 	const char *dname = blkg_dev_name(pd->blkg);
2115 	struct ioc_gq *iocg = pd_to_iocg(pd);
2116 
2117 	if (dname && iocg->cfg_weight)
2118 		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2119 	return 0;
2120 }
2121 
2122 
2123 static int ioc_weight_show(struct seq_file *sf, void *v)
2124 {
2125 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2126 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2127 
2128 	seq_printf(sf, "default %u\n", iocc->dfl_weight);
2129 	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2130 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2131 	return 0;
2132 }
2133 
2134 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2135 				size_t nbytes, loff_t off)
2136 {
2137 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
2138 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2139 	struct blkg_conf_ctx ctx;
2140 	struct ioc_gq *iocg;
2141 	u32 v;
2142 	int ret;
2143 
2144 	if (!strchr(buf, ':')) {
2145 		struct blkcg_gq *blkg;
2146 
2147 		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2148 			return -EINVAL;
2149 
2150 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2151 			return -EINVAL;
2152 
2153 		spin_lock(&blkcg->lock);
2154 		iocc->dfl_weight = v;
2155 		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2156 			struct ioc_gq *iocg = blkg_to_iocg(blkg);
2157 
2158 			if (iocg) {
2159 				spin_lock_irq(&iocg->ioc->lock);
2160 				weight_updated(iocg);
2161 				spin_unlock_irq(&iocg->ioc->lock);
2162 			}
2163 		}
2164 		spin_unlock(&blkcg->lock);
2165 
2166 		return nbytes;
2167 	}
2168 
2169 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2170 	if (ret)
2171 		return ret;
2172 
2173 	iocg = blkg_to_iocg(ctx.blkg);
2174 
2175 	if (!strncmp(ctx.body, "default", 7)) {
2176 		v = 0;
2177 	} else {
2178 		if (!sscanf(ctx.body, "%u", &v))
2179 			goto einval;
2180 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2181 			goto einval;
2182 	}
2183 
2184 	spin_lock(&iocg->ioc->lock);
2185 	iocg->cfg_weight = v;
2186 	weight_updated(iocg);
2187 	spin_unlock(&iocg->ioc->lock);
2188 
2189 	blkg_conf_finish(&ctx);
2190 	return nbytes;
2191 
2192 einval:
2193 	blkg_conf_finish(&ctx);
2194 	return -EINVAL;
2195 }
2196 
2197 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2198 			  int off)
2199 {
2200 	const char *dname = blkg_dev_name(pd->blkg);
2201 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2202 
2203 	if (!dname)
2204 		return 0;
2205 
2206 	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2207 		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2208 		   ioc->params.qos[QOS_RPPM] / 10000,
2209 		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
2210 		   ioc->params.qos[QOS_RLAT],
2211 		   ioc->params.qos[QOS_WPPM] / 10000,
2212 		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
2213 		   ioc->params.qos[QOS_WLAT],
2214 		   ioc->params.qos[QOS_MIN] / 10000,
2215 		   ioc->params.qos[QOS_MIN] % 10000 / 100,
2216 		   ioc->params.qos[QOS_MAX] / 10000,
2217 		   ioc->params.qos[QOS_MAX] % 10000 / 100);
2218 	return 0;
2219 }
2220 
2221 static int ioc_qos_show(struct seq_file *sf, void *v)
2222 {
2223 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2224 
2225 	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2226 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2227 	return 0;
2228 }
2229 
/*
 * Control keys accepted by ioc_qos_write().  These are matched before the
 * QoS parameter keys in qos_tokens; the NULL-pattern entry terminates the
 * table.
 */
static const match_table_t qos_ctrl_tokens = {
	{ QOS_ENABLE,		"enable=%u"	},
	{ QOS_CTRL,		"ctrl=%s"	},
	{ NR_QOS_CTRL_PARAMS,	NULL		},
};
2235 
/*
 * QoS parameter keys accepted by ioc_qos_write().  Each token value is used
 * there directly as the index into the qos[] array.  The "%s" entries are
 * parsed as decimal fractions with cgroup_parse_float(); the "%u" entries
 * are parsed with match_u64().  The NULL-pattern entry terminates the table.
 */
static const match_table_t qos_tokens = {
	{ QOS_RPPM,		"rpct=%s"	},
	{ QOS_RLAT,		"rlat=%u"	},
	{ QOS_WPPM,		"wpct=%s"	},
	{ QOS_WLAT,		"wlat=%u"	},
	{ QOS_MIN,		"min=%s"	},
	{ QOS_MAX,		"max=%s"	},
	{ NR_QOS_PARAMS,	NULL		},
};
2245 
2246 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2247 			     size_t nbytes, loff_t off)
2248 {
2249 	struct gendisk *disk;
2250 	struct ioc *ioc;
2251 	u32 qos[NR_QOS_PARAMS];
2252 	bool enable, user;
2253 	char *p;
2254 	int ret;
2255 
2256 	disk = blkcg_conf_get_disk(&input);
2257 	if (IS_ERR(disk))
2258 		return PTR_ERR(disk);
2259 
2260 	ioc = q_to_ioc(disk->queue);
2261 	if (!ioc) {
2262 		ret = blk_iocost_init(disk->queue);
2263 		if (ret)
2264 			goto err;
2265 		ioc = q_to_ioc(disk->queue);
2266 	}
2267 
2268 	spin_lock_irq(&ioc->lock);
2269 	memcpy(qos, ioc->params.qos, sizeof(qos));
2270 	enable = ioc->enabled;
2271 	user = ioc->user_qos_params;
2272 	spin_unlock_irq(&ioc->lock);
2273 
2274 	while ((p = strsep(&input, " \t\n"))) {
2275 		substring_t args[MAX_OPT_ARGS];
2276 		char buf[32];
2277 		int tok;
2278 		s64 v;
2279 
2280 		if (!*p)
2281 			continue;
2282 
2283 		switch (match_token(p, qos_ctrl_tokens, args)) {
2284 		case QOS_ENABLE:
2285 			match_u64(&args[0], &v);
2286 			enable = v;
2287 			continue;
2288 		case QOS_CTRL:
2289 			match_strlcpy(buf, &args[0], sizeof(buf));
2290 			if (!strcmp(buf, "auto"))
2291 				user = false;
2292 			else if (!strcmp(buf, "user"))
2293 				user = true;
2294 			else
2295 				goto einval;
2296 			continue;
2297 		}
2298 
2299 		tok = match_token(p, qos_tokens, args);
2300 		switch (tok) {
2301 		case QOS_RPPM:
2302 		case QOS_WPPM:
2303 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2304 			    sizeof(buf))
2305 				goto einval;
2306 			if (cgroup_parse_float(buf, 2, &v))
2307 				goto einval;
2308 			if (v < 0 || v > 10000)
2309 				goto einval;
2310 			qos[tok] = v * 100;
2311 			break;
2312 		case QOS_RLAT:
2313 		case QOS_WLAT:
2314 			if (match_u64(&args[0], &v))
2315 				goto einval;
2316 			qos[tok] = v;
2317 			break;
2318 		case QOS_MIN:
2319 		case QOS_MAX:
2320 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2321 			    sizeof(buf))
2322 				goto einval;
2323 			if (cgroup_parse_float(buf, 2, &v))
2324 				goto einval;
2325 			if (v < 0)
2326 				goto einval;
2327 			qos[tok] = clamp_t(s64, v * 100,
2328 					   VRATE_MIN_PPM, VRATE_MAX_PPM);
2329 			break;
2330 		default:
2331 			goto einval;
2332 		}
2333 		user = true;
2334 	}
2335 
2336 	if (qos[QOS_MIN] > qos[QOS_MAX])
2337 		goto einval;
2338 
2339 	spin_lock_irq(&ioc->lock);
2340 
2341 	if (enable) {
2342 		blk_stat_enable_accounting(ioc->rqos.q);
2343 		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2344 		ioc->enabled = true;
2345 	} else {
2346 		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2347 		ioc->enabled = false;
2348 	}
2349 
2350 	if (user) {
2351 		memcpy(ioc->params.qos, qos, sizeof(qos));
2352 		ioc->user_qos_params = true;
2353 	} else {
2354 		ioc->user_qos_params = false;
2355 	}
2356 
2357 	ioc_refresh_params(ioc, true);
2358 	spin_unlock_irq(&ioc->lock);
2359 
2360 	put_disk_and_module(disk);
2361 	return nbytes;
2362 einval:
2363 	ret = -EINVAL;
2364 err:
2365 	put_disk_and_module(disk);
2366 	return ret;
2367 }
2368 
2369 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2370 				 struct blkg_policy_data *pd, int off)
2371 {
2372 	const char *dname = blkg_dev_name(pd->blkg);
2373 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2374 	u64 *u = ioc->params.i_lcoefs;
2375 
2376 	if (!dname)
2377 		return 0;
2378 
2379 	seq_printf(sf, "%s ctrl=%s model=linear "
2380 		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
2381 		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2382 		   dname, ioc->user_cost_model ? "user" : "auto",
2383 		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2384 		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2385 	return 0;
2386 }
2387 
2388 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2389 {
2390 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2391 
2392 	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2393 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2394 	return 0;
2395 }
2396 
/*
 * Control keys accepted by ioc_cost_model_write().  These are matched before
 * the coefficient keys in i_lcoef_tokens; the NULL-pattern entry terminates
 * the table.
 */
static const match_table_t cost_ctrl_tokens = {
	{ COST_CTRL,		"ctrl=%s"	},
	{ COST_MODEL,		"model=%s"	},
	{ NR_COST_CTRL_PARAMS,	NULL		},
};
2402 
/*
 * Linear model coefficient keys accepted by ioc_cost_model_write().  Each
 * token value is used there directly as the index into the coefficient
 * array.  The NULL-pattern entry terminates the table and yields NR_I_LCOEFS
 * for unrecognized input.
 */
static const match_table_t i_lcoef_tokens = {
	{ I_LCOEF_RBPS,		"rbps=%u"	},
	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
	{ I_LCOEF_WBPS,		"wbps=%u"	},
	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
	{ NR_I_LCOEFS,		NULL		},
};
2412 
2413 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2414 				    size_t nbytes, loff_t off)
2415 {
2416 	struct gendisk *disk;
2417 	struct ioc *ioc;
2418 	u64 u[NR_I_LCOEFS];
2419 	bool user;
2420 	char *p;
2421 	int ret;
2422 
2423 	disk = blkcg_conf_get_disk(&input);
2424 	if (IS_ERR(disk))
2425 		return PTR_ERR(disk);
2426 
2427 	ioc = q_to_ioc(disk->queue);
2428 	if (!ioc) {
2429 		ret = blk_iocost_init(disk->queue);
2430 		if (ret)
2431 			goto err;
2432 		ioc = q_to_ioc(disk->queue);
2433 	}
2434 
2435 	spin_lock_irq(&ioc->lock);
2436 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2437 	user = ioc->user_cost_model;
2438 	spin_unlock_irq(&ioc->lock);
2439 
2440 	while ((p = strsep(&input, " \t\n"))) {
2441 		substring_t args[MAX_OPT_ARGS];
2442 		char buf[32];
2443 		int tok;
2444 		u64 v;
2445 
2446 		if (!*p)
2447 			continue;
2448 
2449 		switch (match_token(p, cost_ctrl_tokens, args)) {
2450 		case COST_CTRL:
2451 			match_strlcpy(buf, &args[0], sizeof(buf));
2452 			if (!strcmp(buf, "auto"))
2453 				user = false;
2454 			else if (!strcmp(buf, "user"))
2455 				user = true;
2456 			else
2457 				goto einval;
2458 			continue;
2459 		case COST_MODEL:
2460 			match_strlcpy(buf, &args[0], sizeof(buf));
2461 			if (strcmp(buf, "linear"))
2462 				goto einval;
2463 			continue;
2464 		}
2465 
2466 		tok = match_token(p, i_lcoef_tokens, args);
2467 		if (tok == NR_I_LCOEFS)
2468 			goto einval;
2469 		if (match_u64(&args[0], &v))
2470 			goto einval;
2471 		u[tok] = v;
2472 		user = true;
2473 	}
2474 
2475 	spin_lock_irq(&ioc->lock);
2476 	if (user) {
2477 		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2478 		ioc->user_cost_model = true;
2479 	} else {
2480 		ioc->user_cost_model = false;
2481 	}
2482 	ioc_refresh_params(ioc, true);
2483 	spin_unlock_irq(&ioc->lock);
2484 
2485 	put_disk_and_module(disk);
2486 	return nbytes;
2487 
2488 einval:
2489 	ret = -EINVAL;
2490 err:
2491 	put_disk_and_module(disk);
2492 	return ret;
2493 }
2494 
2495 static struct cftype ioc_files[] = {
2496 	{
2497 		.name = "weight",
2498 		.flags = CFTYPE_NOT_ON_ROOT,
2499 		.seq_show = ioc_weight_show,
2500 		.write = ioc_weight_write,
2501 	},
2502 	{
2503 		.name = "cost.qos",
2504 		.flags = CFTYPE_ONLY_ON_ROOT,
2505 		.seq_show = ioc_qos_show,
2506 		.write = ioc_qos_write,
2507 	},
2508 	{
2509 		.name = "cost.model",
2510 		.flags = CFTYPE_ONLY_ON_ROOT,
2511 		.seq_show = ioc_cost_model_show,
2512 		.write = ioc_cost_model_write,
2513 	},
2514 	{}
2515 };
2516 
2517 static struct blkcg_policy blkcg_policy_iocost = {
2518 	.dfl_cftypes	= ioc_files,
2519 	.cpd_alloc_fn	= ioc_cpd_alloc,
2520 	.cpd_free_fn	= ioc_cpd_free,
2521 	.pd_alloc_fn	= ioc_pd_alloc,
2522 	.pd_init_fn	= ioc_pd_init,
2523 	.pd_free_fn	= ioc_pd_free,
2524 };
2525 
2526 static int __init ioc_init(void)
2527 {
2528 	return blkcg_policy_register(&blkcg_policy_iocost);
2529 }
2530 
2531 static void __exit ioc_exit(void)
2532 {
2533 	return blkcg_policy_unregister(&blkcg_policy_iocost);
2534 }
2535 
/* hook ioc_init()/ioc_exit() up as the module's entry and exit points */
module_init(ioc_init);
module_exit(ioc_exit);
2538