xref: /openbmc/linux/block/blk-iocost.c (revision b2765275)
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * IO cost model based controller.
4  *
5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
7  * Copyright (C) 2019 Facebook
8  *
9  * One challenge of controlling IO resources is the lack of a trivially
10  * observable cost metric.  This is distinguished from CPU and memory where
11  * wallclock time and the number of bytes can serve as accurate enough
12  * approximations.
13  *
14  * Bandwidth and iops are the most commonly used metrics for IO devices but
15  * depending on the type and specifics of the device, different IO patterns
16  * easily lead to multiple orders of magnitude variations rendering them
17  * useless for the purpose of IO capacity distribution.  While on-device
18  * time, with a lot of crutches, could serve as a useful approximation for
19  * non-queued rotational devices, this is no longer viable with modern
20  * devices, even the rotational ones.
21  *
22  * While there is no cost metric we can trivially observe, it isn't a
23  * complete mystery.  For example, on a rotational device, seek cost
24  * dominates while a contiguous transfer contributes a smaller amount
25  * proportional to the size.  If we can characterize at least the relative
26  * costs of these different types of IOs, it should be possible to
27  * implement a reasonable work-conserving proportional IO resource
28  * distribution.
29  *
30  * 1. IO Cost Model
31  *
32  * IO cost model estimates the cost of an IO given its basic parameters and
33  * history (e.g. the end sector of the last IO).  The cost is measured in
34  * device time.  If a given IO is estimated to cost 10ms, the device should
35  * be able to process ~100 of those IOs in a second.
36  *
37  * Currently, there's only one builtin cost model - linear.  Each IO is
38  * classified as sequential or random and given a base cost accordingly.
39  * On top of that, a size cost proportional to the length of the IO is
40  * added.  While simple, this model captures the operational
41  * characteristics of a wide variety of devices well enough.  Default
42  * parameters for several different classes of devices are provided and the
43  * parameters can be configured from userspace via
44  * /sys/fs/cgroup/io.cost.model.
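 *
 * As an illustration with made-up coefficients: if a random IO has a base
 * cost of 0.1ms and each 4k page costs 0.01ms, a random 64k read would be
 * estimated at 0.1ms + 16 * 0.01ms = 0.26ms of device time.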
45  *
46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47  * device-specific coefficients.
48  *
52  * 2. Control Strategy
53  *
54  * The device virtual time (vtime) is used as the primary control metric.
55  * The control strategy is composed of the following three parts.
56  *
57  * 2-1. Vtime Distribution
58  *
59  * When a cgroup becomes active in terms of IOs, its hierarchical share is
60  * calculated.  Please consider the following hierarchy where the numbers
61  * inside parentheses denote the configured weights.
62  *
63  *           root
64  *         /       \
65  *      A (w:100)  B (w:300)
66  *      /       \
67  *  A0 (w:100)  A1 (w:100)
68  *
69  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
70  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
71  * gets 300/(100+300) or 75% share, and A0 and A1 equally split the rest,
72  * 12.5% each.  The distribution mechanism only cares about these flattened
73  * shares.  They're called hweights (hierarchical weights) and always add
74  * up to 1 (HWEIGHT_WHOLE).
75  *
76  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
77  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
78  * against the device vtime - an IO which takes 10ms on the underlying
79  * device is considered to take 80ms on A0.
80  *
81  * This constitutes the basis of IO capacity distribution.  Each cgroup's
82  * vtime is running at a rate determined by its hweight.  A cgroup tracks
83  * the vtime consumed by past IOs and can issue a new IO iff doing so
84  * wouldn't outrun the current device vtime.  Otherwise, the IO is
85  * suspended until the vtime has progressed enough to cover it.
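 *
 * For example, at a 25% hweight an IO with an absolute cost of 10ms
 * advances the cgroup's vtime by 40ms, so (ignoring the credit margin)
 * such IOs can pass at most once every 40ms of device vtime - i.e. 25%
 * of the device capacity.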
86  *
87  * 2-2. Vrate Adjustment
88  *
89  * It's unrealistic to expect the cost model to be perfect.  There are too
90  * many devices and even on the same device the overall performance
91  * fluctuates depending on numerous factors such as IO mixture and device
92  * internal garbage collection.  The controller needs to adapt dynamically.
93  *
94  * This is achieved by adjusting the overall IO rate according to how busy
95  * the device is.  If the device becomes overloaded, we're sending down too
96  * many IOs and should generally slow down.  If there are waiting issuers
97  * but the device isn't saturated, we're issuing too few and should
98  * generally speed up.
99  *
100  * To slow down, we lower the vrate - the rate at which the device vtime
101  * passes compared to the wall clock.  For example, if the vtime is running
102  * at the vrate of 75%, all cgroups added up would only be able to issue
103  * 750ms worth of IOs per second, and vice-versa for speeding up.
104  *
105  * Device busyness is determined using two criteria - rq wait and
106  * completion latencies.
107  *
108  * When a device gets saturated, the on-device and then the request queues
109  * fill up and a bio which is ready to be issued has to wait for a request
110  * to become available.  When this delay becomes noticeable, it's a clear
111  * indication that the device is saturated and we lower the vrate.  This
112  * saturation signal is fairly conservative as it only triggers when both
113  * hardware and software queues are filled up, and is used as the default
114  * busy signal.
115  *
116  * As devices can have deep queues and be unfair in how the queued commands
117  * are executed, solely depending on rq wait may not result in satisfactory
118  * control quality.  For a better control quality, completion latency QoS
119  * parameters can be configured so that the device is considered saturated
120  * if N'th percentile completion latency rises above the set point.
121  *
122  * The completion latency requirements are a function of both the
123  * underlying device characteristics and the desired IO latency quality of
124  * service.  There is an inherent trade-off - the tighter the latency QoS,
125  * the higher the bandwidth lossage.  Latency QoS is disabled by default
126  * and can be set through /sys/fs/cgroup/io.cost.qos.
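 *
 * For reference, per Documentation/admin-guide/cgroup-v2.rst the QoS
 * configuration is a per-device line roughly of the form
 *
 *   8:16 enable=1 ctrl=user rpct=95.00 rlat=10000 wpct=95.00 wlat=20000
 *        min=50.00 max=150.00
 *
 * which enables the controller on 8:16 with 95th percentile read/write
 * completion latency targets of 10ms/20ms and vrate clamped to 50-150%.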
127  *
128  * 2-3. Work Conservation
129  *
130  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
131  * periodically while B is sending out enough parallel IOs to saturate the
132  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
133  * cost per second, i.e., 10% of the device capacity.  The naive
134  * distribution of half and half would lead to 60% utilization of the
135  * device, a significant reduction in the total amount of work done
136  * compared to free-for-all competition.  This is too high a cost to pay
137  * for IO control.
138  *
139  * To conserve the total amount of work done, we keep track of how much
140  * each active cgroup is actually using and yield part of its weight if
141  * there are other cgroups which can make use of it.  In the above case,
142  * A's weight will be lowered so that it hovers above the actual usage and
143  * B would be able to use the rest.
144  *
145  * As we don't want to penalize a cgroup for donating its weight, the
146  * surplus weight adjustment factors in a margin and has an immediate
147  * snapback mechanism in case the cgroup needs more IO vtime for itself.
148  *
149  * Note that adjusting down surplus weights has the same effects as
150  * accelerating vtime for other cgroups and work conservation can also be
151  * implemented by adjusting vrate dynamically.  However, working out who
152  * can donate how much and who should take it back requires hweight
153  * propagation anyway, which makes it easier to implement and understand
154  * as a separate mechanism.
155  *
156  * 3. Monitoring
157  *
158  * Instead of debugfs or other clumsy monitoring mechanisms, this
159  * controller uses a drgn based monitoring script -
160  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
161  * https://github.com/osandov/drgn.  The output looks like the following.
162  *
163  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
164  *                 active      weight      hweight% inflt% dbt  delay usages%
165  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
166  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
167  *
168  * - per	: Timer period
169  * - cur_per	: Internal wall and device vtime clock
170  * - vrate	: Device virtual time rate against wall clock
171  * - weight	: Surplus-adjusted and configured weights
172  * - hweight	: Surplus-adjusted and configured hierarchical weights
173  * - inflt	: The percentage of in-flight IO cost at the end of last period
174  * - delay	: Deferred issuer delay induction level and duration
175  * - usages	: Usage history
176  */
177 
178 #include <linux/kernel.h>
179 #include <linux/module.h>
180 #include <linux/timer.h>
181 #include <linux/time64.h>
182 #include <linux/parser.h>
183 #include <linux/sched/signal.h>
184 #include <linux/blk-cgroup.h>
185 #include "blk-rq-qos.h"
186 #include "blk-stat.h"
187 #include "blk-wbt.h"
188 
189 #ifdef CONFIG_TRACEPOINTS
190 
191 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
192 #define TRACE_IOCG_PATH_LEN 1024
193 static DEFINE_SPINLOCK(trace_iocg_path_lock);
194 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
195 
196 #define TRACE_IOCG_PATH(type, iocg, ...)					\
197 	do {									\
198 		unsigned long flags;						\
199 		if (trace_iocost_##type##_enabled()) {				\
200 			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
201 			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
202 				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
203 			trace_iocost_##type(iocg, trace_iocg_path,		\
204 					      ##__VA_ARGS__);			\
205 			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
206 		}								\
207 	} while (0)
208 
209 #else	/* CONFIG_TRACEPOINTS */
210 #define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
211 #endif	/* CONFIG_TRACEPOINTS */
212 
213 enum {
214 	MILLION			= 1000000,
215 
216 	/* timer period is calculated from latency requirements, bound it */
217 	MIN_PERIOD		= USEC_PER_MSEC,
218 	MAX_PERIOD		= USEC_PER_SEC,
219 
220 	/*
221 	 * A cgroup's vtime can run 50% behind the device vtime, which
222 	 * serves as its IO credit buffer.  Surplus weight adjustment is
223 	 * immediately canceled if the vtime margin runs below 10%.
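	 *
	 * For example, with a 20ms period that is a 10ms vtime credit
	 * buffer, and the snapback triggers once less than 2ms worth of
	 * budget remains.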
224 	 */
225 	MARGIN_PCT		= 50,
226 	INUSE_MARGIN_PCT	= 10,
227 
228 	/* Have some play in waitq timer operations */
229 	WAITQ_TIMER_MARGIN_PCT	= 5,
230 
231 	/*
232 	 * vtime can wrap well within a reasonable uptime when vrate is
233 	 * consistently raised.  Don't trust recorded cgroup vtime if the
234 	 * period counter indicates that it's older than 5mins.
235 	 */
236 	VTIME_VALID_DUR		= 300 * USEC_PER_SEC,
237 
238 	/*
239 	 * Remember the past three non-zero usages and use the max for
240 	 * surplus calculation.  Three slots guarantee that we remember one
241 	 * full period usage from the last active stretch even after
242 	 * partial deactivation and re-activation periods.  Don't start
243 	 * giving away weight before collecting two data points to prevent
244 	 * hweight adjustments based on one partial activation period.
245 	 */
246 	NR_USAGE_SLOTS		= 3,
247 	MIN_VALID_USAGES	= 2,
248 
249 	/* 1/64k is granular enough and can easily be handled w/ u32 */
250 	HWEIGHT_WHOLE		= 1 << 16,
251 
252 	/*
253 	 * As vtime is used to calculate the cost of each IO, it needs to
254 	 * be fairly high precision.  For example, it should be able to
255 	 * represent the cost of a single page worth of discard with
256 	 * sufficient accuracy.  At the same time, it should be able to
257 	 * represent reasonably long enough durations to be useful and
258 	 * convenient during operation.
259 	 *
260 	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
261 	 * granularity and days of wrap-around time even at extreme vrates.
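	 *
	 * As a sanity check, 2^37 / 10^9 is ~137 vtime units per
	 * nanosecond, and a u64 vtime wraps after 2^27 device-vtime
	 * seconds - roughly two weeks of wall clock even at the maximum
	 * 10000% vrate.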
262 	 */
263 	VTIME_PER_SEC_SHIFT	= 37,
264 	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
265 	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
266 
267 	/* bound vrate adjustments within two orders of magnitude */
268 	VRATE_MIN_PPM		= 10000,	/* 1% */
269 	VRATE_MAX_PPM		= 100000000,	/* 10000% */
270 
271 	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
272 	VRATE_CLAMP_ADJ_PCT	= 4,
273 
274 	/* if IOs end up waiting for requests, issue less */
275 	RQ_WAIT_BUSY_PCT	= 5,
276 
277 	/* unbusy hysteresis */
278 	UNBUSY_THR_PCT		= 75,
279 
280 	/* don't let cmds which take a very long time pin lagging for too long */
281 	MAX_LAGGING_PERIODS	= 10,
282 
283 	/*
284 	 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
285 	 * donate the surplus.
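	 *
	 * For example, an iocg with a 40% hweight which only used 20%
	 * scales to 20 * 1.25 + 2 = 27, and as 27 + 3 <= 40 it donates
	 * down to a 27% target.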
286 	 */
287 	SURPLUS_SCALE_PCT	= 125,			/* * 125% */
288 	SURPLUS_SCALE_ABS	= HWEIGHT_WHOLE / 50,	/* + 2% */
289 	SURPLUS_MIN_ADJ_DELTA	= HWEIGHT_WHOLE / 33,	/* 3% */
290 
291 	/* switch iff the conditions are met for longer than this */
292 	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
293 
294 	/*
295 	 * Count IO size in 4k pages.  The 12 bit shift keeps the
296 	 * size-proportional components of the cost calculation within a
297 	 * similar number of digits as the per-IO cost components.
298 	 */
299 	IOC_PAGE_SHIFT		= 12,
300 	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
301 	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
302 
303 	/* consider IOs more than 16M apart random for the linear model */
304 	LCOEF_RANDIO_PAGES	= 4096,
305 };
306 
307 enum ioc_running {
308 	IOC_IDLE,
309 	IOC_RUNNING,
310 	IOC_STOP,
311 };
312 
313 /* io.cost.qos controls including per-dev enable of the whole controller */
314 enum {
315 	QOS_ENABLE,
316 	QOS_CTRL,
317 	NR_QOS_CTRL_PARAMS,
318 };
319 
320 /* io.cost.qos params */
321 enum {
322 	QOS_RPPM,
323 	QOS_RLAT,
324 	QOS_WPPM,
325 	QOS_WLAT,
326 	QOS_MIN,
327 	QOS_MAX,
328 	NR_QOS_PARAMS,
329 };
330 
331 /* io.cost.model controls */
332 enum {
333 	COST_CTRL,
334 	COST_MODEL,
335 	NR_COST_CTRL_PARAMS,
336 };
337 
338 /* builtin linear cost model coefficients */
339 enum {
340 	I_LCOEF_RBPS,
341 	I_LCOEF_RSEQIOPS,
342 	I_LCOEF_RRANDIOPS,
343 	I_LCOEF_WBPS,
344 	I_LCOEF_WSEQIOPS,
345 	I_LCOEF_WRANDIOPS,
346 	NR_I_LCOEFS,
347 };
348 
349 enum {
350 	LCOEF_RPAGE,
351 	LCOEF_RSEQIO,
352 	LCOEF_RRANDIO,
353 	LCOEF_WPAGE,
354 	LCOEF_WSEQIO,
355 	LCOEF_WRANDIO,
356 	NR_LCOEFS,
357 };
358 
359 enum {
360 	AUTOP_INVALID,
361 	AUTOP_HDD,
362 	AUTOP_SSD_QD1,
363 	AUTOP_SSD_DFL,
364 	AUTOP_SSD_FAST,
365 };
366 
367 struct ioc_gq;
368 
369 struct ioc_params {
370 	u32				qos[NR_QOS_PARAMS];
371 	u64				i_lcoefs[NR_I_LCOEFS];
372 	u64				lcoefs[NR_LCOEFS];
373 	u32				too_fast_vrate_pct;
374 	u32				too_slow_vrate_pct;
375 };
376 
377 struct ioc_missed {
378 	u32				nr_met;
379 	u32				nr_missed;
380 	u32				last_met;
381 	u32				last_missed;
382 };
383 
384 struct ioc_pcpu_stat {
385 	struct ioc_missed		missed[2];
386 
387 	u64				rq_wait_ns;
388 	u64				last_rq_wait_ns;
389 };
390 
391 /* per device */
392 struct ioc {
393 	struct rq_qos			rqos;
394 
395 	bool				enabled;
396 
397 	struct ioc_params		params;
398 	u32				period_us;
399 	u32				margin_us;
400 	u64				vrate_min;
401 	u64				vrate_max;
402 
403 	spinlock_t			lock;
404 	struct timer_list		timer;
405 	struct list_head		active_iocgs;	/* active cgroups */
406 	struct ioc_pcpu_stat __percpu	*pcpu_stat;
407 
408 	enum ioc_running		running;
409 	atomic64_t			vtime_rate;
410 
411 	seqcount_t			period_seqcount;
412 	u32				period_at;	/* wallclock starttime */
413 	u64				period_at_vtime; /* vtime starttime */
414 
415 	atomic64_t			cur_period;	/* inc'd each period */
416 	int				busy_level;	/* saturation history */
417 
418 	u64				inuse_margin_vtime;
419 	bool				weights_updated;
420 	atomic_t			hweight_gen;	/* for lazy hweights */
421 
422 	u64				autop_too_fast_at;
423 	u64				autop_too_slow_at;
424 	int				autop_idx;
425 	bool				user_qos_params:1;
426 	bool				user_cost_model:1;
427 };
428 
429 /* per device-cgroup pair */
430 struct ioc_gq {
431 	struct blkg_policy_data		pd;
432 	struct ioc			*ioc;
433 
434 	/*
435 	 * An iocg can get its weight from two sources - an explicit
436 	 * per-device-cgroup configuration or the default weight of the
437 	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
438 	 * configuration.  `weight` is the effective weight considering
439 	 * both sources.
440 	 *
441 	 * When an idle cgroup becomes active its `active` goes from 0 to
442 	 * `weight`.  `inuse` is the surplus adjusted active weight.
443 	 * `active` and `inuse` are used to calculate `hweight_active` and
444 	 * `hweight_inuse`.
445 	 *
446 	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
447 	 * surplus adjustments.
448 	 */
449 	u32				cfg_weight;
450 	u32				weight;
451 	u32				active;
452 	u32				inuse;
453 	u32				last_inuse;
454 
455 	sector_t			cursor;		/* to detect randio */
456 
457 	/*
458 	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
459 	 * issued.  If lagging behind device vtime, the delta represents
460 	 * the currently available IO budget.  If running ahead, the
461 	 * overage.
462 	 *
463 	 * `vtime_done` is the same but progressed on completion rather
464 	 * than issue.  The delta behind `vtime` represents the cost of
465 	 * currently in-flight IOs.
466 	 *
467 	 * `last_vtime` is used to remember `vtime` at the end of the last
468 	 * period to calculate utilization.
469 	 */
470 	atomic64_t			vtime;
471 	atomic64_t			done_vtime;
472 	atomic64_t			abs_vdebt;
473 	u64				last_vtime;
474 
475 	/*
476 	 * The period this iocg was last active in.  Used for deactivation
477 	 * and invalidating `vtime`.
478 	 */
479 	atomic64_t			active_period;
480 	struct list_head		active_list;
481 
482 	/* see __propagate_active_weight() and current_hweight() for details */
483 	u64				child_active_sum;
484 	u64				child_inuse_sum;
485 	int				hweight_gen;
486 	u32				hweight_active;
487 	u32				hweight_inuse;
488 	bool				has_surplus;
489 
490 	struct wait_queue_head		waitq;
491 	struct hrtimer			waitq_timer;
492 	struct hrtimer			delay_timer;
493 
494 	/* usage is recorded as fractions of HWEIGHT_WHOLE */
495 	int				usage_idx;
496 	u32				usages[NR_USAGE_SLOTS];
497 
498 	/* this iocg's depth in the hierarchy and ancestors including self */
499 	int				level;
500 	struct ioc_gq			*ancestors[];
501 };
502 
503 /* per cgroup */
504 struct ioc_cgrp {
505 	struct blkcg_policy_data	cpd;
506 	unsigned int			dfl_weight;
507 };
508 
509 struct ioc_now {
510 	u64				now_ns;
511 	u32				now;
512 	u64				vnow;
513 	u64				vrate;
514 };
515 
516 struct iocg_wait {
517 	struct wait_queue_entry		wait;
518 	struct bio			*bio;
519 	u64				abs_cost;
520 	bool				committed;
521 };
522 
523 struct iocg_wake_ctx {
524 	struct ioc_gq			*iocg;
525 	u32				hw_inuse;
526 	s64				vbudget;
527 };
528 
529 static const struct ioc_params autop[] = {
530 	[AUTOP_HDD] = {
531 		.qos				= {
532 			[QOS_RLAT]		=        250000, /* 250ms */
533 			[QOS_WLAT]		=        250000,
534 			[QOS_MIN]		= VRATE_MIN_PPM,
535 			[QOS_MAX]		= VRATE_MAX_PPM,
536 		},
537 		.i_lcoefs			= {
538 			[I_LCOEF_RBPS]		=     174019176,
539 			[I_LCOEF_RSEQIOPS]	=         41708,
540 			[I_LCOEF_RRANDIOPS]	=           370,
541 			[I_LCOEF_WBPS]		=     178075866,
542 			[I_LCOEF_WSEQIOPS]	=         42705,
543 			[I_LCOEF_WRANDIOPS]	=           378,
544 		},
545 	},
546 	[AUTOP_SSD_QD1] = {
547 		.qos				= {
548 			[QOS_RLAT]		=         25000, /* 25ms */
549 			[QOS_WLAT]		=         25000,
550 			[QOS_MIN]		= VRATE_MIN_PPM,
551 			[QOS_MAX]		= VRATE_MAX_PPM,
552 		},
553 		.i_lcoefs			= {
554 			[I_LCOEF_RBPS]		=     245855193,
555 			[I_LCOEF_RSEQIOPS]	=         61575,
556 			[I_LCOEF_RRANDIOPS]	=          6946,
557 			[I_LCOEF_WBPS]		=     141365009,
558 			[I_LCOEF_WSEQIOPS]	=         33716,
559 			[I_LCOEF_WRANDIOPS]	=         26796,
560 		},
561 	},
562 	[AUTOP_SSD_DFL] = {
563 		.qos				= {
564 			[QOS_RLAT]		=         25000, /* 25ms */
565 			[QOS_WLAT]		=         25000,
566 			[QOS_MIN]		= VRATE_MIN_PPM,
567 			[QOS_MAX]		= VRATE_MAX_PPM,
568 		},
569 		.i_lcoefs			= {
570 			[I_LCOEF_RBPS]		=     488636629,
571 			[I_LCOEF_RSEQIOPS]	=          8932,
572 			[I_LCOEF_RRANDIOPS]	=          8518,
573 			[I_LCOEF_WBPS]		=     427891549,
574 			[I_LCOEF_WSEQIOPS]	=         28755,
575 			[I_LCOEF_WRANDIOPS]	=         21940,
576 		},
577 		.too_fast_vrate_pct		=           500,
578 	},
579 	[AUTOP_SSD_FAST] = {
580 		.qos				= {
581 			[QOS_RLAT]		=          5000, /* 5ms */
582 			[QOS_WLAT]		=          5000,
583 			[QOS_MIN]		= VRATE_MIN_PPM,
584 			[QOS_MAX]		= VRATE_MAX_PPM,
585 		},
586 		.i_lcoefs			= {
587 			[I_LCOEF_RBPS]		=    3102524156LLU,
588 			[I_LCOEF_RSEQIOPS]	=        724816,
589 			[I_LCOEF_RRANDIOPS]	=        778122,
590 			[I_LCOEF_WBPS]		=    1742780862LLU,
591 			[I_LCOEF_WSEQIOPS]	=        425702,
592 			[I_LCOEF_WRANDIOPS]	=	 443193,
593 		},
594 		.too_slow_vrate_pct		=            10,
595 	},
596 };
597 
598 /*
599  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
600  * vtime credit shortage and down on device saturation.
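 *
 * For example, a busy_level which has drifted to -40 indexes an
 * adjustment of 4 and bumps vrate up by 4% that period, while a
 * busy_level of +8 cuts it by 1%.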
601  */
602 static u32 vrate_adj_pct[] =
603 	{ 0, 0, 0, 0,
604 	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
605 	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
606 	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
607 
608 static struct blkcg_policy blkcg_policy_iocost;
609 
610 /* accessors and helpers */
611 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
612 {
613 	return container_of(rqos, struct ioc, rqos);
614 }
615 
616 static struct ioc *q_to_ioc(struct request_queue *q)
617 {
618 	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
619 }
620 
621 static const char *q_name(struct request_queue *q)
622 {
623 	if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
624 		return kobject_name(q->kobj.parent);
625 	else
626 		return "<unknown>";
627 }
628 
629 static const char __maybe_unused *ioc_name(struct ioc *ioc)
630 {
631 	return q_name(ioc->rqos.q);
632 }
633 
634 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
635 {
636 	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
637 }
638 
639 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
640 {
641 	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
642 }
643 
644 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
645 {
646 	return pd_to_blkg(&iocg->pd);
647 }
648 
649 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
650 {
651 	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
652 			    struct ioc_cgrp, cpd);
653 }
654 
655 /*
656  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
657  * weight, the more expensive each IO.  Must round up.
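 *
 * For example, at a hw_inuse of 25% (HWEIGHT_WHOLE / 4), an IO with an
 * absolute cost of 1ms worth of vtime is charged as 4ms.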
658  */
659 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
660 {
661 	return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
662 }
663 
664 /*
665  * The inverse of abs_cost_to_cost().  Must round up.
666  */
667 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
668 {
669 	return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
670 }
671 
672 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
673 {
674 	bio->bi_iocost_cost = cost;
675 	atomic64_add(cost, &iocg->vtime);
676 }
677 
678 #define CREATE_TRACE_POINTS
679 #include <trace/events/iocost.h>
680 
681 /* latency QoS params changed, update period_us and all the dependent params */
682 static void ioc_refresh_period_us(struct ioc *ioc)
683 {
684 	u32 ppm, lat, multi, period_us;
685 
686 	lockdep_assert_held(&ioc->lock);
687 
688 	/* pick the higher latency target */
689 	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
690 		ppm = ioc->params.qos[QOS_RPPM];
691 		lat = ioc->params.qos[QOS_RLAT];
692 	} else {
693 		ppm = ioc->params.qos[QOS_WPPM];
694 		lat = ioc->params.qos[QOS_WLAT];
695 	}
696 
697 	/*
698 	 * We want the period to be long enough to contain a healthy number
699 	 * of IOs while short enough for granular control.  Define it as a
700 	 * multiple of the latency target.  Ideally, the multiplier should
701 	 * be scaled according to the percentile so that it would nominally
702 	 * contain a certain number of requests.  Let's be simpler and
703 	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
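	 *
	 * For example, a 95th percentile read latency target of 10ms
	 * yields multi = max((1000000 - 950000) / 50000, 2) = 2 and thus
	 * a 20ms period before clamping.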
704 	 */
705 	if (ppm)
706 		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
707 	else
708 		multi = 2;
709 	period_us = multi * lat;
710 	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
711 
712 	/* calculate dependent params */
713 	ioc->period_us = period_us;
714 	ioc->margin_us = period_us * MARGIN_PCT / 100;
715 	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
716 			period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
717 }
718 
719 static int ioc_autop_idx(struct ioc *ioc)
720 {
721 	int idx = ioc->autop_idx;
722 	const struct ioc_params *p = &autop[idx];
723 	u32 vrate_pct;
724 	u64 now_ns;
725 
726 	/* rotational? */
727 	if (!blk_queue_nonrot(ioc->rqos.q))
728 		return AUTOP_HDD;
729 
730 	/* handle SATA SSDs w/ broken NCQ */
731 	if (blk_queue_depth(ioc->rqos.q) == 1)
732 		return AUTOP_SSD_QD1;
733 
734 	/* use one of the normal ssd sets */
735 	if (idx < AUTOP_SSD_DFL)
736 		return AUTOP_SSD_DFL;
737 
738 	/* if user is overriding anything, maintain what was there */
739 	if (ioc->user_qos_params || ioc->user_cost_model)
740 		return idx;
741 
742 	/* step up/down based on the vrate */
743 	vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
744 			      VTIME_PER_USEC);
745 	now_ns = ktime_get_ns();
746 
747 	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
748 		if (!ioc->autop_too_fast_at)
749 			ioc->autop_too_fast_at = now_ns;
750 		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
751 			return idx + 1;
752 	} else {
753 		ioc->autop_too_fast_at = 0;
754 	}
755 
756 	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
757 		if (!ioc->autop_too_slow_at)
758 			ioc->autop_too_slow_at = now_ns;
759 		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
760 			return idx - 1;
761 	} else {
762 		ioc->autop_too_slow_at = 0;
763 	}
764 
765 	return idx;
766 }
767 
768 /*
769  * Take the following as input
770  *
771  *  @bps	maximum sequential throughput
772  *  @seqiops	maximum sequential 4k iops
773  *  @randiops	maximum random 4k iops
774  *
775  * and calculate the linear model cost coefficients.
776  *
777  *  *@page	per-page cost		1s / (@bps / 4096)
778  *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
779  *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
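 *
 * For example, a hypothetical device with @bps = 400M, @seqiops = 100k
 * and @randiops = 20k works out to *@page ~= 10.2us, *@seqio = 0 (1s /
 * 100k is already below the per-page cost) and *@randio ~= 39.8us.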
780  */
781 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
782 			u64 *page, u64 *seqio, u64 *randio)
783 {
784 	u64 v;
785 
786 	*page = *seqio = *randio = 0;
787 
788 	if (bps)
789 		*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
790 					   DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
791 
792 	if (seqiops) {
793 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
794 		if (v > *page)
795 			*seqio = v - *page;
796 	}
797 
798 	if (randiops) {
799 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
800 		if (v > *page)
801 			*randio = v - *page;
802 	}
803 }
804 
805 static void ioc_refresh_lcoefs(struct ioc *ioc)
806 {
807 	u64 *u = ioc->params.i_lcoefs;
808 	u64 *c = ioc->params.lcoefs;
809 
810 	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
811 		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
812 	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
813 		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
814 }
815 
816 static bool ioc_refresh_params(struct ioc *ioc, bool force)
817 {
818 	const struct ioc_params *p;
819 	int idx;
820 
821 	lockdep_assert_held(&ioc->lock);
822 
823 	idx = ioc_autop_idx(ioc);
824 	p = &autop[idx];
825 
826 	if (idx == ioc->autop_idx && !force)
827 		return false;
828 
829 	if (idx != ioc->autop_idx)
830 		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
831 
832 	ioc->autop_idx = idx;
833 	ioc->autop_too_fast_at = 0;
834 	ioc->autop_too_slow_at = 0;
835 
836 	if (!ioc->user_qos_params)
837 		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
838 	if (!ioc->user_cost_model)
839 		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
840 
841 	ioc_refresh_period_us(ioc);
842 	ioc_refresh_lcoefs(ioc);
843 
844 	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
845 					    VTIME_PER_USEC, MILLION);
846 	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
847 				   VTIME_PER_USEC, MILLION);
848 
849 	return true;
850 }
851 
852 /* take a snapshot of the current [v]time and vrate */
853 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
854 {
855 	unsigned seq;
856 
857 	now->now_ns = ktime_get();
858 	now->now = ktime_to_us(now->now_ns);
859 	now->vrate = atomic64_read(&ioc->vtime_rate);
860 
861 	/*
862 	 * The current vtime is
863 	 *
864 	 *   vtime at period start + (wallclock time since the start) * vrate
865 	 *
866 	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
867 	 * needed, they're seqcount protected.
868 	 */
869 	do {
870 		seq = read_seqcount_begin(&ioc->period_seqcount);
871 		now->vnow = ioc->period_at_vtime +
872 			(now->now - ioc->period_at) * now->vrate;
873 	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
874 }
875 
876 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
877 {
878 	lockdep_assert_held(&ioc->lock);
879 	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
880 
881 	write_seqcount_begin(&ioc->period_seqcount);
882 	ioc->period_at = now->now;
883 	ioc->period_at_vtime = now->vnow;
884 	write_seqcount_end(&ioc->period_seqcount);
885 
886 	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
887 	add_timer(&ioc->timer);
888 }
889 
890 /*
891  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
892  * weight sums and propagate upwards accordingly.
893  */
894 static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
895 {
896 	struct ioc *ioc = iocg->ioc;
897 	int lvl;
898 
899 	lockdep_assert_held(&ioc->lock);
900 
901 	inuse = min(active, inuse);
902 
903 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
904 		struct ioc_gq *parent = iocg->ancestors[lvl];
905 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
906 		u32 parent_active = 0, parent_inuse = 0;
907 
908 		/* update the level sums */
909 		parent->child_active_sum += (s32)(active - child->active);
910 		parent->child_inuse_sum += (s32)(inuse - child->inuse);
911 		/* apply the updates */
912 		child->active = active;
913 		child->inuse = inuse;
914 
915 		/*
916 		 * The delta between the inuse and active sums indicates how
917 		 * much weight is being given away.  Parent's inuse
918 		 * and active should reflect the ratio.
919 		 */
920 		if (parent->child_active_sum) {
921 			parent_active = parent->weight;
922 			parent_inuse = DIV64_U64_ROUND_UP(
923 				parent_active * parent->child_inuse_sum,
924 				parent->child_active_sum);
925 		}
926 
927 		/* do we need to keep walking up? */
928 		if (parent_active == parent->active &&
929 		    parent_inuse == parent->inuse)
930 			break;
931 
932 		active = parent_active;
933 		inuse = parent_inuse;
934 	}
935 
936 	ioc->weights_updated = true;
937 }
938 
939 static void commit_active_weights(struct ioc *ioc)
940 {
941 	lockdep_assert_held(&ioc->lock);
942 
943 	if (ioc->weights_updated) {
944 		/* paired with rmb in current_hweight(), see there */
945 		smp_wmb();
946 		atomic_inc(&ioc->hweight_gen);
947 		ioc->weights_updated = false;
948 	}
949 }
950 
951 static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
952 {
953 	__propagate_active_weight(iocg, active, inuse);
954 	commit_active_weights(iocg->ioc);
955 }
956 
957 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
958 {
959 	struct ioc *ioc = iocg->ioc;
960 	int lvl;
961 	u32 hwa, hwi;
962 	int ioc_gen;
963 
964 	/* hot path - if uptodate, use cached */
965 	ioc_gen = atomic_read(&ioc->hweight_gen);
966 	if (ioc_gen == iocg->hweight_gen)
967 		goto out;
968 
969 	/*
970 	 * Paired with wmb in commit_active_weights().  If we saw the
971 	 * updated hweight_gen, all the weight updates from
972 	 * __propagate_active_weight() are visible too.
973 	 *
974 	 * We can race with weight updates during calculation and get it
975 	 * wrong.  However, hweight_gen would have changed and a future
976 	 * reader will recalculate and we're guaranteed to discard the
977 	 * wrong result soon.
978 	 */
979 	smp_rmb();
980 
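	/*
	 * Walk from the root towards this iocg, scaling by the ratio of
	 * each level's (in)active weight to the level sum.  With the
	 * hierarchy in the comment at the top of this file, A0's
	 * hweight_active works out to
	 * HWEIGHT_WHOLE * 100 / (100 + 300) * 100 / (100 + 100) = 12.5%.
	 */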
981 	hwa = hwi = HWEIGHT_WHOLE;
982 	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
983 		struct ioc_gq *parent = iocg->ancestors[lvl];
984 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
985 		u32 active_sum = READ_ONCE(parent->child_active_sum);
986 		u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
987 		u32 active = READ_ONCE(child->active);
988 		u32 inuse = READ_ONCE(child->inuse);
989 
990 		/* we can race with deactivations and either may read as zero */
991 		if (!active_sum || !inuse_sum)
992 			continue;
993 
994 		active_sum = max(active, active_sum);
995 		hwa = hwa * active / active_sum;	/* max 16bits * 10000 */
996 
997 		inuse_sum = max(inuse, inuse_sum);
998 		hwi = hwi * inuse / inuse_sum;		/* max 16bits * 10000 */
999 	}
1000 
1001 	iocg->hweight_active = max_t(u32, hwa, 1);
1002 	iocg->hweight_inuse = max_t(u32, hwi, 1);
1003 	iocg->hweight_gen = ioc_gen;
1004 out:
1005 	if (hw_activep)
1006 		*hw_activep = iocg->hweight_active;
1007 	if (hw_inusep)
1008 		*hw_inusep = iocg->hweight_inuse;
1009 }
1010 
1011 static void weight_updated(struct ioc_gq *iocg)
1012 {
1013 	struct ioc *ioc = iocg->ioc;
1014 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1015 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1016 	u32 weight;
1017 
1018 	lockdep_assert_held(&ioc->lock);
1019 
1020 	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1021 	if (weight != iocg->weight && iocg->active)
1022 		propagate_active_weight(iocg, weight,
1023 			DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1024 	iocg->weight = weight;
1025 }
1026 
1027 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1028 {
1029 	struct ioc *ioc = iocg->ioc;
1030 	u64 last_period, cur_period, max_period_delta;
1031 	u64 vtime, vmargin, vmin;
1032 	int i;
1033 
1034 	/*
1035 	 * If we seem to be already active, just update the stamp to tell
1036 	 * the timer that we're still active.  We don't mind occasional races.
1037 	 */
1038 	if (!list_empty(&iocg->active_list)) {
1039 		ioc_now(ioc, now);
1040 		cur_period = atomic64_read(&ioc->cur_period);
1041 		if (atomic64_read(&iocg->active_period) != cur_period)
1042 			atomic64_set(&iocg->active_period, cur_period);
1043 		return true;
1044 	}
1045 
1046 	/* racy check on internal node IOs, treat as root level IOs */
1047 	if (iocg->child_active_sum)
1048 		return false;
1049 
1050 	spin_lock_irq(&ioc->lock);
1051 
1052 	ioc_now(ioc, now);
1053 
1054 	/* update period */
1055 	cur_period = atomic64_read(&ioc->cur_period);
1056 	last_period = atomic64_read(&iocg->active_period);
1057 	atomic64_set(&iocg->active_period, cur_period);
1058 
1059 	/* already activated or breaking leaf-only constraint? */
1060 	if (!list_empty(&iocg->active_list))
1061 		goto succeed_unlock;
1062 	for (i = iocg->level - 1; i > 0; i--)
1063 		if (!list_empty(&iocg->ancestors[i]->active_list))
1064 			goto fail_unlock;
1065 
1066 	if (iocg->child_active_sum)
1067 		goto fail_unlock;
1068 
1069 	/*
1070 	 * vtime may wrap when vrate is raised substantially due to
1071 	 * underestimated IO costs.  Look at the period and ignore its
1072 	 * vtime if the iocg has been idle for too long.  Also, cap the
1073 	 * budget it can start with to the margin.
1074 	 */
1075 	max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1076 	vtime = atomic64_read(&iocg->vtime);
1077 	vmargin = ioc->margin_us * now->vrate;
1078 	vmin = now->vnow - vmargin;
1079 
1080 	if (last_period + max_period_delta < cur_period ||
1081 	    time_before64(vtime, vmin)) {
1082 		atomic64_add(vmin - vtime, &iocg->vtime);
1083 		atomic64_add(vmin - vtime, &iocg->done_vtime);
1084 		vtime = vmin;
1085 	}
1086 
1087 	/*
1088 	 * Activate, propagate weight and start period timer if not
1089 	 * running.  Reset hweight_gen to avoid accidental match from
1090 	 * wrapping.
1091 	 */
1092 	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1093 	list_add(&iocg->active_list, &ioc->active_iocgs);
1094 	propagate_active_weight(iocg, iocg->weight,
1095 				iocg->last_inuse ?: iocg->weight);
1096 
1097 	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1098 			last_period, cur_period, vtime);
1099 
1100 	iocg->last_vtime = vtime;
1101 
1102 	if (ioc->running == IOC_IDLE) {
1103 		ioc->running = IOC_RUNNING;
1104 		ioc_start_period(ioc, now);
1105 	}
1106 
1107 succeed_unlock:
1108 	spin_unlock_irq(&ioc->lock);
1109 	return true;
1110 
1111 fail_unlock:
1112 	spin_unlock_irq(&ioc->lock);
1113 	return false;
1114 }
1115 
1116 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1117 			int flags, void *key)
1118 {
1119 	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1120 	struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1121 	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1122 
1123 	ctx->vbudget -= cost;
1124 
1125 	if (ctx->vbudget < 0)
1126 		return -1;
1127 
1128 	iocg_commit_bio(ctx->iocg, wait->bio, cost);
1129 
1130 	/*
1131 	 * autoremove_wake_function() removes the wait entry only when it
1132 	 * actually changed the task state.  We want the wait always
1133 	 * removed.  Remove explicitly and use default_wake_function().
1134 	 */
1135 	list_del_init(&wq_entry->entry);
1136 	wait->committed = true;
1137 
1138 	default_wake_function(wq_entry, mode, flags, key);
1139 	return 0;
1140 }
1141 
1142 static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1143 {
1144 	struct ioc *ioc = iocg->ioc;
1145 	struct iocg_wake_ctx ctx = { .iocg = iocg };
1146 	u64 margin_ns = (u64)(ioc->period_us *
1147 			      WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1148 	u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
1149 	s64 vbudget;
1150 	u32 hw_inuse;
1151 
1152 	lockdep_assert_held(&iocg->waitq.lock);
1153 
1154 	current_hweight(iocg, NULL, &hw_inuse);
1155 	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1156 
1157 	/* pay off debt */
1158 	abs_vdebt = atomic64_read(&iocg->abs_vdebt);
1159 	vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
1160 	if (vdebt && vbudget > 0) {
1161 		u64 delta = min_t(u64, vbudget, vdebt);
1162 		u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1163 				    abs_vdebt);
1164 
1165 		atomic64_add(delta, &iocg->vtime);
1166 		atomic64_add(delta, &iocg->done_vtime);
1167 		atomic64_sub(abs_delta, &iocg->abs_vdebt);
1168 		if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
1169 			atomic64_set(&iocg->abs_vdebt, 0);
1170 	}
1171 
1172 	/*
1173 	 * Wake up the ones which are due and see how much vtime we'll need
1174 	 * for the next one.
1175 	 */
1176 	ctx.hw_inuse = hw_inuse;
1177 	ctx.vbudget = vbudget - vdebt;
1178 	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1179 	if (!waitqueue_active(&iocg->waitq))
1180 		return;
1181 	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1182 		return;
1183 
1184 	/* determine next wakeup, add a quarter margin to guarantee chunking */
1185 	vshortage = -ctx.vbudget;
1186 	expires = now->now_ns +
1187 		DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1188 	expires += margin_ns / 4;
1189 
1190 	/* if already active and close enough, don't bother */
1191 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1192 	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1193 	    abs(oexpires - expires) <= margin_ns / 4)
1194 		return;
1195 
1196 	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1197 			       margin_ns / 4, HRTIMER_MODE_ABS);
1198 }
1199 
1200 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1201 {
1202 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1203 	struct ioc_now now;
1204 	unsigned long flags;
1205 
1206 	ioc_now(iocg->ioc, &now);
1207 
1208 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1209 	iocg_kick_waitq(iocg, &now);
1210 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1211 
1212 	return HRTIMER_NORESTART;
1213 }
1214 
1215 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1216 {
1217 	struct ioc *ioc = iocg->ioc;
1218 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1219 	u64 vtime = atomic64_read(&iocg->vtime);
1220 	u64 vmargin = ioc->margin_us * now->vrate;
1221 	u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1222 	u64 expires, oexpires;
1223 	u32 hw_inuse;
1224 
1225 	/* debt-adjust vtime */
1226 	current_hweight(iocg, NULL, &hw_inuse);
1227 	vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
1228 
1229 	/* clear or maintain depending on the overage */
1230 	if (time_before_eq64(vtime, now->vnow)) {
1231 		blkcg_clear_delay(blkg);
1232 		return false;
1233 	}
1234 	if (!atomic_read(&blkg->use_delay) &&
1235 	    time_before_eq64(vtime, now->vnow + vmargin))
1236 		return false;
1237 
1238 	/* use delay */
1239 	if (cost) {
1240 		u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1241 						 now->vrate);
1242 		blkcg_add_delay(blkg, now->now_ns, cost_ns);
1243 	}
1244 	blkcg_use_delay(blkg);
1245 
1246 	expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1247 						   now->vrate) * NSEC_PER_USEC;
1248 
1249 	/* if already active and close enough, don't bother */
1250 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1251 	if (hrtimer_is_queued(&iocg->delay_timer) &&
1252 	    abs(oexpires - expires) <= margin_ns / 4)
1253 		return true;
1254 
1255 	hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1256 			       margin_ns / 4, HRTIMER_MODE_ABS);
1257 	return true;
1258 }
1259 
1260 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1261 {
1262 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1263 	struct ioc_now now;
1264 
1265 	ioc_now(iocg->ioc, &now);
1266 	iocg_kick_delay(iocg, &now, 0);
1267 
1268 	return HRTIMER_NORESTART;
1269 }
1270 
1271 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1272 {
1273 	u32 nr_met[2] = { };
1274 	u32 nr_missed[2] = { };
1275 	u64 rq_wait_ns = 0;
1276 	int cpu, rw;
1277 
1278 	for_each_online_cpu(cpu) {
1279 		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1280 		u64 this_rq_wait_ns;
1281 
1282 		for (rw = READ; rw <= WRITE; rw++) {
1283 			u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1284 			u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1285 
1286 			nr_met[rw] += this_met - stat->missed[rw].last_met;
1287 			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1288 			stat->missed[rw].last_met = this_met;
1289 			stat->missed[rw].last_missed = this_missed;
1290 		}
1291 
1292 		this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1293 		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1294 		stat->last_rq_wait_ns = this_rq_wait_ns;
1295 	}
1296 
1297 	for (rw = READ; rw <= WRITE; rw++) {
1298 		if (nr_met[rw] + nr_missed[rw])
1299 			missed_ppm_ar[rw] =
1300 				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1301 						   nr_met[rw] + nr_missed[rw]);
1302 		else
1303 			missed_ppm_ar[rw] = 0;
1304 	}
1305 
1306 	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1307 				   ioc->period_us * NSEC_PER_USEC);
1308 }
1309 
1310 /* was iocg idle this period? */
1311 static bool iocg_is_idle(struct ioc_gq *iocg)
1312 {
1313 	struct ioc *ioc = iocg->ioc;
1314 
1315 	/* did something get issued this period? */
1316 	if (atomic64_read(&iocg->active_period) ==
1317 	    atomic64_read(&ioc->cur_period))
1318 		return false;
1319 
1320 	/* is something in flight? */
1321 	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1322 		return false;
1323 
1324 	return true;
1325 }
1326 
1327 /* returns usage with margin added if surplus is large enough */
1328 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1329 {
1330 	/* add margin */
1331 	usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1332 	usage += SURPLUS_SCALE_ABS;
1333 
1334 	/* don't bother if the surplus is too small */
1335 	if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1336 		return 0;
1337 
1338 	return usage;
1339 }
1340 
1341 static void ioc_timer_fn(struct timer_list *timer)
1342 {
1343 	struct ioc *ioc = container_of(timer, struct ioc, timer);
1344 	struct ioc_gq *iocg, *tiocg;
1345 	struct ioc_now now;
1346 	int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1347 	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1348 	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1349 	u32 missed_ppm[2], rq_wait_pct;
1350 	u64 period_vtime;
1351 	int prev_busy_level, i;
1352 
1353 	/* how were the latencies during the period? */
1354 	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1355 
1356 	/* take care of active iocgs */
1357 	spin_lock_irq(&ioc->lock);
1358 
1359 	ioc_now(ioc, &now);
1360 
1361 	period_vtime = now.vnow - ioc->period_at_vtime;
1362 	if (WARN_ON_ONCE(!period_vtime)) {
1363 		spin_unlock_irq(&ioc->lock);
1364 		return;
1365 	}
1366 
1367 	/*
1368 	 * Waiters determine the sleep durations based on the vrate they
1369 	 * saw at the time of sleep.  If vrate has increased, some waiters
1370 	 * could be sleeping for too long.  Wake up tardy waiters which
1371 	 * should have woken up in the last period and expire idle iocgs.
1372 	 */
1373 	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1374 		if (!waitqueue_active(&iocg->waitq) &&
1375 		    !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
1376 			continue;
1377 
1378 		spin_lock(&iocg->waitq.lock);
1379 
1380 		if (waitqueue_active(&iocg->waitq) ||
1381 		    atomic64_read(&iocg->abs_vdebt)) {
1382 			/* might be oversleeping due to vtime / hweight changes, kick */
1383 			iocg_kick_waitq(iocg, &now);
1384 			iocg_kick_delay(iocg, &now, 0);
1385 		} else if (iocg_is_idle(iocg)) {
1386 			/* no waiter and idle, deactivate */
1387 			iocg->last_inuse = iocg->inuse;
1388 			__propagate_active_weight(iocg, 0, 0);
1389 			list_del_init(&iocg->active_list);
1390 		}
1391 
1392 		spin_unlock(&iocg->waitq.lock);
1393 	}
1394 	commit_active_weights(ioc);
1395 
1396 	/* calc usages and see whether some weights need to be moved around */
1397 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1398 		u64 vdone, vtime, vusage, vmargin, vmin;
1399 		u32 hw_active, hw_inuse, usage;
1400 
1401 		/*
1402 		 * Collect unused and wind vtime closer to vnow to prevent
1403 		 * iocgs from accumulating a large amount of budget.
1404 		 */
1405 		vdone = atomic64_read(&iocg->done_vtime);
1406 		vtime = atomic64_read(&iocg->vtime);
1407 		current_hweight(iocg, &hw_active, &hw_inuse);
1408 
1409 		/*
1410 		 * Latency QoS detection doesn't account for IOs which are
1411 		 * in-flight for longer than a period.  Detect them by
1412 		 * comparing vdone against period start.  If lagging behind
1413 		 * IOs from past periods, don't increase vrate.
1414 		 */
1415 		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1416 		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1417 		    time_after64(vtime, vdone) &&
1418 		    time_after64(vtime, now.vnow -
1419 				 MAX_LAGGING_PERIODS * period_vtime) &&
1420 		    time_before64(vdone, now.vnow - period_vtime))
1421 			nr_lagging++;
1422 
1423 		if (waitqueue_active(&iocg->waitq))
1424 			vusage = now.vnow - iocg->last_vtime;
1425 		else if (time_before64(iocg->last_vtime, vtime))
1426 			vusage = vtime - iocg->last_vtime;
1427 		else
1428 			vusage = 0;
1429 
1430 		iocg->last_vtime += vusage;
1431 		/*
1432 		 * Factor in in-flight vtime into vusage to avoid
1433 		 * high-latency completions appearing as idle.  This should
1434 		 * be done after the above ->last_vtime adjustment.
1435 		 */
1436 		vusage = max(vusage, vtime - vdone);
1437 
1438 		/* calculate hweight based usage ratio and record */
1439 		if (vusage) {
1440 			usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1441 						   period_vtime);
1442 			iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1443 			iocg->usages[iocg->usage_idx] = usage;
1444 		} else {
1445 			usage = 0;
1446 		}
1447 
1448 		/* see whether there's surplus vtime */
1449 		vmargin = ioc->margin_us * now.vrate;
1450 		vmin = now.vnow - vmargin;
1451 
1452 		iocg->has_surplus = false;
1453 
1454 		if (!waitqueue_active(&iocg->waitq) &&
1455 		    time_before64(vtime, vmin)) {
1456 			u64 delta = vmin - vtime;
1457 
1458 			/* throw away surplus vtime */
1459 			atomic64_add(delta, &iocg->vtime);
1460 			atomic64_add(delta, &iocg->done_vtime);
1461 			iocg->last_vtime += delta;
1462 			/* if usage is sufficiently low, maybe it can donate */
1463 			if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1464 				iocg->has_surplus = true;
1465 				nr_surpluses++;
1466 			}
1467 		} else if (hw_inuse < hw_active) {
1468 			u32 new_hwi, new_inuse;
1469 
1470 			/* was donating but might need to take back some */
1471 			if (waitqueue_active(&iocg->waitq)) {
1472 				new_hwi = hw_active;
1473 			} else {
1474 				new_hwi = max(hw_inuse,
1475 					      usage * SURPLUS_SCALE_PCT / 100 +
1476 					      SURPLUS_SCALE_ABS);
1477 			}
1478 
1479 			new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1480 					      hw_inuse);
1481 			new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1482 
1483 			if (new_inuse > iocg->inuse) {
1484 				TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1485 						iocg->inuse, new_inuse,
1486 						hw_inuse, new_hwi);
1487 				__propagate_active_weight(iocg, iocg->weight,
1488 							  new_inuse);
1489 			}
1490 		} else {
1491 			/* genuinely out of vtime */
1492 			nr_shortages++;
1493 		}
1494 	}
1495 
1496 	if (!nr_shortages || !nr_surpluses)
1497 		goto skip_surplus_transfers;
1498 
1499 	/* there are both shortages and surpluses, transfer surpluses */
1500 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1501 		u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1502 		int nr_valid = 0;
1503 
1504 		if (!iocg->has_surplus)
1505 			continue;
1506 
1507 		/* base the decision on max historical usage */
1508 		for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1509 			if (iocg->usages[i]) {
1510 				usage = max(usage, iocg->usages[i]);
1511 				nr_valid++;
1512 			}
1513 		}
1514 		if (nr_valid < MIN_VALID_USAGES)
1515 			continue;
1516 
1517 		current_hweight(iocg, &hw_active, &hw_inuse);
1518 		new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1519 		if (!new_hwi)
1520 			continue;
1521 
1522 		new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1523 					       hw_inuse);
1524 		if (new_inuse < iocg->inuse) {
1525 			TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1526 					iocg->inuse, new_inuse,
1527 					hw_inuse, new_hwi);
1528 			__propagate_active_weight(iocg, iocg->weight, new_inuse);
1529 		}
1530 	}
1531 skip_surplus_transfers:
1532 	commit_active_weights(ioc);
1533 
1534 	/*
1535 	 * If q is getting clogged or we're missing too much, we're issuing
1536 	 * too much IO and should lower vtime rate.  If we're not missing
1537 	 * and experiencing shortages but not surpluses, we're too stingy
1538 	 * and should increase vtime rate.
1539 	 */
1540 	prev_busy_level = ioc->busy_level;
1541 	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1542 	    missed_ppm[READ] > ppm_rthr ||
1543 	    missed_ppm[WRITE] > ppm_wthr) {
1544 		ioc->busy_level = max(ioc->busy_level, 0);
1545 		ioc->busy_level++;
1546 	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1547 		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1548 		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1549 		/* take action iff there is contention */
1550 		if (nr_shortages && !nr_lagging) {
1551 			ioc->busy_level = min(ioc->busy_level, 0);
1552 			/* redistribute surpluses first */
1553 			if (!nr_surpluses)
1554 				ioc->busy_level--;
1555 		}
1556 	} else {
1557 		ioc->busy_level = 0;
1558 	}
1559 
1560 	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1561 
1562 	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1563 		u64 vrate = atomic64_read(&ioc->vtime_rate);
1564 		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1565 
1566 		/* rq_wait signal is always reliable, ignore user vrate_min */
1567 		if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1568 			vrate_min = VRATE_MIN;
1569 
1570 		/*
1571 		 * If vrate is out of bounds, apply clamp gradually as the
1572 		 * bounds can change abruptly.  Otherwise, apply busy_level
1573 		 * based adjustment.
1574 		 */
1575 		if (vrate < vrate_min) {
1576 			vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1577 					  100);
1578 			vrate = min(vrate, vrate_min);
1579 		} else if (vrate > vrate_max) {
1580 			vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1581 					  100);
1582 			vrate = max(vrate, vrate_max);
1583 		} else {
1584 			int idx = min_t(int, abs(ioc->busy_level),
1585 					ARRAY_SIZE(vrate_adj_pct) - 1);
1586 			u32 adj_pct = vrate_adj_pct[idx];
1587 
1588 			if (ioc->busy_level > 0)
1589 				adj_pct = 100 - adj_pct;
1590 			else
1591 				adj_pct = 100 + adj_pct;
1592 
1593 			vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1594 				      vrate_min, vrate_max);
1595 		}
1596 
1597 		trace_iocost_ioc_vrate_adj(ioc, vrate, &missed_ppm, rq_wait_pct,
1598 					   nr_lagging, nr_shortages,
1599 					   nr_surpluses);
1600 
1601 		atomic64_set(&ioc->vtime_rate, vrate);
1602 		ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1603 			ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1604 	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1605 		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1606 					   &missed_ppm, rq_wait_pct, nr_lagging,
1607 					   nr_shortages, nr_surpluses);
1608 	}
1609 
1610 	ioc_refresh_params(ioc, false);
1611 
1612 	/*
1613 	 * This period is done.  Move onto the next one.  If nothing's
1614 	 * going on with the device, stop the timer.
1615 	 */
1616 	atomic64_inc(&ioc->cur_period);
1617 
1618 	if (ioc->running != IOC_STOP) {
1619 		if (!list_empty(&ioc->active_iocgs)) {
1620 			ioc_start_period(ioc, &now);
1621 		} else {
1622 			ioc->busy_level = 0;
1623 			ioc->running = IOC_IDLE;
1624 		}
1625 	}
1626 
1627 	spin_unlock_irq(&ioc->lock);
1628 }
1629 
1630 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1631 				    bool is_merge, u64 *costp)
1632 {
1633 	struct ioc *ioc = iocg->ioc;
1634 	u64 coef_seqio, coef_randio, coef_page;
1635 	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1636 	u64 seek_pages = 0;
1637 	u64 cost = 0;
1638 
1639 	switch (bio_op(bio)) {
1640 	case REQ_OP_READ:
1641 		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
1642 		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
1643 		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
1644 		break;
1645 	case REQ_OP_WRITE:
1646 		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
1647 		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
1648 		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
1649 		break;
1650 	default:
1651 		goto out;
1652 	}
1653 
1654 	if (iocg->cursor) {
1655 		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1656 		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1657 	}
1658 
1659 	if (!is_merge) {
1660 		if (seek_pages > LCOEF_RANDIO_PAGES) {
1661 			cost += coef_randio;
1662 		} else {
1663 			cost += coef_seqio;
1664 		}
1665 	}
1666 	cost += pages * coef_page;
1667 out:
1668 	*costp = cost;
1669 }
1670 
1671 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1672 {
1673 	u64 cost;
1674 
1675 	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1676 	return cost;
1677 }
1678 
1679 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1680 {
1681 	struct blkcg_gq *blkg = bio->bi_blkg;
1682 	struct ioc *ioc = rqos_to_ioc(rqos);
1683 	struct ioc_gq *iocg = blkg_to_iocg(blkg);
1684 	struct ioc_now now;
1685 	struct iocg_wait wait;
1686 	u32 hw_active, hw_inuse;
1687 	u64 abs_cost, cost, vtime;
1688 
1689 	/* bypass IOs if disabled or for root cgroup */
1690 	if (!ioc->enabled || !iocg->level)
1691 		return;
1692 
1693 	/* always activate so that even 0 cost IOs get protected to some level */
1694 	if (!iocg_activate(iocg, &now))
1695 		return;
1696 
1697 	/* calculate the absolute vtime cost */
1698 	abs_cost = calc_vtime_cost(bio, iocg, false);
1699 	if (!abs_cost)
1700 		return;
1701 
1702 	iocg->cursor = bio_end_sector(bio);
1703 
1704 	vtime = atomic64_read(&iocg->vtime);
1705 	current_hweight(iocg, &hw_active, &hw_inuse);
1706 
1707 	if (hw_inuse < hw_active &&
1708 	    time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1709 		TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1710 				iocg->inuse, iocg->weight, hw_inuse, hw_active);
1711 		spin_lock_irq(&ioc->lock);
1712 		propagate_active_weight(iocg, iocg->weight, iocg->weight);
1713 		spin_unlock_irq(&ioc->lock);
1714 		current_hweight(iocg, &hw_active, &hw_inuse);
1715 	}
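	/*
	 * Illustrative note: the reset above matters when earlier surplus
	 * donation has shrunk this cgroup's hweight_inuse below its
	 * hweight_active.  Restoring inuse to the configured weight while
	 * the cgroup is within inuse_margin_vtime of running out of budget
	 * keeps it from being throttled on a stale donation.
	 */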
1716 
1717 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
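	/*
	 * Roughly speaking, abs_cost_to_cost() scales the absolute
	 * device-time cost by the inverse of hw_inuse: a cgroup whose
	 * hweight_inuse is half of HWEIGHT_WHOLE is charged about twice
	 * abs_cost in its local vtime, so its budget drains in proportion
	 * to its hierarchical share.
	 */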
1718 
1719 	/*
1720 	 * If no one's waiting and within budget, issue right away.  The
1721 	 * tests are racy but the races aren't systemic - we only miss once
1722 	 * in a while which is fine.
1723 	 * in a while, which is fine.
1724 	if (!waitqueue_active(&iocg->waitq) &&
1725 	    !atomic64_read(&iocg->abs_vdebt) &&
1726 	    time_before_eq64(vtime + cost, now.vnow)) {
1727 		iocg_commit_bio(iocg, bio, cost);
1728 		return;
1729 	}
1730 
1731 	/*
1732 	 * We're over budget.  If @bio has to be issued regardless,
1733 	 * remember the abs_cost instead of advancing vtime.
1734 	 * iocg_kick_waitq() will pay off the debt before waking more IOs.
1735 	 * This way, the debt is continuously paid off each period with the
1736 	 * actual budget available to the cgroup.  If we just wound vtime,
1737 	 * we would incorrectly use the current hw_inuse for the entire
1738 	 * amount which, for example, can lead to the cgroup staying
1739 	 * blocked for a long time even with substantially raised hw_inuse.
1740 	 */
1741 	if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1742 		atomic64_add(abs_cost, &iocg->abs_vdebt);
1743 		if (iocg_kick_delay(iocg, &now, cost))
1744 			blkcg_schedule_throttle(rqos->q,
1745 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1746 		return;
1747 	}
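	/*
	 * For illustration: a swap-out bio (REQ_SWAP) or one issued by a
	 * task with a fatal signal pending is never put to sleep here.
	 * Its abs_cost is accumulated as abs_vdebt instead and, if
	 * iocg_kick_delay() judges the cgroup to be over budget, the
	 * issuer is asked to throttle itself via blkcg_schedule_throttle().
	 */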
1748 
1749 	/*
1750 	 * Append self to the waitq and schedule the wakeup timer if we're
1751 	 * the first waiter.  The timer duration is calculated based on the
1752 	 * current vrate.  vtime and hweight changes can make it too short
1753 	 * or too long.  Each wait entry records the absolute cost it's
1754 	 * waiting for to allow re-evaluation using a custom wait entry.
1755 	 *
1756 	 * If too short, the timer simply reschedules itself.  If too long,
1757 	 * the period timer will notice and trigger wakeups.
1758 	 *
1759 	 * All waiters are on iocg->waitq and the wait states are
1760 	 * synchronized using waitq.lock.
1761 	 */
1762 	spin_lock_irq(&iocg->waitq.lock);
1763 
1764 	/*
1765 	 * We activated above but w/o any synchronization.  Deactivation is
1766 	 * synchronized with waitq.lock and we won't get deactivated as
1767 	 * long as we're waiting, so we're good if we're activated here.
1768 	 * In the unlikely case that we are deactivated, just issue the IO.
1769 	 */
1770 	if (unlikely(list_empty(&iocg->active_list))) {
1771 		spin_unlock_irq(&iocg->waitq.lock);
1772 		iocg_commit_bio(iocg, bio, cost);
1773 		return;
1774 	}
1775 
1776 	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1777 	wait.wait.private = current;
1778 	wait.bio = bio;
1779 	wait.abs_cost = abs_cost;
1780 	wait.committed = false;	/* will be set true by waker */
1781 
1782 	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1783 	iocg_kick_waitq(iocg, &now);
1784 
1785 	spin_unlock_irq(&iocg->waitq.lock);
1786 
1787 	while (true) {
1788 		set_current_state(TASK_UNINTERRUPTIBLE);
1789 		if (wait.committed)
1790 			break;
1791 		io_schedule();
1792 	}
1793 
1794 	/* waker already committed us, proceed */
1795 	finish_wait(&iocg->waitq, &wait.wait);
1796 }
1797 
1798 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1799 			   struct bio *bio)
1800 {
1801 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1802 	struct ioc *ioc = iocg->ioc;
1803 	sector_t bio_end = bio_end_sector(bio);
1804 	struct ioc_now now;
1805 	u32 hw_inuse;
1806 	u64 abs_cost, cost;
1807 
1808 	/* bypass if disabled or for root cgroup */
1809 	if (!ioc->enabled || !iocg->level)
1810 		return;
1811 
1812 	abs_cost = calc_vtime_cost(bio, iocg, true);
1813 	if (!abs_cost)
1814 		return;
1815 
1816 	ioc_now(ioc, &now);
1817 	current_hweight(iocg, NULL, &hw_inuse);
1818 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1819 
1820 	/* update cursor if backmerging into the request at the cursor */
1821 	if (blk_rq_pos(rq) < bio_end &&
1822 	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1823 		iocg->cursor = bio_end;
1824 
1825 	/*
1826 	 * Charge if there's enough vtime budget and the existing request
1827 	 * has cost assigned.  Otherwise, account it as debt.  See debt
1828 	 * handling in ioc_rqos_throttle() for details.
1829 	 */
1830 	if (rq->bio && rq->bio->bi_iocost_cost &&
1831 	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow))
1832 		iocg_commit_bio(iocg, bio, cost);
1833 	else
1834 		atomic64_add(abs_cost, &iocg->abs_vdebt);
1835 }
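
/*
 * Cursor example for the back-merge case above: if the request at the
 * cursor ends at sector 2048 and an 8-sector (4KB) bio covering sectors
 * 2048-2055 is merged onto its tail, the cursor advances to 2056, so the
 * next bio from this cgroup is still classified relative to the merged
 * end and a long sequential stream keeps its cheaper sequential cost.
 */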
1836 
1837 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1838 {
1839 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1840 
1841 	if (iocg && bio->bi_iocost_cost)
1842 		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1843 }
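
/*
 * done_vtime accumulates the charged cost of bios as they complete.  The
 * gap between iocg->vtime (charged at issue time) and done_vtime is the
 * charged work still in flight, which the period timer consults when
 * judging whether a cgroup's completions are lagging (nr_lagging).
 */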
1844 
1845 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1846 {
1847 	struct ioc *ioc = rqos_to_ioc(rqos);
1848 	u64 on_q_ns, rq_wait_ns;
1849 	int pidx, rw;
1850 
1851 	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1852 		return;
1853 
1854 	switch (req_op(rq) & REQ_OP_MASK) {
1855 	case REQ_OP_READ:
1856 		pidx = QOS_RLAT;
1857 		rw = READ;
1858 		break;
1859 	case REQ_OP_WRITE:
1860 		pidx = QOS_WLAT;
1861 		rw = WRITE;
1862 		break;
1863 	default:
1864 		return;
1865 	}
1866 
1867 	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1868 	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1869 
1870 	if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1871 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1872 	else
1873 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1874 
1875 	this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1876 }
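
/*
 * Example with hypothetical QoS parameters: if qos[QOS_RLAT] is 5000
 * (5ms), a read that completes 3ms after allocation bumps
 * missed[READ].nr_met while one taking 8ms bumps nr_missed; rq_wait_ns
 * separately sums how long the request allocation itself had to wait.
 * The period timer folds these per-cpu counters into missed_ppm and
 * rq_wait_pct when computing busy_level.
 */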
1877 
1878 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1879 {
1880 	struct ioc *ioc = rqos_to_ioc(rqos);
1881 
1882 	spin_lock_irq(&ioc->lock);
1883 	ioc_refresh_params(ioc, false);
1884 	spin_unlock_irq(&ioc->lock);
1885 }
1886 
1887 static void ioc_rqos_exit(struct rq_qos *rqos)
1888 {
1889 	struct ioc *ioc = rqos_to_ioc(rqos);
1890 
1891 	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1892 
1893 	spin_lock_irq(&ioc->lock);
1894 	ioc->running = IOC_STOP;
1895 	spin_unlock_irq(&ioc->lock);
1896 
1897 	del_timer_sync(&ioc->timer);
1898 	free_percpu(ioc->pcpu_stat);
1899 	kfree(ioc);
1900 }
1901 
1902 static struct rq_qos_ops ioc_rqos_ops = {
1903 	.throttle = ioc_rqos_throttle,
1904 	.merge = ioc_rqos_merge,
1905 	.done_bio = ioc_rqos_done_bio,
1906 	.done = ioc_rqos_done,
1907 	.queue_depth_changed = ioc_rqos_queue_depth_changed,
1908 	.exit = ioc_rqos_exit,
1909 };
1910 
1911 static int blk_iocost_init(struct request_queue *q)
1912 {
1913 	struct ioc *ioc;
1914 	struct rq_qos *rqos;
1915 	int ret;
1916 
1917 	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1918 	if (!ioc)
1919 		return -ENOMEM;
1920 
1921 	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1922 	if (!ioc->pcpu_stat) {
1923 		kfree(ioc);
1924 		return -ENOMEM;
1925 	}
1926 
1927 	rqos = &ioc->rqos;
1928 	rqos->id = RQ_QOS_COST;
1929 	rqos->ops = &ioc_rqos_ops;
1930 	rqos->q = q;
1931 
1932 	spin_lock_init(&ioc->lock);
1933 	timer_setup(&ioc->timer, ioc_timer_fn, 0);
1934 	INIT_LIST_HEAD(&ioc->active_iocgs);
1935 
1936 	ioc->running = IOC_IDLE;
1937 	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1938 	seqcount_init(&ioc->period_seqcount);
1939 	ioc->period_at = ktime_to_us(ktime_get());
1940 	atomic64_set(&ioc->cur_period, 0);
1941 	atomic_set(&ioc->hweight_gen, 0);
1942 
1943 	spin_lock_irq(&ioc->lock);
1944 	ioc->autop_idx = AUTOP_INVALID;
1945 	ioc_refresh_params(ioc, true);
1946 	spin_unlock_irq(&ioc->lock);
1947 
1948 	rq_qos_add(q, rqos);
1949 	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
1950 	if (ret) {
1951 		rq_qos_del(q, rqos);
1952 		free_percpu(ioc->pcpu_stat);
1953 		kfree(ioc);
1954 		return ret;
1955 	}
1956 	return 0;
1957 }
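
/*
 * Note that there is no unconditional init path: blk_iocost_init() is
 * only reached from the first write to the cost.qos or cost.model files
 * (see ioc_qos_write() and ioc_cost_model_write() below), and vtime_rate
 * starts at VTIME_PER_USEC, i.e. device vtime advancing 1:1 with
 * wallclock time, until the period timer starts adjusting it.
 */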
1958 
1959 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
1960 {
1961 	struct ioc_cgrp *iocc;
1962 
1963 	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
1964 	if (!iocc)
1965 		return NULL;
1966 
1967 	iocc->dfl_weight = CGROUP_WEIGHT_DFL;
1968 	return &iocc->cpd;
1969 }
1970 
1971 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
1972 {
1973 	kfree(container_of(cpd, struct ioc_cgrp, cpd));
1974 }
1975 
1976 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
1977 					     struct blkcg *blkcg)
1978 {
1979 	int levels = blkcg->css.cgroup->level + 1;
1980 	struct ioc_gq *iocg;
1981 
1982 	iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
1983 			    gfp, q->node);
1984 	if (!iocg)
1985 		return NULL;
1986 
1987 	return &iocg->pd;
1988 }
1989 
1990 static void ioc_pd_init(struct blkg_policy_data *pd)
1991 {
1992 	struct ioc_gq *iocg = pd_to_iocg(pd);
1993 	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
1994 	struct ioc *ioc = q_to_ioc(blkg->q);
1995 	struct ioc_now now;
1996 	struct blkcg_gq *tblkg;
1997 	unsigned long flags;
1998 
1999 	ioc_now(ioc, &now);
2000 
2001 	iocg->ioc = ioc;
2002 	atomic64_set(&iocg->vtime, now.vnow);
2003 	atomic64_set(&iocg->done_vtime, now.vnow);
2004 	atomic64_set(&iocg->abs_vdebt, 0);
2005 	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2006 	INIT_LIST_HEAD(&iocg->active_list);
2007 	iocg->hweight_active = HWEIGHT_WHOLE;
2008 	iocg->hweight_inuse = HWEIGHT_WHOLE;
2009 
2010 	init_waitqueue_head(&iocg->waitq);
2011 	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2012 	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2013 	hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2014 	iocg->delay_timer.function = iocg_delay_timer_fn;
2015 
2016 	iocg->level = blkg->blkcg->css.cgroup->level;
2017 
2018 	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2019 		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2020 		iocg->ancestors[tiocg->level] = tiocg;
2021 	}
2022 
2023 	spin_lock_irqsave(&ioc->lock, flags);
2024 	weight_updated(iocg);
2025 	spin_unlock_irqrestore(&ioc->lock, flags);
2026 }
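
/*
 * For example, an iocg for a cgroup two levels below the root (say a
 * hypothetical /sys/fs/cgroup/a/b) ends up with ancestors[] holding the
 * root iocg, a's iocg and its own, indexed by cgroup level; this is the
 * array the hierarchical weight calculation (current_hweight()) walks.
 */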
2027 
2028 static void ioc_pd_free(struct blkg_policy_data *pd)
2029 {
2030 	struct ioc_gq *iocg = pd_to_iocg(pd);
2031 	struct ioc *ioc = iocg->ioc;
2032 
2033 	if (ioc) {
2034 		spin_lock(&ioc->lock);
2035 		if (!list_empty(&iocg->active_list)) {
2036 			propagate_active_weight(iocg, 0, 0);
2037 			list_del_init(&iocg->active_list);
2038 		}
2039 		spin_unlock(&ioc->lock);
2040 
2041 		hrtimer_cancel(&iocg->waitq_timer);
2042 		hrtimer_cancel(&iocg->delay_timer);
2043 	}
2044 	kfree(iocg);
2045 }
2046 
2047 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2048 			     int off)
2049 {
2050 	const char *dname = blkg_dev_name(pd->blkg);
2051 	struct ioc_gq *iocg = pd_to_iocg(pd);
2052 
2053 	if (dname && iocg->cfg_weight)
2054 		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2055 	return 0;
2056 }
2057 
2058 
2059 static int ioc_weight_show(struct seq_file *sf, void *v)
2060 {
2061 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2062 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2063 
2064 	seq_printf(sf, "default %u\n", iocc->dfl_weight);
2065 	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2066 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2067 	return 0;
2068 }
2069 
2070 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2071 				size_t nbytes, loff_t off)
2072 {
2073 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
2074 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2075 	struct blkg_conf_ctx ctx;
2076 	struct ioc_gq *iocg;
2077 	u32 v;
2078 	int ret;
2079 
2080 	if (!strchr(buf, ':')) {
2081 		struct blkcg_gq *blkg;
2082 
2083 		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2084 			return -EINVAL;
2085 
2086 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2087 			return -EINVAL;
2088 
2089 		spin_lock(&blkcg->lock);
2090 		iocc->dfl_weight = v;
2091 		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2092 			struct ioc_gq *iocg = blkg_to_iocg(blkg);
2093 
2094 			if (iocg) {
2095 				spin_lock_irq(&iocg->ioc->lock);
2096 				weight_updated(iocg);
2097 				spin_unlock_irq(&iocg->ioc->lock);
2098 			}
2099 		}
2100 		spin_unlock(&blkcg->lock);
2101 
2102 		return nbytes;
2103 	}
2104 
2105 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2106 	if (ret)
2107 		return ret;
2108 
2109 	iocg = blkg_to_iocg(ctx.blkg);
2110 
2111 	if (!strncmp(ctx.body, "default", 7)) {
2112 		v = 0;
2113 	} else {
2114 		if (!sscanf(ctx.body, "%u", &v))
2115 			goto einval;
2116 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2117 			goto einval;
2118 	}
2119 
2120 	spin_lock(&iocg->ioc->lock);
2121 	iocg->cfg_weight = v;
2122 	weight_updated(iocg);
2123 	spin_unlock(&iocg->ioc->lock);
2124 
2125 	blkg_conf_finish(&ctx);
2126 	return nbytes;
2127 
2128 einval:
2129 	blkg_conf_finish(&ctx);
2130 	return -EINVAL;
2131 }
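
/*
 * Accepted syntaxes, per the parsing above (the device numbers are
 * hypothetical): writing "default 100" or just "100" updates this
 * cgroup's default weight, "8:16 200" sets a per-device weight, and
 * "8:16 default" clears the per-device setting so the default applies
 * again.  Weights must fall within [CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX].
 */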
2132 
2133 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2134 			  int off)
2135 {
2136 	const char *dname = blkg_dev_name(pd->blkg);
2137 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2138 
2139 	if (!dname)
2140 		return 0;
2141 
2142 	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2143 		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2144 		   ioc->params.qos[QOS_RPPM] / 10000,
2145 		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
2146 		   ioc->params.qos[QOS_RLAT],
2147 		   ioc->params.qos[QOS_WPPM] / 10000,
2148 		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
2149 		   ioc->params.qos[QOS_WLAT],
2150 		   ioc->params.qos[QOS_MIN] / 10000,
2151 		   ioc->params.qos[QOS_MIN] % 10000 / 100,
2152 		   ioc->params.qos[QOS_MAX] / 10000,
2153 		   ioc->params.qos[QOS_MAX] % 10000 / 100);
2154 	return 0;
2155 }
2156 
2157 static int ioc_qos_show(struct seq_file *sf, void *v)
2158 {
2159 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2160 
2161 	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2162 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2163 	return 0;
2164 }
2165 
2166 static const match_table_t qos_ctrl_tokens = {
2167 	{ QOS_ENABLE,		"enable=%u"	},
2168 	{ QOS_CTRL,		"ctrl=%s"	},
2169 	{ NR_QOS_CTRL_PARAMS,	NULL		},
2170 };
2171 
2172 static const match_table_t qos_tokens = {
2173 	{ QOS_RPPM,		"rpct=%s"	},
2174 	{ QOS_RLAT,		"rlat=%u"	},
2175 	{ QOS_WPPM,		"wpct=%s"	},
2176 	{ QOS_WLAT,		"wlat=%u"	},
2177 	{ QOS_MIN,		"min=%s"	},
2178 	{ QOS_MAX,		"max=%s"	},
2179 	{ NR_QOS_PARAMS,	NULL		},
2180 };
2181 
2182 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2183 			     size_t nbytes, loff_t off)
2184 {
2185 	struct gendisk *disk;
2186 	struct ioc *ioc;
2187 	u32 qos[NR_QOS_PARAMS];
2188 	bool enable, user;
2189 	char *p;
2190 	int ret;
2191 
2192 	disk = blkcg_conf_get_disk(&input);
2193 	if (IS_ERR(disk))
2194 		return PTR_ERR(disk);
2195 
2196 	ioc = q_to_ioc(disk->queue);
2197 	if (!ioc) {
2198 		ret = blk_iocost_init(disk->queue);
2199 		if (ret)
2200 			goto err;
2201 		ioc = q_to_ioc(disk->queue);
2202 	}
2203 
2204 	spin_lock_irq(&ioc->lock);
2205 	memcpy(qos, ioc->params.qos, sizeof(qos));
2206 	enable = ioc->enabled;
2207 	user = ioc->user_qos_params;
2208 	spin_unlock_irq(&ioc->lock);
2209 
2210 	while ((p = strsep(&input, " \t\n"))) {
2211 		substring_t args[MAX_OPT_ARGS];
2212 		char buf[32];
2213 		int tok;
2214 		s64 v;
2215 
2216 		if (!*p)
2217 			continue;
2218 
2219 		switch (match_token(p, qos_ctrl_tokens, args)) {
2220 		case QOS_ENABLE:
2221 			match_u64(&args[0], &v);
2222 			enable = v;
2223 			continue;
2224 		case QOS_CTRL:
2225 			match_strlcpy(buf, &args[0], sizeof(buf));
2226 			if (!strcmp(buf, "auto"))
2227 				user = false;
2228 			else if (!strcmp(buf, "user"))
2229 				user = true;
2230 			else
2231 				goto einval;
2232 			continue;
2233 		}
2234 
2235 		tok = match_token(p, qos_tokens, args);
2236 		switch (tok) {
2237 		case QOS_RPPM:
2238 		case QOS_WPPM:
2239 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2240 			    sizeof(buf))
2241 				goto einval;
2242 			if (cgroup_parse_float(buf, 2, &v))
2243 				goto einval;
2244 			if (v < 0 || v > 10000)
2245 				goto einval;
2246 			qos[tok] = v * 100;
2247 			break;
2248 		case QOS_RLAT:
2249 		case QOS_WLAT:
2250 			if (match_u64(&args[0], &v))
2251 				goto einval;
2252 			qos[tok] = v;
2253 			break;
2254 		case QOS_MIN:
2255 		case QOS_MAX:
2256 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2257 			    sizeof(buf))
2258 				goto einval;
2259 			if (cgroup_parse_float(buf, 2, &v))
2260 				goto einval;
2261 			if (v < 0)
2262 				goto einval;
2263 			qos[tok] = clamp_t(s64, v * 100,
2264 					   VRATE_MIN_PPM, VRATE_MAX_PPM);
2265 			break;
2266 		default:
2267 			goto einval;
2268 		}
2269 		user = true;
2270 	}
2271 
2272 	if (qos[QOS_MIN] > qos[QOS_MAX])
2273 		goto einval;
2274 
2275 	spin_lock_irq(&ioc->lock);
2276 
2277 	if (enable) {
2278 		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2279 		ioc->enabled = true;
2280 	} else {
2281 		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2282 		ioc->enabled = false;
2283 	}
2284 
2285 	if (user) {
2286 		memcpy(ioc->params.qos, qos, sizeof(qos));
2287 		ioc->user_qos_params = true;
2288 	} else {
2289 		ioc->user_qos_params = false;
2290 	}
2291 
2292 	ioc_refresh_params(ioc, true);
2293 	spin_unlock_irq(&ioc->lock);
2294 
2295 	put_disk_and_module(disk);
2296 	return nbytes;
2297 einval:
2298 	ret = -EINVAL;
2299 err:
2300 	put_disk_and_module(disk);
2301 	return ret;
2302 }
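
/*
 * Example write, following the token tables above (device and values are
 * hypothetical):
 *
 *   echo "8:16 enable=1 rpct=95.00 rlat=5000 wpct=95.00 wlat=10000" \
 *       > io.cost.qos
 *
 * enables the controller on 8:16 with 95th percentile read/write latency
 * targets of 5ms and 10ms.  Setting any explicit QoS parameter also
 * switches ctrl to "user"; writing "ctrl=auto" reverts to the
 * automatically chosen parameters.
 */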
2303 
2304 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2305 				 struct blkg_policy_data *pd, int off)
2306 {
2307 	const char *dname = blkg_dev_name(pd->blkg);
2308 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2309 	u64 *u = ioc->params.i_lcoefs;
2310 
2311 	if (!dname)
2312 		return 0;
2313 
2314 	seq_printf(sf, "%s ctrl=%s model=linear "
2315 		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
2316 		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2317 		   dname, ioc->user_cost_model ? "user" : "auto",
2318 		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2319 		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2320 	return 0;
2321 }
2322 
2323 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2324 {
2325 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2326 
2327 	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2328 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2329 	return 0;
2330 }
2331 
2332 static const match_table_t cost_ctrl_tokens = {
2333 	{ COST_CTRL,		"ctrl=%s"	},
2334 	{ COST_MODEL,		"model=%s"	},
2335 	{ NR_COST_CTRL_PARAMS,	NULL		},
2336 };
2337 
2338 static const match_table_t i_lcoef_tokens = {
2339 	{ I_LCOEF_RBPS,		"rbps=%u"	},
2340 	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
2341 	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
2342 	{ I_LCOEF_WBPS,		"wbps=%u"	},
2343 	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
2344 	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
2345 	{ NR_I_LCOEFS,		NULL		},
2346 };
2347 
2348 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2349 				    size_t nbytes, loff_t off)
2350 {
2351 	struct gendisk *disk;
2352 	struct ioc *ioc;
2353 	u64 u[NR_I_LCOEFS];
2354 	bool user;
2355 	char *p;
2356 	int ret;
2357 
2358 	disk = blkcg_conf_get_disk(&input);
2359 	if (IS_ERR(disk))
2360 		return PTR_ERR(disk);
2361 
2362 	ioc = q_to_ioc(disk->queue);
2363 	if (!ioc) {
2364 		ret = blk_iocost_init(disk->queue);
2365 		if (ret)
2366 			goto err;
2367 		ioc = q_to_ioc(disk->queue);
2368 	}
2369 
2370 	spin_lock_irq(&ioc->lock);
2371 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2372 	user = ioc->user_cost_model;
2373 	spin_unlock_irq(&ioc->lock);
2374 
2375 	while ((p = strsep(&input, " \t\n"))) {
2376 		substring_t args[MAX_OPT_ARGS];
2377 		char buf[32];
2378 		int tok;
2379 		u64 v;
2380 
2381 		if (!*p)
2382 			continue;
2383 
2384 		switch (match_token(p, cost_ctrl_tokens, args)) {
2385 		case COST_CTRL:
2386 			match_strlcpy(buf, &args[0], sizeof(buf));
2387 			if (!strcmp(buf, "auto"))
2388 				user = false;
2389 			else if (!strcmp(buf, "user"))
2390 				user = true;
2391 			else
2392 				goto einval;
2393 			continue;
2394 		case COST_MODEL:
2395 			match_strlcpy(buf, &args[0], sizeof(buf));
2396 			if (strcmp(buf, "linear"))
2397 				goto einval;
2398 			continue;
2399 		}
2400 
2401 		tok = match_token(p, i_lcoef_tokens, args);
2402 		if (tok == NR_I_LCOEFS)
2403 			goto einval;
2404 		if (match_u64(&args[0], &v))
2405 			goto einval;
2406 		u[tok] = v;
2407 		user = true;
2408 	}
2409 
2410 	spin_lock_irq(&ioc->lock);
2411 	if (user) {
2412 		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2413 		ioc->user_cost_model = true;
2414 	} else {
2415 		ioc->user_cost_model = false;
2416 	}
2417 	ioc_refresh_params(ioc, true);
2418 	spin_unlock_irq(&ioc->lock);
2419 
2420 	put_disk_and_module(disk);
2421 	return nbytes;
2422 
2423 einval:
2424 	ret = -EINVAL;
2425 err:
2426 	put_disk_and_module(disk);
2427 	return ret;
2428 }
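
/*
 * Example write, again with hypothetical values:
 *
 *   echo "8:16 ctrl=user model=linear rbps=500000000 rseqiops=10000" \
 *       "rrandiops=5000 wbps=400000000 wseqiops=8000 wrandiops=4000" \
 *       > io.cost.model
 *
 * replaces the automatically selected linear coefficients for 8:16;
 * writing "ctrl=auto" drops back to the builtin defaults for the
 * device's class.
 */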
2429 
2430 static struct cftype ioc_files[] = {
2431 	{
2432 		.name = "weight",
2433 		.flags = CFTYPE_NOT_ON_ROOT,
2434 		.seq_show = ioc_weight_show,
2435 		.write = ioc_weight_write,
2436 	},
2437 	{
2438 		.name = "cost.qos",
2439 		.flags = CFTYPE_ONLY_ON_ROOT,
2440 		.seq_show = ioc_qos_show,
2441 		.write = ioc_qos_write,
2442 	},
2443 	{
2444 		.name = "cost.model",
2445 		.flags = CFTYPE_ONLY_ON_ROOT,
2446 		.seq_show = ioc_cost_model_show,
2447 		.write = ioc_cost_model_write,
2448 	},
2449 	{}
2450 };
2451 
2452 static struct blkcg_policy blkcg_policy_iocost = {
2453 	.dfl_cftypes	= ioc_files,
2454 	.cpd_alloc_fn	= ioc_cpd_alloc,
2455 	.cpd_free_fn	= ioc_cpd_free,
2456 	.pd_alloc_fn	= ioc_pd_alloc,
2457 	.pd_init_fn	= ioc_pd_init,
2458 	.pd_free_fn	= ioc_pd_free,
2459 };
2460 
2461 static int __init ioc_init(void)
2462 {
2463 	return blkcg_policy_register(&blkcg_policy_iocost);
2464 }
2465 
2466 static void __exit ioc_exit(void)
2467 {
2468 	return blkcg_policy_unregister(&blkcg_policy_iocost);
2469 }
2470 
2471 module_init(ioc_init);
2472 module_exit(ioc_exit);
2473