xref: /openbmc/linux/block/blk-iocost.c (revision adb57164)
1 /* SPDX-License-Identifier: GPL-2.0
2  *
3  * IO cost model based controller.
4  *
5  * Copyright (C) 2019 Tejun Heo <tj@kernel.org>
6  * Copyright (C) 2019 Andy Newell <newella@fb.com>
7  * Copyright (C) 2019 Facebook
8  *
9  * One challenge of controlling IO resources is the lack of trivially
10  * observable cost metric.  This is distinguished from CPU and memory where
11  * wallclock time and the number of bytes can serve as accurate enough
12  * approximations.
13  *
14  * Bandwidth and iops are the most commonly used metrics for IO devices but
15  * depending on the type and specifics of the device, different IO patterns
16  * easily lead to multiple orders of magnitude variations rendering them
17  * useless for the purpose of IO capacity distribution.  While on-device
18  * time, with a lot of crutches, could serve as a useful approximation for
19  * non-queued rotational devices, this is no longer viable with modern
20  * devices, even the rotational ones.
21  *
22  * While there is no cost metric we can trivially observe, it isn't a
23  * complete mystery.  For example, on a rotational device, seek cost
24  * dominates while a contiguous transfer contributes a smaller amount
25  * proportional to the size.  If we can characterize at least the relative
26  * costs of these different types of IOs, it should be possible to
27  * implement a reasonable work-conserving proportional IO resource
28  * distribution.
29  *
30  * 1. IO Cost Model
31  *
32  * IO cost model estimates the cost of an IO given its basic parameters and
33  * history (e.g. the end sector of the last IO).  The cost is measured in
34  * device time.  If a given IO is estimated to cost 10ms, the device should
35  * be able to process ~100 of those IOs in a second.
36  *
37  * Currently, there's only one builtin cost model - linear.  Each IO is
38  * classified as sequential or random and given a base cost accordingly.
39  * On top of that, a size cost proportional to the length of the IO is
40  * added.  While simple, this model captures the operational
41  * characteristics of a wide variety of devices well enough.  Default
42  * parameters for several different classes of devices are provided and the
43  * parameters can be configured from userspace via
44  * /sys/fs/cgroup/io.cost.model.
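 *
 * As a hypothetical illustration (coefficients made up for this example):
 * if a random 4k read carries a base cost of 1ms and the per-page cost is
 * 0.1ms, a 64k (16 page) random read is estimated at 1ms + 16 * 0.1ms =
 * 2.6ms, i.e. the device is expected to sustain roughly 385 such IOs per
 * second.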
45  *
46  * If needed, tools/cgroup/iocost_coef_gen.py can be used to generate
47  * device-specific coefficients.
48  *
49  * 2. Control Strategy
50  *
51  * The device virtual time (vtime) is used as the primary control metric.
52  * The control strategy is composed of the following three parts.
53  *
54  * 2-1. Vtime Distribution
55  *
56  * When a cgroup becomes active in terms of IOs, its hierarchical share is
57  * calculated.  Please consider the following hierarchy where the numbers
58  * inside parentheses denote the configured weights.
59  *
60  *           root
61  *         /       \
62  *      A (w:100)  B (w:300)
63  *      /       \
64  *  A0 (w:100)  A1 (w:100)
65  *
66  * If B is idle and only A0 and A1 are actively issuing IOs, as the two are
67  * of equal weight, each gets 50% share.  If then B starts issuing IOs, B
68  * gets 300/(100+300) or 75% share, and A0 and A1 equally splits the rest,
69  * 12.5% each.  The distribution mechanism only cares about these flattened
70  * shares.  They're called hweights (hierarchical weights) and always add
71  * up to 1 (HWEIGHT_WHOLE).
72  *
73  * A given cgroup's vtime runs slower in inverse proportion to its hweight.
74  * For example, with 12.5% weight, A0's time runs 8 times slower (100/12.5)
75  * against the device vtime - an IO which takes 10ms on the underlying
76  * device is considered to take 80ms on A0.
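 *
 * In implementation terms this is the same scaling that abs_cost_to_cost()
 * below applies per IO: with an hweight of 12.5% (HWEIGHT_WHOLE / 8), an
 * IO whose absolute cost is 10ms worth of vtime gets charged
 * 10ms * HWEIGHT_WHOLE / (HWEIGHT_WHOLE / 8) = 80ms against A0's budget.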
77  *
78  * This constitutes the basis of IO capacity distribution.  Each cgroup's
79  * vtime is running at a rate determined by its hweight.  A cgroup tracks
80  * the vtime consumed by past IOs and can issue a new IO iff doing so
81  * wouldn't outrun the current device vtime.  Otherwise, the IO is
82  * suspended until the vtime has progressed enough to cover it.
83  *
84  * 2-2. Vrate Adjustment
85  *
86  * It's unrealistic to expect the cost model to be perfect.  There are too
87  * many devices and even on the same device the overall performance
88  * fluctuates depending on numerous factors such as IO mixture and device
89  * internal garbage collection.  The controller needs to adapt dynamically.
90  *
91  * This is achieved by adjusting the overall IO rate according to how busy
92  * the device is.  If the device becomes overloaded, we're sending down too
93  * many IOs and should generally slow down.  If there are waiting issuers
94  * but the device isn't saturated, we're issuing too few and should
95  * generally speed up.
96  *
97  * To slow down, we lower the vrate - the rate at which the device vtime
98  * passes compared to the wall clock.  For example, if the vtime is running
99  * at the vrate of 75%, all cgroups added up would only be able to issue
100  * 750ms worth of IOs per second, and vice-versa for speeding up.
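 *
 * Concretely, vtime_rate is kept in vtime units per microsecond of wall
 * clock, VTIME_PER_USEC meaning 100%.  At a 75% vrate the device vtime
 * advances by only 0.75 * VTIME_PER_USEC each microsecond, shrinking the
 * total IO cost the cgroups can commit per wall-clock second by the same
 * factor.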
101  *
102  * Device busyness is determined using two criteria - rq wait and
103  * completion latencies.
104  *
105  * When a device gets saturated, the on-device and then the request queues
106  * fill up and a bio which is ready to be issued has to wait for a request
107  * to become available.  When this delay becomes noticeable, it's a clear
108  * indication that the device is saturated and we lower the vrate.  This
109  * saturation signal is fairly conservative as it only triggers when both
110  * hardware and software queues are filled up, and is used as the default
111  * busy signal.
112  *
113  * As devices can have deep queues and be unfair in how the queued commands
114  * are executed, solely depending on rq wait may not result in satisfactory
115  * control quality.  For a better control quality, completion latency QoS
116  * parameters can be configured so that the device is considered saturated
117  * if N'th percentile completion latency rises above the set point.
118  *
119  * The completion latency requirements are a function of both the
120  * underlying device characteristics and the desired IO latency quality of
121  * service.  There is an inherent trade-off - the tighter the latency QoS,
122  * the higher the bandwidth lossage.  Latency QoS is disabled by default
123  * and can be set through /sys/fs/cgroup/io.cost.qos.
124  *
125  * 2-3. Work Conservation
126  *
127  * Imagine two cgroups A and B with equal weights.  A is issuing a small IO
128  * periodically while B is sending out enough parallel IOs to saturate the
129  * device on its own.  Let's say A's usage amounts to 100ms worth of IO
130  * cost per second, i.e., 10% of the device capacity.  The naive
131  * distribution of half and half would lead to 60% utilization of the
132  * device, a significant reduction in the total amount of work done
133  * compared to free-for-all competition.  This is too high a cost to pay
134  * for IO control.
135  *
136  * To conserve the total amount of work done, we keep track of how much
137  * each active cgroup is actually using and yield part of its weight if
138  * there are other cgroups which can make use of it.  In the above case,
139  * A's weight will be lowered so that it hovers above the actual usage and
140  * B would be able to use the rest.
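 *
 * Continuing the example, with A using ~10% of the device, its in-use
 * share is nudged down toward roughly 10% * 125% + 2% = 14.5% (see
 * SURPLUS_SCALE_PCT and SURPLUS_SCALE_ABS below), leaving B free to
 * consume the remaining ~85% while A retains enough headroom to snap back
 * the moment its demand rises.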
141  *
142  * As we don't want to penalize a cgroup for donating its weight, the
143  * surplus weight adjustment factors in a margin and has an immediate
144  * snapback mechanism in case the cgroup needs more IO vtime for itself.
145  *
146  * Note that adjusting down surplus weights has the same effects as
147  * accelerating vtime for other cgroups and work conservation can also be
148  * implemented by adjusting vrate dynamically.  However, squaring who can
149  * donate and should take back how much requires hweight propagations
150  * anyway, making it easier to implement and understand as a separate
151  * mechanism.
152  *
153  * 3. Monitoring
154  *
155  * Instead of debugfs or other clumsy monitoring mechanisms, this
156  * controller uses a drgn based monitoring script -
157  * tools/cgroup/iocost_monitor.py.  For details on drgn, please see
158  * https://github.com/osandov/drgn.  The output looks like the following.
159  *
160  *  sdb RUN   per=300ms cur_per=234.218:v203.695 busy= +1 vrate= 62.12%
161  *                 active      weight      hweight% inflt% dbt  delay usages%
162  *  test/a              *    50/   50  33.33/ 33.33  27.65   2  0*041 033:033:033
163  *  test/b              *   100/  100  66.67/ 66.67  17.56   0  0*000 066:079:077
164  *
165  * - per	: Timer period
166  * - cur_per	: Internal wall and device vtime clock
167  * - vrate	: Device virtual time rate against wall clock
168  * - weight	: Surplus-adjusted and configured weights
169  * - hweight	: Surplus-adjusted and configured hierarchical weights
170  * - inflt	: The percentage of in-flight IO cost at the end of last period
171  * - delay	: Deferred issuer delay induction level and duration
172  * - usages	: Usage history
173  */
174 
175 #include <linux/kernel.h>
176 #include <linux/module.h>
177 #include <linux/timer.h>
178 #include <linux/time64.h>
179 #include <linux/parser.h>
180 #include <linux/sched/signal.h>
181 #include <linux/blk-cgroup.h>
182 #include "blk-rq-qos.h"
183 #include "blk-stat.h"
184 #include "blk-wbt.h"
185 
186 #ifdef CONFIG_TRACEPOINTS
187 
188 /* copied from TRACE_CGROUP_PATH, see cgroup-internal.h */
189 #define TRACE_IOCG_PATH_LEN 1024
190 static DEFINE_SPINLOCK(trace_iocg_path_lock);
191 static char trace_iocg_path[TRACE_IOCG_PATH_LEN];
192 
193 #define TRACE_IOCG_PATH(type, iocg, ...)					\
194 	do {									\
195 		unsigned long flags;						\
196 		if (trace_iocost_##type##_enabled()) {				\
197 			spin_lock_irqsave(&trace_iocg_path_lock, flags);	\
198 			cgroup_path(iocg_to_blkg(iocg)->blkcg->css.cgroup,	\
199 				    trace_iocg_path, TRACE_IOCG_PATH_LEN);	\
200 			trace_iocost_##type(iocg, trace_iocg_path,		\
201 					      ##__VA_ARGS__);			\
202 			spin_unlock_irqrestore(&trace_iocg_path_lock, flags);	\
203 		}								\
204 	} while (0)
205 
206 #else	/* CONFIG_TRACEPOINTS */
207 #define TRACE_IOCG_PATH(type, iocg, ...)	do { } while (0)
208 #endif	/* CONFIG_TRACEPOINTS */
209 
210 enum {
211 	MILLION			= 1000000,
212 
213 	/* timer period is calculated from latency requirements, bound it */
214 	MIN_PERIOD		= USEC_PER_MSEC,
215 	MAX_PERIOD		= USEC_PER_SEC,
216 
217 	/*
218 	 * A cgroup's vtime can run 50% behind the device vtime, which
219 	 * serves as its IO credit buffer.  Surplus weight adjustment is
220 	 * immediately canceled if the vtime margin runs below 10%.
221 	 */
222 	MARGIN_PCT		= 50,
223 	INUSE_MARGIN_PCT	= 10,
224 
225 	/* Have some play in waitq timer operations */
226 	WAITQ_TIMER_MARGIN_PCT	= 5,
227 
228 	/*
229 	 * vtime can wrap well within a reasonable uptime when vrate is
230 	 * consistently raised.  Don't trust recorded cgroup vtime if the
231 	 * period counter indicates that it's older than 5mins.
232 	 */
233 	VTIME_VALID_DUR		= 300 * USEC_PER_SEC,
234 
235 	/*
236 	 * Remember the past three non-zero usages and use the max for
237 	 * surplus calculation.  Three slots guarantee that we remember one
238 	 * full period usage from the last active stretch even after
239 	 * partial deactivation and re-activation periods.  Don't start
240 	 * giving away weight before collecting two data points to prevent
241 	 * hweight adjustments based on one partial activation period.
242 	 */
243 	NR_USAGE_SLOTS		= 3,
244 	MIN_VALID_USAGES	= 2,
245 
246 	/* 1/64k is granular enough and can easily be handled w/ u32 */
247 	HWEIGHT_WHOLE		= 1 << 16,
248 
249 	/*
250 	 * As vtime is used to calculate the cost of each IO, it needs to
251 	 * be fairly high precision.  For example, it should be able to
252 	 * represent the cost of a single page worth of discard with
253 	 * sufficient accuracy.  At the same time, it should be able to
254 	 * represent reasonably long enough durations to be useful and
255 	 * convenient during operation.
256 	 *
257 	 * 1s worth of vtime is 2^37.  This gives us both sub-nanosecond
258 	 * granularity and days of wrap-around time even at extreme vrates.
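	 *
	 * For reference, VTIME_PER_SEC / USEC_PER_SEC works out to ~137438
	 * vtime units per microsecond (~137 per nanosecond), and a u64
	 * vtime only wraps after 2^64 / 2^37 = 2^27 seconds, roughly 4.3
	 * years at a 100% vrate and still about two weeks at the 10000%
	 * ceiling.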
259 	 */
260 	VTIME_PER_SEC_SHIFT	= 37,
261 	VTIME_PER_SEC		= 1LLU << VTIME_PER_SEC_SHIFT,
262 	VTIME_PER_USEC		= VTIME_PER_SEC / USEC_PER_SEC,
263 
264 	/* bound vrate adjustments within two orders of magnitude */
265 	VRATE_MIN_PPM		= 10000,	/* 1% */
266 	VRATE_MAX_PPM		= 100000000,	/* 10000% */
267 
268 	VRATE_MIN		= VTIME_PER_USEC * VRATE_MIN_PPM / MILLION,
269 	VRATE_CLAMP_ADJ_PCT	= 4,
270 
271 	/* if IOs end up waiting for requests, issue less */
272 	RQ_WAIT_BUSY_PCT	= 5,
273 
274 	/* unbusy hysteresis */
275 	UNBUSY_THR_PCT		= 75,
276 
277 	/* don't let cmds which take a very long time pin lagging for too long */
278 	MAX_LAGGING_PERIODS	= 10,
279 
280 	/*
281 	 * If usage% * 1.25 + 2% is lower than hweight% by more than 3%,
282 	 * donate the surplus.
283 	 */
284 	SURPLUS_SCALE_PCT	= 125,			/* * 125% */
285 	SURPLUS_SCALE_ABS	= HWEIGHT_WHOLE / 50,	/* + 2% */
286 	SURPLUS_MIN_ADJ_DELTA	= HWEIGHT_WHOLE / 33,	/* 3% */
287 
288 	/* switch iff the conditions are met for longer than this */
289 	AUTOP_CYCLE_NSEC	= 10LLU * NSEC_PER_SEC,
290 
291 	/*
292 	 * Count IO size in 4k pages.  The 12bit shift helps keep the
293 	 * size-proportional components of the cost calculation within a
294 	 * similar number of digits to the per-IO cost components.
295 	 */
296 	IOC_PAGE_SHIFT		= 12,
297 	IOC_PAGE_SIZE		= 1 << IOC_PAGE_SHIFT,
298 	IOC_SECT_TO_PAGE_SHIFT	= IOC_PAGE_SHIFT - SECTOR_SHIFT,
299 
300 	/* if further apart than 16M, consider it randio for the linear model */
301 	LCOEF_RANDIO_PAGES	= 4096,
302 };
303 
304 enum ioc_running {
305 	IOC_IDLE,
306 	IOC_RUNNING,
307 	IOC_STOP,
308 };
309 
310 /* io.cost.qos controls including per-dev enable of the whole controller */
311 enum {
312 	QOS_ENABLE,
313 	QOS_CTRL,
314 	NR_QOS_CTRL_PARAMS,
315 };
316 
317 /* io.cost.qos params */
318 enum {
319 	QOS_RPPM,
320 	QOS_RLAT,
321 	QOS_WPPM,
322 	QOS_WLAT,
323 	QOS_MIN,
324 	QOS_MAX,
325 	NR_QOS_PARAMS,
326 };
327 
328 /* io.cost.model controls */
329 enum {
330 	COST_CTRL,
331 	COST_MODEL,
332 	NR_COST_CTRL_PARAMS,
333 };
334 
335 /* builtin linear cost model coefficients */
336 enum {
337 	I_LCOEF_RBPS,
338 	I_LCOEF_RSEQIOPS,
339 	I_LCOEF_RRANDIOPS,
340 	I_LCOEF_WBPS,
341 	I_LCOEF_WSEQIOPS,
342 	I_LCOEF_WRANDIOPS,
343 	NR_I_LCOEFS,
344 };
345 
346 enum {
347 	LCOEF_RPAGE,
348 	LCOEF_RSEQIO,
349 	LCOEF_RRANDIO,
350 	LCOEF_WPAGE,
351 	LCOEF_WSEQIO,
352 	LCOEF_WRANDIO,
353 	NR_LCOEFS,
354 };
355 
356 enum {
357 	AUTOP_INVALID,
358 	AUTOP_HDD,
359 	AUTOP_SSD_QD1,
360 	AUTOP_SSD_DFL,
361 	AUTOP_SSD_FAST,
362 };
363 
364 struct ioc_gq;
365 
366 struct ioc_params {
367 	u32				qos[NR_QOS_PARAMS];
368 	u64				i_lcoefs[NR_I_LCOEFS];
369 	u64				lcoefs[NR_LCOEFS];
370 	u32				too_fast_vrate_pct;
371 	u32				too_slow_vrate_pct;
372 };
373 
374 struct ioc_missed {
375 	u32				nr_met;
376 	u32				nr_missed;
377 	u32				last_met;
378 	u32				last_missed;
379 };
380 
381 struct ioc_pcpu_stat {
382 	struct ioc_missed		missed[2];
383 
384 	u64				rq_wait_ns;
385 	u64				last_rq_wait_ns;
386 };
387 
388 /* per device */
389 struct ioc {
390 	struct rq_qos			rqos;
391 
392 	bool				enabled;
393 
394 	struct ioc_params		params;
395 	u32				period_us;
396 	u32				margin_us;
397 	u64				vrate_min;
398 	u64				vrate_max;
399 
400 	spinlock_t			lock;
401 	struct timer_list		timer;
402 	struct list_head		active_iocgs;	/* active cgroups */
403 	struct ioc_pcpu_stat __percpu	*pcpu_stat;
404 
405 	enum ioc_running		running;
406 	atomic64_t			vtime_rate;
407 
408 	seqcount_t			period_seqcount;
409 	u32				period_at;	/* wallclock starttime */
410 	u64				period_at_vtime; /* vtime starttime */
411 
412 	atomic64_t			cur_period;	/* inc'd each period */
413 	int				busy_level;	/* saturation history */
414 
415 	u64				inuse_margin_vtime;
416 	bool				weights_updated;
417 	atomic_t			hweight_gen;	/* for lazy hweights */
418 
419 	u64				autop_too_fast_at;
420 	u64				autop_too_slow_at;
421 	int				autop_idx;
422 	bool				user_qos_params:1;
423 	bool				user_cost_model:1;
424 };
425 
426 /* per device-cgroup pair */
427 struct ioc_gq {
428 	struct blkg_policy_data		pd;
429 	struct ioc			*ioc;
430 
431 	/*
432 	 * A iocg can get its weight from two sources - an explicit
433 	 * per-device-cgroup configuration or the default weight of the
434 	 * cgroup.  `cfg_weight` is the explicit per-device-cgroup
435 	 * configuration.  `weight` is the effective weight considering both
436 	 * sources.
437 	 *
438 	 * When an idle cgroup becomes active its `active` goes from 0 to
439 	 * `weight`.  `inuse` is the surplus adjusted active weight.
440 	 * `active` and `inuse` are used to calculate `hweight_active` and
441 	 * `hweight_inuse`.
442 	 *
443 	 * `last_inuse` remembers `inuse` while an iocg is idle to persist
444 	 * surplus adjustments.
445 	 */
446 	u32				cfg_weight;
447 	u32				weight;
448 	u32				active;
449 	u32				inuse;
450 	u32				last_inuse;
451 
452 	sector_t			cursor;		/* to detect randio */
453 
454 	/*
455 	 * `vtime` is this iocg's vtime cursor which progresses as IOs are
456 	 * issued.  If lagging behind device vtime, the delta represents
457 	 * the currently available IO budget.  If running ahead, the
458 	 * overage.
459 	 *
460 	 * `vtime_done` is the same but progressed on completion rather
461 	 * than issue.  The delta behind `vtime` represents the cost of
462 	 * currently in-flight IOs.
463 	 *
464 	 * `last_vtime` is used to remember `vtime` at the end of the last
465 	 * period to calculate utilization.
466 	 */
467 	atomic64_t			vtime;
468 	atomic64_t			done_vtime;
469 	atomic64_t			abs_vdebt;
470 	u64				last_vtime;
471 
472 	/*
473 	 * The period this iocg was last active in.  Used for deactivation
474 	 * and invalidating `vtime`.
475 	 */
476 	atomic64_t			active_period;
477 	struct list_head		active_list;
478 
479 	/* see __propagate_active_weight() and current_hweight() for details */
480 	u64				child_active_sum;
481 	u64				child_inuse_sum;
482 	int				hweight_gen;
483 	u32				hweight_active;
484 	u32				hweight_inuse;
485 	bool				has_surplus;
486 
487 	struct wait_queue_head		waitq;
488 	struct hrtimer			waitq_timer;
489 	struct hrtimer			delay_timer;
490 
491 	/* usage is recorded as fractions of HWEIGHT_WHOLE */
492 	int				usage_idx;
493 	u32				usages[NR_USAGE_SLOTS];
494 
495 	/* this iocg's depth in the hierarchy and ancestors including self */
496 	int				level;
497 	struct ioc_gq			*ancestors[];
498 };
499 
500 /* per cgroup */
501 struct ioc_cgrp {
502 	struct blkcg_policy_data	cpd;
503 	unsigned int			dfl_weight;
504 };
505 
506 struct ioc_now {
507 	u64				now_ns;
508 	u32				now;
509 	u64				vnow;
510 	u64				vrate;
511 };
512 
513 struct iocg_wait {
514 	struct wait_queue_entry		wait;
515 	struct bio			*bio;
516 	u64				abs_cost;
517 	bool				committed;
518 };
519 
520 struct iocg_wake_ctx {
521 	struct ioc_gq			*iocg;
522 	u32				hw_inuse;
523 	s64				vbudget;
524 };
525 
526 static const struct ioc_params autop[] = {
527 	[AUTOP_HDD] = {
528 		.qos				= {
529 			[QOS_RLAT]		=        250000, /* 250ms */
530 			[QOS_WLAT]		=        250000,
531 			[QOS_MIN]		= VRATE_MIN_PPM,
532 			[QOS_MAX]		= VRATE_MAX_PPM,
533 		},
534 		.i_lcoefs			= {
535 			[I_LCOEF_RBPS]		=     174019176,
536 			[I_LCOEF_RSEQIOPS]	=         41708,
537 			[I_LCOEF_RRANDIOPS]	=           370,
538 			[I_LCOEF_WBPS]		=     178075866,
539 			[I_LCOEF_WSEQIOPS]	=         42705,
540 			[I_LCOEF_WRANDIOPS]	=           378,
541 		},
542 	},
543 	[AUTOP_SSD_QD1] = {
544 		.qos				= {
545 			[QOS_RLAT]		=         25000, /* 25ms */
546 			[QOS_WLAT]		=         25000,
547 			[QOS_MIN]		= VRATE_MIN_PPM,
548 			[QOS_MAX]		= VRATE_MAX_PPM,
549 		},
550 		.i_lcoefs			= {
551 			[I_LCOEF_RBPS]		=     245855193,
552 			[I_LCOEF_RSEQIOPS]	=         61575,
553 			[I_LCOEF_RRANDIOPS]	=          6946,
554 			[I_LCOEF_WBPS]		=     141365009,
555 			[I_LCOEF_WSEQIOPS]	=         33716,
556 			[I_LCOEF_WRANDIOPS]	=         26796,
557 		},
558 	},
559 	[AUTOP_SSD_DFL] = {
560 		.qos				= {
561 			[QOS_RLAT]		=         25000, /* 25ms */
562 			[QOS_WLAT]		=         25000,
563 			[QOS_MIN]		= VRATE_MIN_PPM,
564 			[QOS_MAX]		= VRATE_MAX_PPM,
565 		},
566 		.i_lcoefs			= {
567 			[I_LCOEF_RBPS]		=     488636629,
568 			[I_LCOEF_RSEQIOPS]	=          8932,
569 			[I_LCOEF_RRANDIOPS]	=          8518,
570 			[I_LCOEF_WBPS]		=     427891549,
571 			[I_LCOEF_WSEQIOPS]	=         28755,
572 			[I_LCOEF_WRANDIOPS]	=         21940,
573 		},
574 		.too_fast_vrate_pct		=           500,
575 	},
576 	[AUTOP_SSD_FAST] = {
577 		.qos				= {
578 			[QOS_RLAT]		=          5000, /* 5ms */
579 			[QOS_WLAT]		=          5000,
580 			[QOS_MIN]		= VRATE_MIN_PPM,
581 			[QOS_MAX]		= VRATE_MAX_PPM,
582 		},
583 		.i_lcoefs			= {
584 			[I_LCOEF_RBPS]		=    3102524156LLU,
585 			[I_LCOEF_RSEQIOPS]	=        724816,
586 			[I_LCOEF_RRANDIOPS]	=        778122,
587 			[I_LCOEF_WBPS]		=    1742780862LLU,
588 			[I_LCOEF_WSEQIOPS]	=        425702,
589 			[I_LCOEF_WRANDIOPS]	=	 443193,
590 		},
591 		.too_slow_vrate_pct		=            10,
592 	},
593 };
594 
595 /*
596  * vrate adjust percentages indexed by ioc->busy_level.  We adjust up on
597  * vtime credit shortage and down on device saturation.
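 *
 * For example, a busy_level of -8 indexes the value 1 below and nudges
 * vrate up by 1%, while a sustained busy_level of +40 indexes 4 and cuts
 * vrate by 4% per period (see the busy_level handling in ioc_timer_fn()).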
598  */
599 static u32 vrate_adj_pct[] =
600 	{ 0, 0, 0, 0,
601 	  1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,
602 	  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
603 	  4, 4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 16 };
604 
605 static struct blkcg_policy blkcg_policy_iocost;
606 
607 /* accessors and helpers */
608 static struct ioc *rqos_to_ioc(struct rq_qos *rqos)
609 {
610 	return container_of(rqos, struct ioc, rqos);
611 }
612 
613 static struct ioc *q_to_ioc(struct request_queue *q)
614 {
615 	return rqos_to_ioc(rq_qos_id(q, RQ_QOS_COST));
616 }
617 
618 static const char *q_name(struct request_queue *q)
619 {
620 	if (test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
621 		return kobject_name(q->kobj.parent);
622 	else
623 		return "<unknown>";
624 }
625 
626 static const char __maybe_unused *ioc_name(struct ioc *ioc)
627 {
628 	return q_name(ioc->rqos.q);
629 }
630 
631 static struct ioc_gq *pd_to_iocg(struct blkg_policy_data *pd)
632 {
633 	return pd ? container_of(pd, struct ioc_gq, pd) : NULL;
634 }
635 
636 static struct ioc_gq *blkg_to_iocg(struct blkcg_gq *blkg)
637 {
638 	return pd_to_iocg(blkg_to_pd(blkg, &blkcg_policy_iocost));
639 }
640 
641 static struct blkcg_gq *iocg_to_blkg(struct ioc_gq *iocg)
642 {
643 	return pd_to_blkg(&iocg->pd);
644 }
645 
646 static struct ioc_cgrp *blkcg_to_iocc(struct blkcg *blkcg)
647 {
648 	return container_of(blkcg_to_cpd(blkcg, &blkcg_policy_iocost),
649 			    struct ioc_cgrp, cpd);
650 }
651 
652 /*
653  * Scale @abs_cost to the inverse of @hw_inuse.  The lower the hierarchical
654  * weight, the more expensive each IO.  Must round up.
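 *
 * For example, at an hw_inuse of 25% (HWEIGHT_WHOLE / 4), an abs_cost of
 * 100 is charged as a cost of 400.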
655  */
656 static u64 abs_cost_to_cost(u64 abs_cost, u32 hw_inuse)
657 {
658 	return DIV64_U64_ROUND_UP(abs_cost * HWEIGHT_WHOLE, hw_inuse);
659 }
660 
661 /*
662  * The inverse of abs_cost_to_cost().  Must round up.
663  */
664 static u64 cost_to_abs_cost(u64 cost, u32 hw_inuse)
665 {
666 	return DIV64_U64_ROUND_UP(cost * hw_inuse, HWEIGHT_WHOLE);
667 }
668 
669 static void iocg_commit_bio(struct ioc_gq *iocg, struct bio *bio, u64 cost)
670 {
671 	bio->bi_iocost_cost = cost;
672 	atomic64_add(cost, &iocg->vtime);
673 }
674 
675 #define CREATE_TRACE_POINTS
676 #include <trace/events/iocost.h>
677 
678 /* latency QoS params changed, update period_us and all the dependent params */
679 static void ioc_refresh_period_us(struct ioc *ioc)
680 {
681 	u32 ppm, lat, multi, period_us;
682 
683 	lockdep_assert_held(&ioc->lock);
684 
685 	/* pick the higher latency target */
686 	if (ioc->params.qos[QOS_RLAT] >= ioc->params.qos[QOS_WLAT]) {
687 		ppm = ioc->params.qos[QOS_RPPM];
688 		lat = ioc->params.qos[QOS_RLAT];
689 	} else {
690 		ppm = ioc->params.qos[QOS_WPPM];
691 		lat = ioc->params.qos[QOS_WLAT];
692 	}
693 
694 	/*
695 	 * We want the period to be long enough to contain a healthy number
696 	 * of IOs while short enough for granular control.  Define it as a
697 	 * multiple of the latency target.  Ideally, the multiplier should
698 	 * be scaled according to the percentile so that it would nominally
699 	 * contain a certain number of requests.  Let's be simpler and
700 	 * scale it linearly so that it's 2x >= pct(90) and 10x at pct(50).
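	 *
	 * For example, a 95th percentile target (ppm = 950000) yields
	 * multi = max((MILLION - 950000) / 50000, 2) = 2 while a 50th
	 * percentile target (ppm = 500000) yields 10.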
701 	 */
702 	if (ppm)
703 		multi = max_t(u32, (MILLION - ppm) / 50000, 2);
704 	else
705 		multi = 2;
706 	period_us = multi * lat;
707 	period_us = clamp_t(u32, period_us, MIN_PERIOD, MAX_PERIOD);
708 
709 	/* calculate dependent params */
710 	ioc->period_us = period_us;
711 	ioc->margin_us = period_us * MARGIN_PCT / 100;
712 	ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
713 			period_us * VTIME_PER_USEC * INUSE_MARGIN_PCT, 100);
714 }
715 
716 static int ioc_autop_idx(struct ioc *ioc)
717 {
718 	int idx = ioc->autop_idx;
719 	const struct ioc_params *p = &autop[idx];
720 	u32 vrate_pct;
721 	u64 now_ns;
722 
723 	/* rotational? */
724 	if (!blk_queue_nonrot(ioc->rqos.q))
725 		return AUTOP_HDD;
726 
727 	/* handle SATA SSDs w/ broken NCQ */
728 	if (blk_queue_depth(ioc->rqos.q) == 1)
729 		return AUTOP_SSD_QD1;
730 
731 	/* use one of the normal ssd sets */
732 	if (idx < AUTOP_SSD_DFL)
733 		return AUTOP_SSD_DFL;
734 
735 	/* if user is overriding anything, maintain what was there */
736 	if (ioc->user_qos_params || ioc->user_cost_model)
737 		return idx;
738 
739 	/* step up/down based on the vrate */
740 	vrate_pct = div64_u64(atomic64_read(&ioc->vtime_rate) * 100,
741 			      VTIME_PER_USEC);
742 	now_ns = ktime_get_ns();
743 
744 	if (p->too_fast_vrate_pct && p->too_fast_vrate_pct <= vrate_pct) {
745 		if (!ioc->autop_too_fast_at)
746 			ioc->autop_too_fast_at = now_ns;
747 		if (now_ns - ioc->autop_too_fast_at >= AUTOP_CYCLE_NSEC)
748 			return idx + 1;
749 	} else {
750 		ioc->autop_too_fast_at = 0;
751 	}
752 
753 	if (p->too_slow_vrate_pct && p->too_slow_vrate_pct >= vrate_pct) {
754 		if (!ioc->autop_too_slow_at)
755 			ioc->autop_too_slow_at = now_ns;
756 		if (now_ns - ioc->autop_too_slow_at >= AUTOP_CYCLE_NSEC)
757 			return idx - 1;
758 	} else {
759 		ioc->autop_too_slow_at = 0;
760 	}
761 
762 	return idx;
763 }
764 
765 /*
766  * Take the following as input
767  *
768  *  @bps	maximum sequential throughput
769  *  @seqiops	maximum sequential 4k iops
770  *  @randiops	maximum random 4k iops
771  *
772  * and calculate the linear model cost coefficients.
773  *
774  *  *@page	per-page cost		1s / (@bps / 4096)
775  *  *@seqio	base cost of a seq IO	max((1s / @seqiops) - *@page, 0)
776  *  *@randio	base cost of a rand IO	max((1s / @randiops) - *@page, 0)
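 *
 * As a rough illustration with made-up numbers: @bps = 409600000
 * (100k 4k pages/sec) gives *@page = VTIME_PER_SEC / 100000, and
 * @seqiops = 50000 then gives *@seqio = VTIME_PER_SEC / 50000 minus that
 * per-page cost.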
777  */
778 static void calc_lcoefs(u64 bps, u64 seqiops, u64 randiops,
779 			u64 *page, u64 *seqio, u64 *randio)
780 {
781 	u64 v;
782 
783 	*page = *seqio = *randio = 0;
784 
785 	if (bps)
786 		*page = DIV64_U64_ROUND_UP(VTIME_PER_SEC,
787 					   DIV_ROUND_UP_ULL(bps, IOC_PAGE_SIZE));
788 
789 	if (seqiops) {
790 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, seqiops);
791 		if (v > *page)
792 			*seqio = v - *page;
793 	}
794 
795 	if (randiops) {
796 		v = DIV64_U64_ROUND_UP(VTIME_PER_SEC, randiops);
797 		if (v > *page)
798 			*randio = v - *page;
799 	}
800 }
801 
802 static void ioc_refresh_lcoefs(struct ioc *ioc)
803 {
804 	u64 *u = ioc->params.i_lcoefs;
805 	u64 *c = ioc->params.lcoefs;
806 
807 	calc_lcoefs(u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
808 		    &c[LCOEF_RPAGE], &c[LCOEF_RSEQIO], &c[LCOEF_RRANDIO]);
809 	calc_lcoefs(u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS],
810 		    &c[LCOEF_WPAGE], &c[LCOEF_WSEQIO], &c[LCOEF_WRANDIO]);
811 }
812 
813 static bool ioc_refresh_params(struct ioc *ioc, bool force)
814 {
815 	const struct ioc_params *p;
816 	int idx;
817 
818 	lockdep_assert_held(&ioc->lock);
819 
820 	idx = ioc_autop_idx(ioc);
821 	p = &autop[idx];
822 
823 	if (idx == ioc->autop_idx && !force)
824 		return false;
825 
826 	if (idx != ioc->autop_idx)
827 		atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
828 
829 	ioc->autop_idx = idx;
830 	ioc->autop_too_fast_at = 0;
831 	ioc->autop_too_slow_at = 0;
832 
833 	if (!ioc->user_qos_params)
834 		memcpy(ioc->params.qos, p->qos, sizeof(p->qos));
835 	if (!ioc->user_cost_model)
836 		memcpy(ioc->params.i_lcoefs, p->i_lcoefs, sizeof(p->i_lcoefs));
837 
838 	ioc_refresh_period_us(ioc);
839 	ioc_refresh_lcoefs(ioc);
840 
841 	ioc->vrate_min = DIV64_U64_ROUND_UP((u64)ioc->params.qos[QOS_MIN] *
842 					    VTIME_PER_USEC, MILLION);
843 	ioc->vrate_max = div64_u64((u64)ioc->params.qos[QOS_MAX] *
844 				   VTIME_PER_USEC, MILLION);
845 
846 	return true;
847 }
848 
849 /* take a snapshot of the current [v]time and vrate */
850 static void ioc_now(struct ioc *ioc, struct ioc_now *now)
851 {
852 	unsigned seq;
853 
854 	now->now_ns = ktime_get();
855 	now->now = ktime_to_us(now->now_ns);
856 	now->vrate = atomic64_read(&ioc->vtime_rate);
857 
858 	/*
859 	 * The current vtime is
860 	 *
861 	 *   vtime at period start + (wallclock time since the start) * vrate
862 	 *
863 	 * As a consistent snapshot of `period_at_vtime` and `period_at` is
864 	 * needed, they're seqcount protected.
865 	 */
866 	do {
867 		seq = read_seqcount_begin(&ioc->period_seqcount);
868 		now->vnow = ioc->period_at_vtime +
869 			(now->now - ioc->period_at) * now->vrate;
870 	} while (read_seqcount_retry(&ioc->period_seqcount, seq));
871 }
872 
873 static void ioc_start_period(struct ioc *ioc, struct ioc_now *now)
874 {
875 	lockdep_assert_held(&ioc->lock);
876 	WARN_ON_ONCE(ioc->running != IOC_RUNNING);
877 
878 	write_seqcount_begin(&ioc->period_seqcount);
879 	ioc->period_at = now->now;
880 	ioc->period_at_vtime = now->vnow;
881 	write_seqcount_end(&ioc->period_seqcount);
882 
883 	ioc->timer.expires = jiffies + usecs_to_jiffies(ioc->period_us);
884 	add_timer(&ioc->timer);
885 }
886 
887 /*
888  * Update @iocg's `active` and `inuse` to @active and @inuse, update level
889  * weight sums and propagate upwards accordingly.
890  */
891 static void __propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
892 {
893 	struct ioc *ioc = iocg->ioc;
894 	int lvl;
895 
896 	lockdep_assert_held(&ioc->lock);
897 
898 	inuse = min(active, inuse);
899 
900 	for (lvl = iocg->level - 1; lvl >= 0; lvl--) {
901 		struct ioc_gq *parent = iocg->ancestors[lvl];
902 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
903 		u32 parent_active = 0, parent_inuse = 0;
904 
905 		/* update the level sums */
906 		parent->child_active_sum += (s32)(active - child->active);
907 		parent->child_inuse_sum += (s32)(inuse - child->inuse);
908 		/* apply the updates */
909 		child->active = active;
910 		child->inuse = inuse;
911 
912 		/*
913 		 * The delta between the inuse and active sums indicates how
914 		 * much of the weight is being given away.  The parent's inuse
915 		 * and active should reflect the ratio.
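		 *
		 * For example, if child_active_sum is 200 and
		 * child_inuse_sum is 150 with a parent weight of 100, the
		 * parent's inuse becomes 100 * 150 / 200 = 75.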
916 		 */
917 		if (parent->child_active_sum) {
918 			parent_active = parent->weight;
919 			parent_inuse = DIV64_U64_ROUND_UP(
920 				parent_active * parent->child_inuse_sum,
921 				parent->child_active_sum);
922 		}
923 
924 		/* do we need to keep walking up? */
925 		if (parent_active == parent->active &&
926 		    parent_inuse == parent->inuse)
927 			break;
928 
929 		active = parent_active;
930 		inuse = parent_inuse;
931 	}
932 
933 	ioc->weights_updated = true;
934 }
935 
936 static void commit_active_weights(struct ioc *ioc)
937 {
938 	lockdep_assert_held(&ioc->lock);
939 
940 	if (ioc->weights_updated) {
941 		/* paired with rmb in current_hweight(), see there */
942 		smp_wmb();
943 		atomic_inc(&ioc->hweight_gen);
944 		ioc->weights_updated = false;
945 	}
946 }
947 
948 static void propagate_active_weight(struct ioc_gq *iocg, u32 active, u32 inuse)
949 {
950 	__propagate_active_weight(iocg, active, inuse);
951 	commit_active_weights(iocg->ioc);
952 }
953 
954 static void current_hweight(struct ioc_gq *iocg, u32 *hw_activep, u32 *hw_inusep)
955 {
956 	struct ioc *ioc = iocg->ioc;
957 	int lvl;
958 	u32 hwa, hwi;
959 	int ioc_gen;
960 
961 	/* hot path - if uptodate, use cached */
962 	ioc_gen = atomic_read(&ioc->hweight_gen);
963 	if (ioc_gen == iocg->hweight_gen)
964 		goto out;
965 
966 	/*
967 	 * Paired with wmb in commit_active_weights().  If we saw the
968 	 * updated hweight_gen, all the weight updates from
969 	 * __propagate_active_weight() are visible too.
970 	 *
971 	 * We can race with weight updates during calculation and get it
972 	 * wrong.  However, hweight_gen would have changed and a future
973 	 * reader will recalculate and we're guaranteed to discard the
974 	 * wrong result soon.
975 	 */
976 	smp_rmb();
977 
978 	hwa = hwi = HWEIGHT_WHOLE;
979 	for (lvl = 0; lvl <= iocg->level - 1; lvl++) {
980 		struct ioc_gq *parent = iocg->ancestors[lvl];
981 		struct ioc_gq *child = iocg->ancestors[lvl + 1];
982 		u32 active_sum = READ_ONCE(parent->child_active_sum);
983 		u32 inuse_sum = READ_ONCE(parent->child_inuse_sum);
984 		u32 active = READ_ONCE(child->active);
985 		u32 inuse = READ_ONCE(child->inuse);
986 
987 		/* we can race with deactivations and either may read as zero */
988 		if (!active_sum || !inuse_sum)
989 			continue;
990 
991 		active_sum = max(active, active_sum);
992 		hwa = hwa * active / active_sum;	/* max 16bits * 10000 */
993 
994 		inuse_sum = max(inuse, inuse_sum);
995 		hwi = hwi * inuse / inuse_sum;		/* max 16bits * 10000 */
996 	}
997 
998 	iocg->hweight_active = max_t(u32, hwa, 1);
999 	iocg->hweight_inuse = max_t(u32, hwi, 1);
1000 	iocg->hweight_gen = ioc_gen;
1001 out:
1002 	if (hw_activep)
1003 		*hw_activep = iocg->hweight_active;
1004 	if (hw_inusep)
1005 		*hw_inusep = iocg->hweight_inuse;
1006 }
1007 
1008 static void weight_updated(struct ioc_gq *iocg)
1009 {
1010 	struct ioc *ioc = iocg->ioc;
1011 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1012 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkg->blkcg);
1013 	u32 weight;
1014 
1015 	lockdep_assert_held(&ioc->lock);
1016 
1017 	weight = iocg->cfg_weight ?: iocc->dfl_weight;
1018 	if (weight != iocg->weight && iocg->active)
1019 		propagate_active_weight(iocg, weight,
1020 			DIV64_U64_ROUND_UP(iocg->inuse * weight, iocg->weight));
1021 	iocg->weight = weight;
1022 }
1023 
1024 static bool iocg_activate(struct ioc_gq *iocg, struct ioc_now *now)
1025 {
1026 	struct ioc *ioc = iocg->ioc;
1027 	u64 last_period, cur_period, max_period_delta;
1028 	u64 vtime, vmargin, vmin;
1029 	int i;
1030 
1031 	/*
1032 	 * If we seem to be already active, just update the stamp to tell the
1033 	 * timer that we're still active.  We don't mind occasional races.
1034 	 */
1035 	if (!list_empty(&iocg->active_list)) {
1036 		ioc_now(ioc, now);
1037 		cur_period = atomic64_read(&ioc->cur_period);
1038 		if (atomic64_read(&iocg->active_period) != cur_period)
1039 			atomic64_set(&iocg->active_period, cur_period);
1040 		return true;
1041 	}
1042 
1043 	/* racy check on internal node IOs, treat as root level IOs */
1044 	if (iocg->child_active_sum)
1045 		return false;
1046 
1047 	spin_lock_irq(&ioc->lock);
1048 
1049 	ioc_now(ioc, now);
1050 
1051 	/* update period */
1052 	cur_period = atomic64_read(&ioc->cur_period);
1053 	last_period = atomic64_read(&iocg->active_period);
1054 	atomic64_set(&iocg->active_period, cur_period);
1055 
1056 	/* already activated or breaking leaf-only constraint? */
1057 	if (!list_empty(&iocg->active_list))
1058 		goto succeed_unlock;
1059 	for (i = iocg->level - 1; i > 0; i--)
1060 		if (!list_empty(&iocg->ancestors[i]->active_list))
1061 			goto fail_unlock;
1062 
1063 	if (iocg->child_active_sum)
1064 		goto fail_unlock;
1065 
1066 	/*
1067 	 * vtime may wrap when vrate is raised substantially due to
1068 	 * underestimated IO costs.  Look at the period and ignore its
1069 	 * vtime if the iocg has been idle for too long.  Also, cap the
1070 	 * budget it can start with to the margin.
1071 	 */
1072 	max_period_delta = DIV64_U64_ROUND_UP(VTIME_VALID_DUR, ioc->period_us);
1073 	vtime = atomic64_read(&iocg->vtime);
1074 	vmargin = ioc->margin_us * now->vrate;
1075 	vmin = now->vnow - vmargin;
1076 
1077 	if (last_period + max_period_delta < cur_period ||
1078 	    time_before64(vtime, vmin)) {
1079 		atomic64_add(vmin - vtime, &iocg->vtime);
1080 		atomic64_add(vmin - vtime, &iocg->done_vtime);
1081 		vtime = vmin;
1082 	}
1083 
1084 	/*
1085 	 * Activate, propagate weight and start period timer if not
1086 	 * running.  Reset hweight_gen to avoid accidental match from
1087 	 * wrapping.
1088 	 */
1089 	iocg->hweight_gen = atomic_read(&ioc->hweight_gen) - 1;
1090 	list_add(&iocg->active_list, &ioc->active_iocgs);
1091 	propagate_active_weight(iocg, iocg->weight,
1092 				iocg->last_inuse ?: iocg->weight);
1093 
1094 	TRACE_IOCG_PATH(iocg_activate, iocg, now,
1095 			last_period, cur_period, vtime);
1096 
1097 	iocg->last_vtime = vtime;
1098 
1099 	if (ioc->running == IOC_IDLE) {
1100 		ioc->running = IOC_RUNNING;
1101 		ioc_start_period(ioc, now);
1102 	}
1103 
1104 succeed_unlock:
1105 	spin_unlock_irq(&ioc->lock);
1106 	return true;
1107 
1108 fail_unlock:
1109 	spin_unlock_irq(&ioc->lock);
1110 	return false;
1111 }
1112 
1113 static int iocg_wake_fn(struct wait_queue_entry *wq_entry, unsigned mode,
1114 			int flags, void *key)
1115 {
1116 	struct iocg_wait *wait = container_of(wq_entry, struct iocg_wait, wait);
1117 	struct iocg_wake_ctx *ctx = (struct iocg_wake_ctx *)key;
1118 	u64 cost = abs_cost_to_cost(wait->abs_cost, ctx->hw_inuse);
1119 
1120 	ctx->vbudget -= cost;
1121 
1122 	if (ctx->vbudget < 0)
1123 		return -1;
1124 
1125 	iocg_commit_bio(ctx->iocg, wait->bio, cost);
1126 
1127 	/*
1128 	 * autoremove_wake_function() removes the wait entry only when it
1129 	 * actually changed the task state.  We want the wait always
1130 	 * removed.  Remove explicitly and use default_wake_function().
1131 	 */
1132 	list_del_init(&wq_entry->entry);
1133 	wait->committed = true;
1134 
1135 	default_wake_function(wq_entry, mode, flags, key);
1136 	return 0;
1137 }
1138 
1139 static void iocg_kick_waitq(struct ioc_gq *iocg, struct ioc_now *now)
1140 {
1141 	struct ioc *ioc = iocg->ioc;
1142 	struct iocg_wake_ctx ctx = { .iocg = iocg };
1143 	u64 margin_ns = (u64)(ioc->period_us *
1144 			      WAITQ_TIMER_MARGIN_PCT / 100) * NSEC_PER_USEC;
1145 	u64 abs_vdebt, vdebt, vshortage, expires, oexpires;
1146 	s64 vbudget;
1147 	u32 hw_inuse;
1148 
1149 	lockdep_assert_held(&iocg->waitq.lock);
1150 
1151 	current_hweight(iocg, NULL, &hw_inuse);
1152 	vbudget = now->vnow - atomic64_read(&iocg->vtime);
1153 
1154 	/* pay off debt */
1155 	abs_vdebt = atomic64_read(&iocg->abs_vdebt);
1156 	vdebt = abs_cost_to_cost(abs_vdebt, hw_inuse);
1157 	if (vdebt && vbudget > 0) {
1158 		u64 delta = min_t(u64, vbudget, vdebt);
1159 		u64 abs_delta = min(cost_to_abs_cost(delta, hw_inuse),
1160 				    abs_vdebt);
1161 
1162 		atomic64_add(delta, &iocg->vtime);
1163 		atomic64_add(delta, &iocg->done_vtime);
1164 		atomic64_sub(abs_delta, &iocg->abs_vdebt);
1165 		if (WARN_ON_ONCE(atomic64_read(&iocg->abs_vdebt) < 0))
1166 			atomic64_set(&iocg->abs_vdebt, 0);
1167 	}
1168 
1169 	/*
1170 	 * Wake up the ones which are due and see how much vtime we'll need
1171 	 * for the next one.
1172 	 */
1173 	ctx.hw_inuse = hw_inuse;
1174 	ctx.vbudget = vbudget - vdebt;
1175 	__wake_up_locked_key(&iocg->waitq, TASK_NORMAL, &ctx);
1176 	if (!waitqueue_active(&iocg->waitq))
1177 		return;
1178 	if (WARN_ON_ONCE(ctx.vbudget >= 0))
1179 		return;
1180 
1181 	/* determine next wakeup, add a quarter margin to guarantee chunking */
1182 	vshortage = -ctx.vbudget;
1183 	expires = now->now_ns +
1184 		DIV64_U64_ROUND_UP(vshortage, now->vrate) * NSEC_PER_USEC;
1185 	expires += margin_ns / 4;
1186 
1187 	/* if already active and close enough, don't bother */
1188 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->waitq_timer));
1189 	if (hrtimer_is_queued(&iocg->waitq_timer) &&
1190 	    abs(oexpires - expires) <= margin_ns / 4)
1191 		return;
1192 
1193 	hrtimer_start_range_ns(&iocg->waitq_timer, ns_to_ktime(expires),
1194 			       margin_ns / 4, HRTIMER_MODE_ABS);
1195 }
1196 
1197 static enum hrtimer_restart iocg_waitq_timer_fn(struct hrtimer *timer)
1198 {
1199 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, waitq_timer);
1200 	struct ioc_now now;
1201 	unsigned long flags;
1202 
1203 	ioc_now(iocg->ioc, &now);
1204 
1205 	spin_lock_irqsave(&iocg->waitq.lock, flags);
1206 	iocg_kick_waitq(iocg, &now);
1207 	spin_unlock_irqrestore(&iocg->waitq.lock, flags);
1208 
1209 	return HRTIMER_NORESTART;
1210 }
1211 
1212 static bool iocg_kick_delay(struct ioc_gq *iocg, struct ioc_now *now, u64 cost)
1213 {
1214 	struct ioc *ioc = iocg->ioc;
1215 	struct blkcg_gq *blkg = iocg_to_blkg(iocg);
1216 	u64 vtime = atomic64_read(&iocg->vtime);
1217 	u64 vmargin = ioc->margin_us * now->vrate;
1218 	u64 margin_ns = ioc->margin_us * NSEC_PER_USEC;
1219 	u64 expires, oexpires;
1220 	u32 hw_inuse;
1221 
1222 	/* debt-adjust vtime */
1223 	current_hweight(iocg, NULL, &hw_inuse);
1224 	vtime += abs_cost_to_cost(atomic64_read(&iocg->abs_vdebt), hw_inuse);
1225 
1226 	/* clear or maintain depending on the overage */
1227 	if (time_before_eq64(vtime, now->vnow)) {
1228 		blkcg_clear_delay(blkg);
1229 		return false;
1230 	}
1231 	if (!atomic_read(&blkg->use_delay) &&
1232 	    time_before_eq64(vtime, now->vnow + vmargin))
1233 		return false;
1234 
1235 	/* use delay */
1236 	if (cost) {
1237 		u64 cost_ns = DIV64_U64_ROUND_UP(cost * NSEC_PER_USEC,
1238 						 now->vrate);
1239 		blkcg_add_delay(blkg, now->now_ns, cost_ns);
1240 	}
1241 	blkcg_use_delay(blkg);
1242 
1243 	expires = now->now_ns + DIV64_U64_ROUND_UP(vtime - now->vnow,
1244 						   now->vrate) * NSEC_PER_USEC;
1245 
1246 	/* if already active and close enough, don't bother */
1247 	oexpires = ktime_to_ns(hrtimer_get_softexpires(&iocg->delay_timer));
1248 	if (hrtimer_is_queued(&iocg->delay_timer) &&
1249 	    abs(oexpires - expires) <= margin_ns / 4)
1250 		return true;
1251 
1252 	hrtimer_start_range_ns(&iocg->delay_timer, ns_to_ktime(expires),
1253 			       margin_ns / 4, HRTIMER_MODE_ABS);
1254 	return true;
1255 }
1256 
1257 static enum hrtimer_restart iocg_delay_timer_fn(struct hrtimer *timer)
1258 {
1259 	struct ioc_gq *iocg = container_of(timer, struct ioc_gq, delay_timer);
1260 	struct ioc_now now;
1261 
1262 	ioc_now(iocg->ioc, &now);
1263 	iocg_kick_delay(iocg, &now, 0);
1264 
1265 	return HRTIMER_NORESTART;
1266 }
1267 
1268 static void ioc_lat_stat(struct ioc *ioc, u32 *missed_ppm_ar, u32 *rq_wait_pct_p)
1269 {
1270 	u32 nr_met[2] = { };
1271 	u32 nr_missed[2] = { };
1272 	u64 rq_wait_ns = 0;
1273 	int cpu, rw;
1274 
1275 	for_each_online_cpu(cpu) {
1276 		struct ioc_pcpu_stat *stat = per_cpu_ptr(ioc->pcpu_stat, cpu);
1277 		u64 this_rq_wait_ns;
1278 
1279 		for (rw = READ; rw <= WRITE; rw++) {
1280 			u32 this_met = READ_ONCE(stat->missed[rw].nr_met);
1281 			u32 this_missed = READ_ONCE(stat->missed[rw].nr_missed);
1282 
1283 			nr_met[rw] += this_met - stat->missed[rw].last_met;
1284 			nr_missed[rw] += this_missed - stat->missed[rw].last_missed;
1285 			stat->missed[rw].last_met = this_met;
1286 			stat->missed[rw].last_missed = this_missed;
1287 		}
1288 
1289 		this_rq_wait_ns = READ_ONCE(stat->rq_wait_ns);
1290 		rq_wait_ns += this_rq_wait_ns - stat->last_rq_wait_ns;
1291 		stat->last_rq_wait_ns = this_rq_wait_ns;
1292 	}
1293 
1294 	for (rw = READ; rw <= WRITE; rw++) {
1295 		if (nr_met[rw] + nr_missed[rw])
1296 			missed_ppm_ar[rw] =
1297 				DIV64_U64_ROUND_UP((u64)nr_missed[rw] * MILLION,
1298 						   nr_met[rw] + nr_missed[rw]);
1299 		else
1300 			missed_ppm_ar[rw] = 0;
1301 	}
1302 
1303 	*rq_wait_pct_p = div64_u64(rq_wait_ns * 100,
1304 				   ioc->period_us * NSEC_PER_USEC);
1305 }
1306 
1307 /* was iocg idle this period? */
1308 static bool iocg_is_idle(struct ioc_gq *iocg)
1309 {
1310 	struct ioc *ioc = iocg->ioc;
1311 
1312 	/* did something get issued this period? */
1313 	if (atomic64_read(&iocg->active_period) ==
1314 	    atomic64_read(&ioc->cur_period))
1315 		return false;
1316 
1317 	/* is something in flight? */
1318 	if (atomic64_read(&iocg->done_vtime) != atomic64_read(&iocg->vtime))
1319 		return false;
1320 
1321 	return true;
1322 }
1323 
1324 /* returns usage with margin added if surplus is large enough */
1325 static u32 surplus_adjusted_hweight_inuse(u32 usage, u32 hw_inuse)
1326 {
1327 	/* add margin */
1328 	usage = DIV_ROUND_UP(usage * SURPLUS_SCALE_PCT, 100);
1329 	usage += SURPLUS_SCALE_ABS;
1330 
1331 	/* don't bother if the surplus is too small */
1332 	if (usage + SURPLUS_MIN_ADJ_DELTA > hw_inuse)
1333 		return 0;
1334 
1335 	return usage;
1336 }
1337 
1338 static void ioc_timer_fn(struct timer_list *timer)
1339 {
1340 	struct ioc *ioc = container_of(timer, struct ioc, timer);
1341 	struct ioc_gq *iocg, *tiocg;
1342 	struct ioc_now now;
1343 	int nr_surpluses = 0, nr_shortages = 0, nr_lagging = 0;
1344 	u32 ppm_rthr = MILLION - ioc->params.qos[QOS_RPPM];
1345 	u32 ppm_wthr = MILLION - ioc->params.qos[QOS_WPPM];
1346 	u32 missed_ppm[2], rq_wait_pct;
1347 	u64 period_vtime;
1348 	int prev_busy_level, i;
1349 
1350 	/* how were the latencies during the period? */
1351 	ioc_lat_stat(ioc, missed_ppm, &rq_wait_pct);
1352 
1353 	/* take care of active iocgs */
1354 	spin_lock_irq(&ioc->lock);
1355 
1356 	ioc_now(ioc, &now);
1357 
1358 	period_vtime = now.vnow - ioc->period_at_vtime;
1359 	if (WARN_ON_ONCE(!period_vtime)) {
1360 		spin_unlock_irq(&ioc->lock);
1361 		return;
1362 	}
1363 
1364 	/*
1365 	 * Waiters determine the sleep durations based on the vrate they
1366 	 * saw at the time of sleep.  If vrate has increased, some waiters
1367 	 * could be sleeping for too long.  Wake up tardy waiters which
1368 	 * should have woken up in the last period and expire idle iocgs.
1369 	 */
1370 	list_for_each_entry_safe(iocg, tiocg, &ioc->active_iocgs, active_list) {
1371 		if (!waitqueue_active(&iocg->waitq) &&
1372 		    !atomic64_read(&iocg->abs_vdebt) && !iocg_is_idle(iocg))
1373 			continue;
1374 
1375 		spin_lock(&iocg->waitq.lock);
1376 
1377 		if (waitqueue_active(&iocg->waitq) ||
1378 		    atomic64_read(&iocg->abs_vdebt)) {
1379 			/* might be oversleeping vtime / hweight changes, kick */
1380 			iocg_kick_waitq(iocg, &now);
1381 			iocg_kick_delay(iocg, &now, 0);
1382 		} else if (iocg_is_idle(iocg)) {
1383 			/* no waiter and idle, deactivate */
1384 			iocg->last_inuse = iocg->inuse;
1385 			__propagate_active_weight(iocg, 0, 0);
1386 			list_del_init(&iocg->active_list);
1387 		}
1388 
1389 		spin_unlock(&iocg->waitq.lock);
1390 	}
1391 	commit_active_weights(ioc);
1392 
1393 	/* calc usages and see whether some weights need to be moved around */
1394 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1395 		u64 vdone, vtime, vusage, vmargin, vmin;
1396 		u32 hw_active, hw_inuse, usage;
1397 
1398 		/*
1399 		 * Collect unused and wind vtime closer to vnow to prevent
1400 		 * iocgs from accumulating a large amount of budget.
1401 		 */
1402 		vdone = atomic64_read(&iocg->done_vtime);
1403 		vtime = atomic64_read(&iocg->vtime);
1404 		current_hweight(iocg, &hw_active, &hw_inuse);
1405 
1406 		/*
1407 		 * Latency QoS detection doesn't account for IOs which are
1408 		 * in-flight for longer than a period.  Detect them by
1409 		 * comparing vdone against period start.  If lagging behind
1410 		 * IOs from past periods, don't increase vrate.
1411 		 */
1412 		if ((ppm_rthr != MILLION || ppm_wthr != MILLION) &&
1413 		    !atomic_read(&iocg_to_blkg(iocg)->use_delay) &&
1414 		    time_after64(vtime, vdone) &&
1415 		    time_after64(vtime, now.vnow -
1416 				 MAX_LAGGING_PERIODS * period_vtime) &&
1417 		    time_before64(vdone, now.vnow - period_vtime))
1418 			nr_lagging++;
1419 
1420 		if (waitqueue_active(&iocg->waitq))
1421 			vusage = now.vnow - iocg->last_vtime;
1422 		else if (time_before64(iocg->last_vtime, vtime))
1423 			vusage = vtime - iocg->last_vtime;
1424 		else
1425 			vusage = 0;
1426 
1427 		iocg->last_vtime += vusage;
1428 		/*
1429 		 * Factor the in-flight vtime into vusage to avoid
1430 		 * high-latency completions appearing as idle.  This should
1431 		 * be done after the above ->last_vtime adjustment.
1432 		 */
1433 		vusage = max(vusage, vtime - vdone);
1434 
1435 		/* calculate hweight based usage ratio and record */
1436 		if (vusage) {
1437 			usage = DIV64_U64_ROUND_UP(vusage * hw_inuse,
1438 						   period_vtime);
1439 			iocg->usage_idx = (iocg->usage_idx + 1) % NR_USAGE_SLOTS;
1440 			iocg->usages[iocg->usage_idx] = usage;
1441 		} else {
1442 			usage = 0;
1443 		}
1444 
1445 		/* see whether there's surplus vtime */
1446 		vmargin = ioc->margin_us * now.vrate;
1447 		vmin = now.vnow - vmargin;
1448 
1449 		iocg->has_surplus = false;
1450 
1451 		if (!waitqueue_active(&iocg->waitq) &&
1452 		    time_before64(vtime, vmin)) {
1453 			u64 delta = vmin - vtime;
1454 
1455 			/* throw away surplus vtime */
1456 			atomic64_add(delta, &iocg->vtime);
1457 			atomic64_add(delta, &iocg->done_vtime);
1458 			iocg->last_vtime += delta;
1459 			/* if usage is sufficiently low, maybe it can donate */
1460 			if (surplus_adjusted_hweight_inuse(usage, hw_inuse)) {
1461 				iocg->has_surplus = true;
1462 				nr_surpluses++;
1463 			}
1464 		} else if (hw_inuse < hw_active) {
1465 			u32 new_hwi, new_inuse;
1466 
1467 			/* was donating but might need to take back some */
1468 			if (waitqueue_active(&iocg->waitq)) {
1469 				new_hwi = hw_active;
1470 			} else {
1471 				new_hwi = max(hw_inuse,
1472 					      usage * SURPLUS_SCALE_PCT / 100 +
1473 					      SURPLUS_SCALE_ABS);
1474 			}
1475 
1476 			new_inuse = div64_u64((u64)iocg->inuse * new_hwi,
1477 					      hw_inuse);
1478 			new_inuse = clamp_t(u32, new_inuse, 1, iocg->active);
1479 
1480 			if (new_inuse > iocg->inuse) {
1481 				TRACE_IOCG_PATH(inuse_takeback, iocg, &now,
1482 						iocg->inuse, new_inuse,
1483 						hw_inuse, new_hwi);
1484 				__propagate_active_weight(iocg, iocg->weight,
1485 							  new_inuse);
1486 			}
1487 		} else {
1488 			/* genuinely out of vtime */
1489 			nr_shortages++;
1490 		}
1491 	}
1492 
1493 	if (!nr_shortages || !nr_surpluses)
1494 		goto skip_surplus_transfers;
1495 
1496 	/* there are both shortages and surpluses, transfer surpluses */
1497 	list_for_each_entry(iocg, &ioc->active_iocgs, active_list) {
1498 		u32 usage, hw_active, hw_inuse, new_hwi, new_inuse;
1499 		int nr_valid = 0;
1500 
1501 		if (!iocg->has_surplus)
1502 			continue;
1503 
1504 		/* base the decision on max historical usage */
1505 		for (i = 0, usage = 0; i < NR_USAGE_SLOTS; i++) {
1506 			if (iocg->usages[i]) {
1507 				usage = max(usage, iocg->usages[i]);
1508 				nr_valid++;
1509 			}
1510 		}
1511 		if (nr_valid < MIN_VALID_USAGES)
1512 			continue;
1513 
1514 		current_hweight(iocg, &hw_active, &hw_inuse);
1515 		new_hwi = surplus_adjusted_hweight_inuse(usage, hw_inuse);
1516 		if (!new_hwi)
1517 			continue;
1518 
1519 		new_inuse = DIV64_U64_ROUND_UP((u64)iocg->inuse * new_hwi,
1520 					       hw_inuse);
1521 		if (new_inuse < iocg->inuse) {
1522 			TRACE_IOCG_PATH(inuse_giveaway, iocg, &now,
1523 					iocg->inuse, new_inuse,
1524 					hw_inuse, new_hwi);
1525 			__propagate_active_weight(iocg, iocg->weight, new_inuse);
1526 		}
1527 	}
1528 skip_surplus_transfers:
1529 	commit_active_weights(ioc);
1530 
1531 	/*
1532 	 * If q is getting clogged or we're missing too much, we're issuing
1533 	 * too much IO and should lower vtime rate.  If we're not missing
1534 	 * and experiencing shortages but not surpluses, we're too stingy
1535 	 * and should increase vtime rate.
1536 	 */
1537 	prev_busy_level = ioc->busy_level;
1538 	if (rq_wait_pct > RQ_WAIT_BUSY_PCT ||
1539 	    missed_ppm[READ] > ppm_rthr ||
1540 	    missed_ppm[WRITE] > ppm_wthr) {
1541 		ioc->busy_level = max(ioc->busy_level, 0);
1542 		ioc->busy_level++;
1543 	} else if (rq_wait_pct <= RQ_WAIT_BUSY_PCT * UNBUSY_THR_PCT / 100 &&
1544 		   missed_ppm[READ] <= ppm_rthr * UNBUSY_THR_PCT / 100 &&
1545 		   missed_ppm[WRITE] <= ppm_wthr * UNBUSY_THR_PCT / 100) {
1546 		/* take action iff there is contention */
1547 		if (nr_shortages && !nr_lagging) {
1548 			ioc->busy_level = min(ioc->busy_level, 0);
1549 			/* redistribute surpluses first */
1550 			if (!nr_surpluses)
1551 				ioc->busy_level--;
1552 		}
1553 	} else {
1554 		ioc->busy_level = 0;
1555 	}
1556 
1557 	ioc->busy_level = clamp(ioc->busy_level, -1000, 1000);
1558 
1559 	if (ioc->busy_level > 0 || (ioc->busy_level < 0 && !nr_lagging)) {
1560 		u64 vrate = atomic64_read(&ioc->vtime_rate);
1561 		u64 vrate_min = ioc->vrate_min, vrate_max = ioc->vrate_max;
1562 
1563 		/* rq_wait signal is always reliable, ignore user vrate_min */
1564 		if (rq_wait_pct > RQ_WAIT_BUSY_PCT)
1565 			vrate_min = VRATE_MIN;
1566 
1567 		/*
1568 		 * If vrate is out of bounds, apply clamp gradually as the
1569 		 * bounds can change abruptly.  Otherwise, apply busy_level
1570 		 * based adjustment.
1571 		 */
1572 		if (vrate < vrate_min) {
1573 			vrate = div64_u64(vrate * (100 + VRATE_CLAMP_ADJ_PCT),
1574 					  100);
1575 			vrate = min(vrate, vrate_min);
1576 		} else if (vrate > vrate_max) {
1577 			vrate = div64_u64(vrate * (100 - VRATE_CLAMP_ADJ_PCT),
1578 					  100);
1579 			vrate = max(vrate, vrate_max);
1580 		} else {
1581 			int idx = min_t(int, abs(ioc->busy_level),
1582 					ARRAY_SIZE(vrate_adj_pct) - 1);
1583 			u32 adj_pct = vrate_adj_pct[idx];
1584 
1585 			if (ioc->busy_level > 0)
1586 				adj_pct = 100 - adj_pct;
1587 			else
1588 				adj_pct = 100 + adj_pct;
1589 
1590 			vrate = clamp(DIV64_U64_ROUND_UP(vrate * adj_pct, 100),
1591 				      vrate_min, vrate_max);
1592 		}
1593 
1594 		trace_iocost_ioc_vrate_adj(ioc, vrate, missed_ppm, rq_wait_pct,
1595 					   nr_lagging, nr_shortages,
1596 					   nr_surpluses);
1597 
1598 		atomic64_set(&ioc->vtime_rate, vrate);
1599 		ioc->inuse_margin_vtime = DIV64_U64_ROUND_UP(
1600 			ioc->period_us * vrate * INUSE_MARGIN_PCT, 100);
1601 	} else if (ioc->busy_level != prev_busy_level || nr_lagging) {
1602 		trace_iocost_ioc_vrate_adj(ioc, atomic64_read(&ioc->vtime_rate),
1603 					   missed_ppm, rq_wait_pct, nr_lagging,
1604 					   nr_shortages, nr_surpluses);
1605 	}
1606 
1607 	ioc_refresh_params(ioc, false);
1608 
1609 	/*
1610 	 * This period is done.  Move onto the next one.  If nothing's
1611 	 * going on with the device, stop the timer.
1612 	 */
1613 	atomic64_inc(&ioc->cur_period);
1614 
1615 	if (ioc->running != IOC_STOP) {
1616 		if (!list_empty(&ioc->active_iocgs)) {
1617 			ioc_start_period(ioc, &now);
1618 		} else {
1619 			ioc->busy_level = 0;
1620 			ioc->running = IOC_IDLE;
1621 		}
1622 	}
1623 
1624 	spin_unlock_irq(&ioc->lock);
1625 }
1626 
1627 static void calc_vtime_cost_builtin(struct bio *bio, struct ioc_gq *iocg,
1628 				    bool is_merge, u64 *costp)
1629 {
1630 	struct ioc *ioc = iocg->ioc;
1631 	u64 coef_seqio, coef_randio, coef_page;
1632 	u64 pages = max_t(u64, bio_sectors(bio) >> IOC_SECT_TO_PAGE_SHIFT, 1);
1633 	u64 seek_pages = 0;
1634 	u64 cost = 0;
1635 
1636 	switch (bio_op(bio)) {
1637 	case REQ_OP_READ:
1638 		coef_seqio	= ioc->params.lcoefs[LCOEF_RSEQIO];
1639 		coef_randio	= ioc->params.lcoefs[LCOEF_RRANDIO];
1640 		coef_page	= ioc->params.lcoefs[LCOEF_RPAGE];
1641 		break;
1642 	case REQ_OP_WRITE:
1643 		coef_seqio	= ioc->params.lcoefs[LCOEF_WSEQIO];
1644 		coef_randio	= ioc->params.lcoefs[LCOEF_WRANDIO];
1645 		coef_page	= ioc->params.lcoefs[LCOEF_WPAGE];
1646 		break;
1647 	default:
1648 		goto out;
1649 	}
1650 
1651 	if (iocg->cursor) {
1652 		seek_pages = abs(bio->bi_iter.bi_sector - iocg->cursor);
1653 		seek_pages >>= IOC_SECT_TO_PAGE_SHIFT;
1654 	}
1655 
1656 	if (!is_merge) {
1657 		if (seek_pages > LCOEF_RANDIO_PAGES) {
1658 			cost += coef_randio;
1659 		} else {
1660 			cost += coef_seqio;
1661 		}
1662 	}
1663 	cost += pages * coef_page;
1664 out:
1665 	*costp = cost;
1666 }
1667 
1668 static u64 calc_vtime_cost(struct bio *bio, struct ioc_gq *iocg, bool is_merge)
1669 {
1670 	u64 cost;
1671 
1672 	calc_vtime_cost_builtin(bio, iocg, is_merge, &cost);
1673 	return cost;
1674 }
1675 
1676 static void ioc_rqos_throttle(struct rq_qos *rqos, struct bio *bio)
1677 {
1678 	struct blkcg_gq *blkg = bio->bi_blkg;
1679 	struct ioc *ioc = rqos_to_ioc(rqos);
1680 	struct ioc_gq *iocg = blkg_to_iocg(blkg);
1681 	struct ioc_now now;
1682 	struct iocg_wait wait;
1683 	u32 hw_active, hw_inuse;
1684 	u64 abs_cost, cost, vtime;
1685 
1686 	/* bypass IOs if disabled or for root cgroup */
1687 	if (!ioc->enabled || !iocg->level)
1688 		return;
1689 
1690 	/* always activate so that even 0 cost IOs get protected to some level */
1691 	if (!iocg_activate(iocg, &now))
1692 		return;
1693 
1694 	/* calculate the absolute vtime cost */
1695 	abs_cost = calc_vtime_cost(bio, iocg, false);
1696 	if (!abs_cost)
1697 		return;
1698 
1699 	iocg->cursor = bio_end_sector(bio);
1700 
1701 	vtime = atomic64_read(&iocg->vtime);
1702 	current_hweight(iocg, &hw_active, &hw_inuse);
1703 
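	/*
	 * This iocg has been donating part of its weight (inuse < active)
	 * but now has IOs to issue within the margin.  Reset inuse to the
	 * full weight so it can reclaim its fair share of the device.
	 */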
1704 	if (hw_inuse < hw_active &&
1705 	    time_after_eq64(vtime + ioc->inuse_margin_vtime, now.vnow)) {
1706 		TRACE_IOCG_PATH(inuse_reset, iocg, &now,
1707 				iocg->inuse, iocg->weight, hw_inuse, hw_active);
1708 		spin_lock_irq(&ioc->lock);
1709 		propagate_active_weight(iocg, iocg->weight, iocg->weight);
1710 		spin_unlock_irq(&ioc->lock);
1711 		current_hweight(iocg, &hw_active, &hw_inuse);
1712 	}
1713 
1714 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1715 
1716 	/*
1717 	 * If no one's waiting and within budget, issue right away.  The
1718 	 * tests are racy but the races aren't systemic - we only miss once
1719 	 * in a while which is fine.
1720 	 */
1721 	if (!waitqueue_active(&iocg->waitq) &&
1722 	    !atomic64_read(&iocg->abs_vdebt) &&
1723 	    time_before_eq64(vtime + cost, now.vnow)) {
1724 		iocg_commit_bio(iocg, bio, cost);
1725 		return;
1726 	}
1727 
1728 	/*
1729 	 * We're over budget.  If @bio has to be issued regardless,
1730 	 * remember the abs_cost instead of advancing vtime.
1731 	 * iocg_kick_waitq() will pay off the debt before waking more IOs.
1732 	 * This way, the debt is continuously paid off each period with the
1733 	 * actual budget available to the cgroup.  If we just wound vtime,
1734 	 * we would incorrectly use the current hw_inuse for the entire
1735 	 * amount which, for example, can lead to the cgroup staying
1736 	 * blocked for a long time even with substantially raised hw_inuse.
1737 	 */
1738 	if (bio_issue_as_root_blkg(bio) || fatal_signal_pending(current)) {
1739 		atomic64_add(abs_cost, &iocg->abs_vdebt);
1740 		if (iocg_kick_delay(iocg, &now, cost))
1741 			blkcg_schedule_throttle(rqos->q,
1742 					(bio->bi_opf & REQ_SWAP) == REQ_SWAP);
1743 		return;
1744 	}
1745 
1746 	/*
1747 	 * Append self to the waitq and schedule the wakeup timer if we're
1748 	 * the first waiter.  The timer duration is calculated based on the
1749 	 * current vrate.  vtime and hweight changes can make it too short
1750 	 * or too long.  Each wait entry records the absolute cost it's
1751 	 * waiting for, allowing re-evaluation using a custom wait entry.
1752 	 *
1753 	 * If too short, the timer simply reschedules itself.  If too long,
1754 	 * the period timer will notice and trigger wakeups.
1755 	 *
1756 	 * All waiters are on iocg->waitq and the wait states are
1757 	 * synchronized using waitq.lock.
1758 	 */
1759 	spin_lock_irq(&iocg->waitq.lock);
1760 
1761 	/*
1762 	 * We activated above but w/o any synchronization.  Deactivation is
1763 	 * synchronized with waitq.lock and we won't get deactivated as
1764 	 * long as we're waiting, so we're good if we're activated here.
1765 	 * In the unlikely case that we are deactivated, just issue the IO.
1766 	 */
1767 	if (unlikely(list_empty(&iocg->active_list))) {
1768 		spin_unlock_irq(&iocg->waitq.lock);
1769 		iocg_commit_bio(iocg, bio, cost);
1770 		return;
1771 	}
1772 
1773 	init_waitqueue_func_entry(&wait.wait, iocg_wake_fn);
1774 	wait.wait.private = current;
1775 	wait.bio = bio;
1776 	wait.abs_cost = abs_cost;
1777 	wait.committed = false;	/* will be set true by waker */
1778 
1779 	__add_wait_queue_entry_tail(&iocg->waitq, &wait.wait);
1780 	iocg_kick_waitq(iocg, &now);
1781 
1782 	spin_unlock_irq(&iocg->waitq.lock);
1783 
1784 	while (true) {
1785 		set_current_state(TASK_UNINTERRUPTIBLE);
1786 		if (wait.committed)
1787 			break;
1788 		io_schedule();
1789 	}
1790 
1791 	/* waker already committed us, proceed */
1792 	finish_wait(&iocg->waitq, &wait.wait);
1793 }
1794 
1795 static void ioc_rqos_merge(struct rq_qos *rqos, struct request *rq,
1796 			   struct bio *bio)
1797 {
1798 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1799 	struct ioc *ioc = iocg->ioc;
1800 	sector_t bio_end = bio_end_sector(bio);
1801 	struct ioc_now now;
1802 	u32 hw_inuse;
1803 	u64 abs_cost, cost;
1804 
1805 	/* bypass if disabled or for root cgroup */
1806 	if (!ioc->enabled || !iocg->level)
1807 		return;
1808 
1809 	abs_cost = calc_vtime_cost(bio, iocg, true);
1810 	if (!abs_cost)
1811 		return;
1812 
1813 	ioc_now(ioc, &now);
1814 	current_hweight(iocg, NULL, &hw_inuse);
1815 	cost = abs_cost_to_cost(abs_cost, hw_inuse);
1816 
1817 	/* update cursor if backmerging into the request at the cursor */
1818 	if (blk_rq_pos(rq) < bio_end &&
1819 	    blk_rq_pos(rq) + blk_rq_sectors(rq) == iocg->cursor)
1820 		iocg->cursor = bio_end;
1821 
1822 	/*
1823 	 * Charge if there's enough vtime budget and the existing request
1824 	 * has cost assigned.  Otherwise, account it as debt.  See debt
1825 	 * handling in ioc_rqos_throttle() for details.
1826 	 */
1827 	if (rq->bio && rq->bio->bi_iocost_cost &&
1828 	    time_before_eq64(atomic64_read(&iocg->vtime) + cost, now.vnow))
1829 		iocg_commit_bio(iocg, bio, cost);
1830 	else
1831 		atomic64_add(abs_cost, &iocg->abs_vdebt);
1832 }
1833 
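/*
 * Bio completion: credit the cost that was charged for this bio to
 * done_vtime so the controller can tell how much of the charged vtime has
 * actually finished on the device.
 */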
1834 static void ioc_rqos_done_bio(struct rq_qos *rqos, struct bio *bio)
1835 {
1836 	struct ioc_gq *iocg = blkg_to_iocg(bio->bi_blkg);
1837 
1838 	if (iocg && bio->bi_iocost_cost)
1839 		atomic64_add(bio->bi_iocost_cost, &iocg->done_vtime);
1840 }
1841 
1842 static void ioc_rqos_done(struct rq_qos *rqos, struct request *rq)
1843 {
1844 	struct ioc *ioc = rqos_to_ioc(rqos);
1845 	u64 on_q_ns, rq_wait_ns;
1846 	int pidx, rw;
1847 
1848 	if (!ioc->enabled || !rq->alloc_time_ns || !rq->start_time_ns)
1849 		return;
1850 
1851 	switch (req_op(rq) & REQ_OP_MASK) {
1852 	case REQ_OP_READ:
1853 		pidx = QOS_RLAT;
1854 		rw = READ;
1855 		break;
1856 	case REQ_OP_WRITE:
1857 		pidx = QOS_WLAT;
1858 		rw = WRITE;
1859 		break;
1860 	default:
1861 		return;
1862 	}
1863 
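	/*
	 * on_q_ns: allocation-to-completion latency, checked against the
	 * read/write QoS latency target.  rq_wait_ns: how long the issuer
	 * waited for a request to become available.
	 */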
1864 	on_q_ns = ktime_get_ns() - rq->alloc_time_ns;
1865 	rq_wait_ns = rq->start_time_ns - rq->alloc_time_ns;
1866 
1867 	if (on_q_ns <= ioc->params.qos[pidx] * NSEC_PER_USEC)
1868 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_met);
1869 	else
1870 		this_cpu_inc(ioc->pcpu_stat->missed[rw].nr_missed);
1871 
1872 	this_cpu_add(ioc->pcpu_stat->rq_wait_ns, rq_wait_ns);
1873 }
1874 
1875 static void ioc_rqos_queue_depth_changed(struct rq_qos *rqos)
1876 {
1877 	struct ioc *ioc = rqos_to_ioc(rqos);
1878 
1879 	spin_lock_irq(&ioc->lock);
1880 	ioc_refresh_params(ioc, false);
1881 	spin_unlock_irq(&ioc->lock);
1882 }
1883 
1884 static void ioc_rqos_exit(struct rq_qos *rqos)
1885 {
1886 	struct ioc *ioc = rqos_to_ioc(rqos);
1887 
1888 	blkcg_deactivate_policy(rqos->q, &blkcg_policy_iocost);
1889 
1890 	spin_lock_irq(&ioc->lock);
1891 	ioc->running = IOC_STOP;
1892 	spin_unlock_irq(&ioc->lock);
1893 
1894 	del_timer_sync(&ioc->timer);
1895 	free_percpu(ioc->pcpu_stat);
1896 	kfree(ioc);
1897 }
1898 
1899 static struct rq_qos_ops ioc_rqos_ops = {
1900 	.throttle = ioc_rqos_throttle,
1901 	.merge = ioc_rqos_merge,
1902 	.done_bio = ioc_rqos_done_bio,
1903 	.done = ioc_rqos_done,
1904 	.queue_depth_changed = ioc_rqos_queue_depth_changed,
1905 	.exit = ioc_rqos_exit,
1906 };
1907 
1908 static int blk_iocost_init(struct request_queue *q)
1909 {
1910 	struct ioc *ioc;
1911 	struct rq_qos *rqos;
1912 	int ret;
1913 
1914 	ioc = kzalloc(sizeof(*ioc), GFP_KERNEL);
1915 	if (!ioc)
1916 		return -ENOMEM;
1917 
1918 	ioc->pcpu_stat = alloc_percpu(struct ioc_pcpu_stat);
1919 	if (!ioc->pcpu_stat) {
1920 		kfree(ioc);
1921 		return -ENOMEM;
1922 	}
1923 
1924 	rqos = &ioc->rqos;
1925 	rqos->id = RQ_QOS_COST;
1926 	rqos->ops = &ioc_rqos_ops;
1927 	rqos->q = q;
1928 
1929 	spin_lock_init(&ioc->lock);
1930 	timer_setup(&ioc->timer, ioc_timer_fn, 0);
1931 	INIT_LIST_HEAD(&ioc->active_iocgs);
1932 
1933 	ioc->running = IOC_IDLE;
1934 	atomic64_set(&ioc->vtime_rate, VTIME_PER_USEC);
1935 	seqcount_init(&ioc->period_seqcount);
1936 	ioc->period_at = ktime_to_us(ktime_get());
1937 	atomic64_set(&ioc->cur_period, 0);
1938 	atomic_set(&ioc->hweight_gen, 0);
1939 
1940 	spin_lock_irq(&ioc->lock);
1941 	ioc->autop_idx = AUTOP_INVALID;
1942 	ioc_refresh_params(ioc, true);
1943 	spin_unlock_irq(&ioc->lock);
1944 
1945 	rq_qos_add(q, rqos);
1946 	ret = blkcg_activate_policy(q, &blkcg_policy_iocost);
1947 	if (ret) {
1948 		rq_qos_del(q, rqos);
1949 		free_percpu(ioc->pcpu_stat);
1950 		kfree(ioc);
1951 		return ret;
1952 	}
1953 	return 0;
1954 }
1955 
1956 static struct blkcg_policy_data *ioc_cpd_alloc(gfp_t gfp)
1957 {
1958 	struct ioc_cgrp *iocc;
1959 
1960 	iocc = kzalloc(sizeof(struct ioc_cgrp), gfp);
1961 	if (!iocc)
1962 		return NULL;
1963 
1964 	iocc->dfl_weight = CGROUP_WEIGHT_DFL;
1965 	return &iocc->cpd;
1966 }
1967 
1968 static void ioc_cpd_free(struct blkcg_policy_data *cpd)
1969 {
1970 	kfree(container_of(cpd, struct ioc_cgrp, cpd));
1971 }
1972 
1973 static struct blkg_policy_data *ioc_pd_alloc(gfp_t gfp, struct request_queue *q,
1974 					     struct blkcg *blkcg)
1975 {
1976 	int levels = blkcg->css.cgroup->level + 1;
1977 	struct ioc_gq *iocg;
1978 
1979 	iocg = kzalloc_node(sizeof(*iocg) + levels * sizeof(iocg->ancestors[0]),
1980 			    gfp, q->node);
1981 	if (!iocg)
1982 		return NULL;
1983 
1984 	return &iocg->pd;
1985 }
1986 
1987 static void ioc_pd_init(struct blkg_policy_data *pd)
1988 {
1989 	struct ioc_gq *iocg = pd_to_iocg(pd);
1990 	struct blkcg_gq *blkg = pd_to_blkg(&iocg->pd);
1991 	struct ioc *ioc = q_to_ioc(blkg->q);
1992 	struct ioc_now now;
1993 	struct blkcg_gq *tblkg;
1994 	unsigned long flags;
1995 
1996 	ioc_now(ioc, &now);
1997 
1998 	iocg->ioc = ioc;
1999 	atomic64_set(&iocg->vtime, now.vnow);
2000 	atomic64_set(&iocg->done_vtime, now.vnow);
2001 	atomic64_set(&iocg->abs_vdebt, 0);
2002 	atomic64_set(&iocg->active_period, atomic64_read(&ioc->cur_period));
2003 	INIT_LIST_HEAD(&iocg->active_list);
2004 	iocg->hweight_active = HWEIGHT_WHOLE;
2005 	iocg->hweight_inuse = HWEIGHT_WHOLE;
2006 
2007 	init_waitqueue_head(&iocg->waitq);
2008 	hrtimer_init(&iocg->waitq_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2009 	iocg->waitq_timer.function = iocg_waitq_timer_fn;
2010 	hrtimer_init(&iocg->delay_timer, CLOCK_MONOTONIC, HRTIMER_MODE_ABS);
2011 	iocg->delay_timer.function = iocg_delay_timer_fn;
2012 
2013 	iocg->level = blkg->blkcg->css.cgroup->level;
2014 
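	/* record self and all ancestors indexed by level for hweight walks */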
2015 	for (tblkg = blkg; tblkg; tblkg = tblkg->parent) {
2016 		struct ioc_gq *tiocg = blkg_to_iocg(tblkg);
2017 		iocg->ancestors[tiocg->level] = tiocg;
2018 	}
2019 
2020 	spin_lock_irqsave(&ioc->lock, flags);
2021 	weight_updated(iocg);
2022 	spin_unlock_irqrestore(&ioc->lock, flags);
2023 }
2024 
2025 static void ioc_pd_free(struct blkg_policy_data *pd)
2026 {
2027 	struct ioc_gq *iocg = pd_to_iocg(pd);
2028 	struct ioc *ioc = iocg->ioc;
	unsigned long flags;
2029 
2030 	if (ioc) {
		/*
		 * ioc->lock is also acquired from the period timer which
		 * runs in softirq context; take it with irqs disabled so
		 * the timer can't deadlock against us on this CPU.
		 */
2031 		spin_lock_irqsave(&ioc->lock, flags);
2032 		if (!list_empty(&iocg->active_list)) {
2033 			propagate_active_weight(iocg, 0, 0);
2034 			list_del_init(&iocg->active_list);
2035 		}
2036 		spin_unlock_irqrestore(&ioc->lock, flags);
2037 
2038 		hrtimer_cancel(&iocg->waitq_timer);
2039 		hrtimer_cancel(&iocg->delay_timer);
2040 	}
2041 	kfree(iocg);
2042 }
2043 
2044 static u64 ioc_weight_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2045 			     int off)
2046 {
2047 	const char *dname = blkg_dev_name(pd->blkg);
2048 	struct ioc_gq *iocg = pd_to_iocg(pd);
2049 
2050 	if (dname && iocg->cfg_weight)
2051 		seq_printf(sf, "%s %u\n", dname, iocg->cfg_weight);
2052 	return 0;
2053 }
2054 
2056 static int ioc_weight_show(struct seq_file *sf, void *v)
2057 {
2058 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2059 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2060 
2061 	seq_printf(sf, "default %u\n", iocc->dfl_weight);
2062 	blkcg_print_blkgs(sf, blkcg, ioc_weight_prfill,
2063 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2064 	return 0;
2065 }
2066 
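/*
 * "default $WEIGHT" (or just "$WEIGHT") sets the cgroup-wide default while
 * "MAJ:MIN $WEIGHT" sets a per-device weight and "MAJ:MIN default" clears
 * it.  For example (device numbers illustrative):
 *
 *	echo "default 200" > io.weight
 *	echo "8:16 400" > io.weight
 */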
2067 static ssize_t ioc_weight_write(struct kernfs_open_file *of, char *buf,
2068 				size_t nbytes, loff_t off)
2069 {
2070 	struct blkcg *blkcg = css_to_blkcg(of_css(of));
2071 	struct ioc_cgrp *iocc = blkcg_to_iocc(blkcg);
2072 	struct blkg_conf_ctx ctx;
2073 	struct ioc_gq *iocg;
2074 	u32 v;
2075 	int ret;
2076 
2077 	if (!strchr(buf, ':')) {
2078 		struct blkcg_gq *blkg;
2079 
2080 		if (!sscanf(buf, "default %u", &v) && !sscanf(buf, "%u", &v))
2081 			return -EINVAL;
2082 
2083 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2084 			return -EINVAL;
2085 
2086 		spin_lock(&blkcg->lock);
2087 		iocc->dfl_weight = v;
2088 		hlist_for_each_entry(blkg, &blkcg->blkg_list, blkcg_node) {
2089 			struct ioc_gq *iocg = blkg_to_iocg(blkg);
2090 
2091 			if (iocg) {
2092 				spin_lock_irq(&iocg->ioc->lock);
2093 				weight_updated(iocg);
2094 				spin_unlock_irq(&iocg->ioc->lock);
2095 			}
2096 		}
2097 		spin_unlock(&blkcg->lock);
2098 
2099 		return nbytes;
2100 	}
2101 
2102 	ret = blkg_conf_prep(blkcg, &blkcg_policy_iocost, buf, &ctx);
2103 	if (ret)
2104 		return ret;
2105 
2106 	iocg = blkg_to_iocg(ctx.blkg);
2107 
2108 	if (!strncmp(ctx.body, "default", 7)) {
2109 		v = 0;
2110 	} else {
2111 		if (!sscanf(ctx.body, "%u", &v))
2112 			goto einval;
2113 		if (v < CGROUP_WEIGHT_MIN || v > CGROUP_WEIGHT_MAX)
2114 			goto einval;
2115 	}
2116 
2117 	spin_lock(&iocg->ioc->lock);
2118 	iocg->cfg_weight = v;
2119 	weight_updated(iocg);
2120 	spin_unlock(&iocg->ioc->lock);
2121 
2122 	blkg_conf_finish(&ctx);
2123 	return nbytes;
2124 
2125 einval:
2126 	blkg_conf_finish(&ctx);
2127 	return -EINVAL;
2128 }
2129 
2130 static u64 ioc_qos_prfill(struct seq_file *sf, struct blkg_policy_data *pd,
2131 			  int off)
2132 {
2133 	const char *dname = blkg_dev_name(pd->blkg);
2134 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2135 
2136 	if (!dname)
2137 		return 0;
2138 
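	/* the pct parameters are stored as parts-per-million; print x.xx% */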
2139 	seq_printf(sf, "%s enable=%d ctrl=%s rpct=%u.%02u rlat=%u wpct=%u.%02u wlat=%u min=%u.%02u max=%u.%02u\n",
2140 		   dname, ioc->enabled, ioc->user_qos_params ? "user" : "auto",
2141 		   ioc->params.qos[QOS_RPPM] / 10000,
2142 		   ioc->params.qos[QOS_RPPM] % 10000 / 100,
2143 		   ioc->params.qos[QOS_RLAT],
2144 		   ioc->params.qos[QOS_WPPM] / 10000,
2145 		   ioc->params.qos[QOS_WPPM] % 10000 / 100,
2146 		   ioc->params.qos[QOS_WLAT],
2147 		   ioc->params.qos[QOS_MIN] / 10000,
2148 		   ioc->params.qos[QOS_MIN] % 10000 / 100,
2149 		   ioc->params.qos[QOS_MAX] / 10000,
2150 		   ioc->params.qos[QOS_MAX] % 10000 / 100);
2151 	return 0;
2152 }
2153 
2154 static int ioc_qos_show(struct seq_file *sf, void *v)
2155 {
2156 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2157 
2158 	blkcg_print_blkgs(sf, blkcg, ioc_qos_prfill,
2159 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2160 	return 0;
2161 }
2162 
2163 static const match_table_t qos_ctrl_tokens = {
2164 	{ QOS_ENABLE,		"enable=%u"	},
2165 	{ QOS_CTRL,		"ctrl=%s"	},
2166 	{ NR_QOS_CTRL_PARAMS,	NULL		},
2167 };
2168 
2169 static const match_table_t qos_tokens = {
2170 	{ QOS_RPPM,		"rpct=%s"	},
2171 	{ QOS_RLAT,		"rlat=%u"	},
2172 	{ QOS_WPPM,		"wpct=%s"	},
2173 	{ QOS_WLAT,		"wlat=%u"	},
2174 	{ QOS_MIN,		"min=%s"	},
2175 	{ QOS_MAX,		"max=%s"	},
2176 	{ NR_QOS_PARAMS,	NULL		},
2177 };
2178 
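/*
 * Accepts "MAJ:MIN" followed by any of the key=value tokens above.  For
 * example (all values illustrative):
 *
 *	echo "8:16 enable=1 ctrl=user rpct=95.00 rlat=10000 wpct=95.00 \
 *	      wlat=20000 min=50.00 max=150.00" > io.cost.qos
 */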
2179 static ssize_t ioc_qos_write(struct kernfs_open_file *of, char *input,
2180 			     size_t nbytes, loff_t off)
2181 {
2182 	struct gendisk *disk;
2183 	struct ioc *ioc;
2184 	u32 qos[NR_QOS_PARAMS];
2185 	bool enable, user;
2186 	char *p;
2187 	int ret;
2188 
2189 	disk = blkcg_conf_get_disk(&input);
2190 	if (IS_ERR(disk))
2191 		return PTR_ERR(disk);
2192 
2193 	ioc = q_to_ioc(disk->queue);
2194 	if (!ioc) {
2195 		ret = blk_iocost_init(disk->queue);
2196 		if (ret)
2197 			goto err;
2198 		ioc = q_to_ioc(disk->queue);
2199 	}
2200 
2201 	spin_lock_irq(&ioc->lock);
2202 	memcpy(qos, ioc->params.qos, sizeof(qos));
2203 	enable = ioc->enabled;
2204 	user = ioc->user_qos_params;
2205 	spin_unlock_irq(&ioc->lock);
2206 
2207 	while ((p = strsep(&input, " \t\n"))) {
2208 		substring_t args[MAX_OPT_ARGS];
2209 		char buf[32];
2210 		int tok;
2211 		s64 v;
2212 
2213 		if (!*p)
2214 			continue;
2215 
2216 		switch (match_token(p, qos_ctrl_tokens, args)) {
2217 		case QOS_ENABLE:
2218 			match_u64(&args[0], &v);
2219 			enable = v;
2220 			continue;
2221 		case QOS_CTRL:
2222 			match_strlcpy(buf, &args[0], sizeof(buf));
2223 			if (!strcmp(buf, "auto"))
2224 				user = false;
2225 			else if (!strcmp(buf, "user"))
2226 				user = true;
2227 			else
2228 				goto einval;
2229 			continue;
2230 		}
2231 
2232 		tok = match_token(p, qos_tokens, args);
2233 		switch (tok) {
2234 		case QOS_RPPM:
2235 		case QOS_WPPM:
2236 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2237 			    sizeof(buf))
2238 				goto einval;
2239 			if (cgroup_parse_float(buf, 2, &v))
2240 				goto einval;
2241 			if (v < 0 || v > 10000)
2242 				goto einval;
2243 			qos[tok] = v * 100;
2244 			break;
2245 		case QOS_RLAT:
2246 		case QOS_WLAT:
2247 			if (match_u64(&args[0], &v))
2248 				goto einval;
2249 			qos[tok] = v;
2250 			break;
2251 		case QOS_MIN:
2252 		case QOS_MAX:
2253 			if (match_strlcpy(buf, &args[0], sizeof(buf)) >=
2254 			    sizeof(buf))
2255 				goto einval;
2256 			if (cgroup_parse_float(buf, 2, &v))
2257 				goto einval;
2258 			if (v < 0)
2259 				goto einval;
2260 			qos[tok] = clamp_t(s64, v * 100,
2261 					   VRATE_MIN_PPM, VRATE_MAX_PPM);
2262 			break;
2263 		default:
2264 			goto einval;
2265 		}
2266 		user = true;
2267 	}
2268 
2269 	if (qos[QOS_MIN] > qos[QOS_MAX])
2270 		goto einval;
2271 
2272 	spin_lock_irq(&ioc->lock);
2273 
2274 	if (enable) {
2275 		blk_queue_flag_set(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2276 		ioc->enabled = true;
2277 	} else {
2278 		blk_queue_flag_clear(QUEUE_FLAG_RQ_ALLOC_TIME, ioc->rqos.q);
2279 		ioc->enabled = false;
2280 	}
2281 
2282 	if (user) {
2283 		memcpy(ioc->params.qos, qos, sizeof(qos));
2284 		ioc->user_qos_params = true;
2285 	} else {
2286 		ioc->user_qos_params = false;
2287 	}
2288 
2289 	ioc_refresh_params(ioc, true);
2290 	spin_unlock_irq(&ioc->lock);
2291 
2292 	put_disk_and_module(disk);
2293 	return nbytes;
2294 einval:
2295 	ret = -EINVAL;
2296 err:
2297 	put_disk_and_module(disk);
2298 	return ret;
2299 }
2300 
2301 static u64 ioc_cost_model_prfill(struct seq_file *sf,
2302 				 struct blkg_policy_data *pd, int off)
2303 {
2304 	const char *dname = blkg_dev_name(pd->blkg);
2305 	struct ioc *ioc = pd_to_iocg(pd)->ioc;
2306 	u64 *u = ioc->params.i_lcoefs;
2307 
2308 	if (!dname)
2309 		return 0;
2310 
2311 	seq_printf(sf, "%s ctrl=%s model=linear "
2312 		   "rbps=%llu rseqiops=%llu rrandiops=%llu "
2313 		   "wbps=%llu wseqiops=%llu wrandiops=%llu\n",
2314 		   dname, ioc->user_cost_model ? "user" : "auto",
2315 		   u[I_LCOEF_RBPS], u[I_LCOEF_RSEQIOPS], u[I_LCOEF_RRANDIOPS],
2316 		   u[I_LCOEF_WBPS], u[I_LCOEF_WSEQIOPS], u[I_LCOEF_WRANDIOPS]);
2317 	return 0;
2318 }
2319 
2320 static int ioc_cost_model_show(struct seq_file *sf, void *v)
2321 {
2322 	struct blkcg *blkcg = css_to_blkcg(seq_css(sf));
2323 
2324 	blkcg_print_blkgs(sf, blkcg, ioc_cost_model_prfill,
2325 			  &blkcg_policy_iocost, seq_cft(sf)->private, false);
2326 	return 0;
2327 }
2328 
2329 static const match_table_t cost_ctrl_tokens = {
2330 	{ COST_CTRL,		"ctrl=%s"	},
2331 	{ COST_MODEL,		"model=%s"	},
2332 	{ NR_COST_CTRL_PARAMS,	NULL		},
2333 };
2334 
2335 static const match_table_t i_lcoef_tokens = {
2336 	{ I_LCOEF_RBPS,		"rbps=%u"	},
2337 	{ I_LCOEF_RSEQIOPS,	"rseqiops=%u"	},
2338 	{ I_LCOEF_RRANDIOPS,	"rrandiops=%u"	},
2339 	{ I_LCOEF_WBPS,		"wbps=%u"	},
2340 	{ I_LCOEF_WSEQIOPS,	"wseqiops=%u"	},
2341 	{ I_LCOEF_WRANDIOPS,	"wrandiops=%u"	},
2342 	{ NR_I_LCOEFS,		NULL		},
2343 };
2344 
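/*
 * Accepts "MAJ:MIN" followed by any of the key=value tokens above.  For
 * example (coefficients illustrative):
 *
 *	echo "8:16 ctrl=user model=linear rbps=500000000 rseqiops=50000 \
 *	      rrandiops=8000 wbps=400000000 wseqiops=40000 wrandiops=5000" \
 *	      > io.cost.model
 */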
2345 static ssize_t ioc_cost_model_write(struct kernfs_open_file *of, char *input,
2346 				    size_t nbytes, loff_t off)
2347 {
2348 	struct gendisk *disk;
2349 	struct ioc *ioc;
2350 	u64 u[NR_I_LCOEFS];
2351 	bool user;
2352 	char *p;
2353 	int ret;
2354 
2355 	disk = blkcg_conf_get_disk(&input);
2356 	if (IS_ERR(disk))
2357 		return PTR_ERR(disk);
2358 
2359 	ioc = q_to_ioc(disk->queue);
2360 	if (!ioc) {
2361 		ret = blk_iocost_init(disk->queue);
2362 		if (ret)
2363 			goto err;
2364 		ioc = q_to_ioc(disk->queue);
2365 	}
2366 
2367 	spin_lock_irq(&ioc->lock);
2368 	memcpy(u, ioc->params.i_lcoefs, sizeof(u));
2369 	user = ioc->user_cost_model;
2370 	spin_unlock_irq(&ioc->lock);
2371 
2372 	while ((p = strsep(&input, " \t\n"))) {
2373 		substring_t args[MAX_OPT_ARGS];
2374 		char buf[32];
2375 		int tok;
2376 		u64 v;
2377 
2378 		if (!*p)
2379 			continue;
2380 
2381 		switch (match_token(p, cost_ctrl_tokens, args)) {
2382 		case COST_CTRL:
2383 			match_strlcpy(buf, &args[0], sizeof(buf));
2384 			if (!strcmp(buf, "auto"))
2385 				user = false;
2386 			else if (!strcmp(buf, "user"))
2387 				user = true;
2388 			else
2389 				goto einval;
2390 			continue;
2391 		case COST_MODEL:
2392 			match_strlcpy(buf, &args[0], sizeof(buf));
2393 			if (strcmp(buf, "linear"))
2394 				goto einval;
2395 			continue;
2396 		}
2397 
2398 		tok = match_token(p, i_lcoef_tokens, args);
2399 		if (tok == NR_I_LCOEFS)
2400 			goto einval;
2401 		if (match_u64(&args[0], &v))
2402 			goto einval;
2403 		u[tok] = v;
2404 		user = true;
2405 	}
2406 
2407 	spin_lock_irq(&ioc->lock);
2408 	if (user) {
2409 		memcpy(ioc->params.i_lcoefs, u, sizeof(u));
2410 		ioc->user_cost_model = true;
2411 	} else {
2412 		ioc->user_cost_model = false;
2413 	}
2414 	ioc_refresh_params(ioc, true);
2415 	spin_unlock_irq(&ioc->lock);
2416 
2417 	put_disk_and_module(disk);
2418 	return nbytes;
2419 
2420 einval:
2421 	ret = -EINVAL;
2422 err:
2423 	put_disk_and_module(disk);
2424 	return ret;
2425 }
2426 
2427 static struct cftype ioc_files[] = {
2428 	{
2429 		.name = "weight",
2430 		.flags = CFTYPE_NOT_ON_ROOT,
2431 		.seq_show = ioc_weight_show,
2432 		.write = ioc_weight_write,
2433 	},
2434 	{
2435 		.name = "cost.qos",
2436 		.flags = CFTYPE_ONLY_ON_ROOT,
2437 		.seq_show = ioc_qos_show,
2438 		.write = ioc_qos_write,
2439 	},
2440 	{
2441 		.name = "cost.model",
2442 		.flags = CFTYPE_ONLY_ON_ROOT,
2443 		.seq_show = ioc_cost_model_show,
2444 		.write = ioc_cost_model_write,
2445 	},
2446 	{}
2447 };
2448 
2449 static struct blkcg_policy blkcg_policy_iocost = {
2450 	.dfl_cftypes	= ioc_files,
2451 	.cpd_alloc_fn	= ioc_cpd_alloc,
2452 	.cpd_free_fn	= ioc_cpd_free,
2453 	.pd_alloc_fn	= ioc_pd_alloc,
2454 	.pd_init_fn	= ioc_pd_init,
2455 	.pd_free_fn	= ioc_pd_free,
2456 };
2457 
2458 static int __init ioc_init(void)
2459 {
2460 	return blkcg_policy_register(&blkcg_policy_iocost);
2461 }
2462 
2463 static void __exit ioc_exit(void)
2464 {
2465 	return blkcg_policy_unregister(&blkcg_policy_iocost);
2466 }
2467 
2468 module_init(ioc_init);
2469 module_exit(ioc_exit);
2470