// SPDX-License-Identifier: GPL-2.0
/*
 * Buffered writeback throttling, loosely based on CoDel. We can't drop
 * packets for IO scheduling, so the logic is something like this:
 *
 * - Monitor latencies in a defined window of time.
 * - If the minimum latency in the above window exceeds some target, increment
 *   scaling step and scale down queue depth by a factor of 2x. The monitoring
 *   window is then shrunk to 100 / sqrt(scaling step + 1).
 * - For any window where we don't have solid data on what the latencies
 *   look like, retain status quo.
 * - If latencies look good, decrement scaling step.
 * - If we're only doing writes, allow the scaling step to go negative. This
 *   will temporarily boost write performance, snapping back to a stable
 *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
 *   positive scaling steps where we shrink the monitoring window, a negative
 *   scaling step retains the default step==0 window size.
 *
 * Copyright (C) 2016 Jens Axboe
 *
 */
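/*
 * Worked example of the window scaling above, assuming the default 100msec
 * window: one scaling step shrinks the monitoring window to roughly
 * 100 / sqrt(2) ~= 70msec, and three steps shrink it to 100 / sqrt(4) =
 * 50msec. See rwb_arm_timer() for the fixed-point version of this math.
 */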
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>

#include "blk-stat.h"
#include "blk-wbt.h"
#include "blk-rq-qos.h"
#include "elevator.h"

#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>

enum wbt_flags {
	WBT_TRACKED		= 1,	/* write, tracked for throttling */
	WBT_READ		= 2,	/* read */
	WBT_KSWAPD		= 4,	/* write, from kswapd */
	WBT_DISCARD		= 8,	/* discard */

	WBT_NR_BITS		= 4,	/* number of bits */
};

enum {
	WBT_RWQ_BG		= 0,
	WBT_RWQ_KSWAPD,
	WBT_RWQ_DISCARD,
	WBT_NUM_RWQ,
};

/*
 * If the current state is WBT_STATE_ON/OFF_DEFAULT, it can be changed to any
 * other state. If the current state is WBT_STATE_ON/OFF_MANUAL, it can only
 * be changed to WBT_STATE_OFF/ON_MANUAL.
 */
enum {
	WBT_STATE_ON_DEFAULT	= 1,	/* on by default */
	WBT_STATE_ON_MANUAL	= 2,	/* on manually by sysfs */
	WBT_STATE_OFF_DEFAULT	= 3,	/* off by default */
	WBT_STATE_OFF_MANUAL	= 4,	/* off manually by sysfs */
};

struct rq_wb {
	/*
	 * Settings that govern how we throttle
	 */
	unsigned int wb_background;		/* background writeback */
	unsigned int wb_normal;			/* normal writeback */

	short enable_state;			/* WBT_STATE_* */

	/*
	 * Number of consecutive periods where we don't have enough
	 * information to make a firm scale up/down decision.
	 */
	unsigned int unknown_cnt;

	u64 win_nsec;				/* default window size */
	u64 cur_win_nsec;			/* current window size */

	struct blk_stat_callback *cb;

	u64 sync_issue;
	void *sync_cookie;

	unsigned int wc;

	unsigned long last_issue;		/* last non-throttled issue */
	unsigned long last_comp;		/* last non-throttled comp */
	unsigned long min_lat_nsec;
	struct rq_qos rqos;
	struct rq_wait rq_wait[WBT_NUM_RWQ];
	struct rq_depth rq_depth;
};

static inline struct rq_wb *RQWB(struct rq_qos *rqos)
{
	return container_of(rqos, struct rq_wb, rqos);
}

static inline void wbt_clear_state(struct request *rq)
{
	rq->wbt_flags = 0;
}

static inline enum wbt_flags wbt_flags(struct request *rq)
{
	return rq->wbt_flags;
}

static inline bool wbt_is_tracked(struct request *rq)
{
	return rq->wbt_flags & WBT_TRACKED;
}

static inline bool wbt_is_read(struct request *rq)
{
	return rq->wbt_flags & WBT_READ;
}

enum {
	/*
	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
	 * from here depending on device stats
	 */
	RWB_DEF_DEPTH	= 16,

	/*
	 * 100msec window
	 */
	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,

	/*
	 * Disregard stats, if we don't meet this minimum
	 */
	RWB_MIN_WRITE_SAMPLES	= 3,

	/*
	 * If we have this number of consecutive windows with not enough
	 * information to scale up or down, scale up.
	 */
	RWB_UNKNOWN_BUMP	= 5,
};

static inline bool rwb_enabled(struct rq_wb *rwb)
{
	return rwb && rwb->enable_state != WBT_STATE_OFF_DEFAULT &&
		      rwb->enable_state != WBT_STATE_OFF_MANUAL;
}

static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
	if (rwb_enabled(rwb)) {
		const unsigned long cur = jiffies;

		if (cur != *var)
			*var = cur;
	}
}

/*
 * If a task was rate throttled in balance_dirty_pages() within the last
 * second or so, use that to indicate a higher cleaning rate.
 */
static bool wb_recent_wait(struct rq_wb *rwb)
{
	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;

	return time_before(jiffies, bdi->last_bdp_sleep + HZ);
}

static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
					  enum wbt_flags wb_acct)
{
	if (wb_acct & WBT_KSWAPD)
		return &rwb->rq_wait[WBT_RWQ_KSWAPD];
	else if (wb_acct & WBT_DISCARD)
		return &rwb->rq_wait[WBT_RWQ_DISCARD];

	return &rwb->rq_wait[WBT_RWQ_BG];
}

static void rwb_wake_all(struct rq_wb *rwb)
{
	int i;

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		struct rq_wait *rqw = &rwb->rq_wait[i];

		if (wq_has_sleeper(&rqw->wait))
			wake_up_all(&rqw->wait);
	}
}

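/*
 * A completion just dropped the inflight count on this wait queue. Waiters
 * are woken only once we are back under the applicable limit, and either
 * the queue has fully drained or inflight has fallen at least
 * wb_background / 2 below that limit.
 */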
static void wbt_rqw_done(struct rq_wb *rwb, struct rq_wait *rqw,
			 enum wbt_flags wb_acct)
{
	int inflight, limit;

	inflight = atomic_dec_return(&rqw->inflight);

	/*
	 * For discards, our limit is always the background. For writes, if
	 * the device does write back caching, drop further down before we
	 * wake people up.
	 */
	if (wb_acct & WBT_DISCARD)
		limit = rwb->wb_background;
	else if (rwb->wc && !wb_recent_wait(rwb))
		limit = 0;
	else
		limit = rwb->wb_normal;

	/*
	 * Don't wake anyone up if we are above the normal limit.
	 */
	if (inflight && inflight >= limit)
		return;

	if (wq_has_sleeper(&rqw->wait)) {
		int diff = limit - inflight;

		if (!inflight || diff >= rwb->wb_background / 2)
			wake_up_all(&rqw->wait);
	}
}

static void __wbt_done(struct rq_qos *rqos, enum wbt_flags wb_acct)
{
	struct rq_wb *rwb = RQWB(rqos);
	struct rq_wait *rqw;

	if (!(wb_acct & WBT_TRACKED))
		return;

	rqw = get_rq_wait(rwb, wb_acct);
	wbt_rqw_done(rwb, rqw, wb_acct);
}

/*
 * Called on completion of a request. Note that it's also called when
 * a request is merged, at which point the request gets freed.
 */
static void wbt_done(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);

	if (!wbt_is_tracked(rq)) {
		if (rwb->sync_cookie == rq) {
			rwb->sync_issue = 0;
			rwb->sync_cookie = NULL;
		}

		if (wbt_is_read(rq))
			wb_timestamp(rwb, &rwb->last_comp);
	} else {
		WARN_ON_ONCE(rq == rwb->sync_cookie);
		__wbt_done(rqos, wbt_flags(rq));
	}
	wbt_clear_state(rq);
}

static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
	/*
	 * We need at least one read sample, and a minimum of
	 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
	 * that it's writes impacting us, and not just some sole read on
	 * a device that is in a lower power state.
	 */
	return (stat[READ].nr_samples >= 1 &&
		stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}

static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
	u64 now, issue = READ_ONCE(rwb->sync_issue);

	if (!issue || !rwb->sync_cookie)
		return 0;

	now = ktime_to_ns(ktime_get());
	return now - issue;
}

static inline unsigned int wbt_inflight(struct rq_wb *rwb)
{
	unsigned int i, ret = 0;

	for (i = 0; i < WBT_NUM_RWQ; i++)
		ret += atomic_read(&rwb->rq_wait[i].inflight);

	return ret;
}

enum {
	LAT_OK = 1,
	LAT_UNKNOWN,
	LAT_UNKNOWN_WRITES,
	LAT_EXCEEDED,
};

static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
	struct rq_depth *rqd = &rwb->rq_depth;
	u64 thislat;

	/*
	 * If our stored sync issue exceeds the window size, or it
	 * exceeds our min target AND we haven't logged any entries,
	 * flag the latency as exceeded. wbt works off completion latencies,
	 * but for a flooded device, a single sync IO can take a long time
	 * to complete after being issued. If this time exceeds our
	 * monitoring window AND we didn't see any other completions in that
	 * window, then count that sync IO as a violation of the latency.
	 */
	thislat = rwb_sync_issue_lat(rwb);
	if (thislat > rwb->cur_win_nsec ||
	    (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
		trace_wbt_lat(bdi, thislat);
		return LAT_EXCEEDED;
	}

	/*
	 * No read/write mix, if stat isn't valid
	 */
	if (!stat_sample_valid(stat)) {
		/*
		 * If we had writes in this stat window and the window is
		 * current, we're only doing writes. If a task recently
		 * waited or still has writes in flight, consider us doing
		 * just writes as well.
		 */
		if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
		    wbt_inflight(rwb))
			return LAT_UNKNOWN_WRITES;
		return LAT_UNKNOWN;
	}

	/*
	 * If the 'min' latency exceeds our target, step down.
	 */
	if (stat[READ].min > rwb->min_lat_nsec) {
		trace_wbt_lat(bdi, stat[READ].min);
		trace_wbt_stat(bdi, stat);
		return LAT_EXCEEDED;
	}

	if (rqd->scale_step)
		trace_wbt_stat(bdi, stat);

	return LAT_OK;
}

static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
	struct backing_dev_info *bdi = rwb->rqos.disk->bdi;
	struct rq_depth *rqd = &rwb->rq_depth;

	trace_wbt_step(bdi, msg, rqd->scale_step, rwb->cur_win_nsec,
			rwb->wb_background, rwb->wb_normal, rqd->max_depth);
}

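/*
 * Derive the write limits from the current max depth: wb_normal is roughly
 * half of it and wb_background roughly a quarter. For example, a max depth
 * of 16 yields wb_normal == 8 and wb_background == 4. A latency target of 0
 * clears both limits.
 */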
static void calc_wb_limits(struct rq_wb *rwb)
{
	if (rwb->min_lat_nsec == 0) {
		rwb->wb_normal = rwb->wb_background = 0;
	} else if (rwb->rq_depth.max_depth <= 2) {
		rwb->wb_normal = rwb->rq_depth.max_depth;
		rwb->wb_background = 1;
	} else {
		rwb->wb_normal = (rwb->rq_depth.max_depth + 1) / 2;
		rwb->wb_background = (rwb->rq_depth.max_depth + 3) / 4;
	}
}

static void scale_up(struct rq_wb *rwb)
{
	if (!rq_depth_scale_up(&rwb->rq_depth))
		return;
	calc_wb_limits(rwb);
	rwb->unknown_cnt = 0;
	rwb_wake_all(rwb);
	rwb_trace_step(rwb, tracepoint_string("scale up"));
}

static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
	if (!rq_depth_scale_down(&rwb->rq_depth, hard_throttle))
		return;
	calc_wb_limits(rwb);
	rwb->unknown_cnt = 0;
	rwb_trace_step(rwb, tracepoint_string("scale down"));
}

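/*
 * The fixed-point math below computes win_nsec / sqrt(scale_step + 1):
 * int_sqrt((scale_step + 1) << 8) equals 16 * sqrt(scale_step + 1), and the
 * << 4 in the numerator supplies the matching factor of 16.
 */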
static void rwb_arm_timer(struct rq_wb *rwb)
{
	struct rq_depth *rqd = &rwb->rq_depth;

	if (rqd->scale_step > 0) {
		/*
		 * We should speed this up, using some variant of a fast
		 * integer inverse square root calculation. Since we only do
		 * this for every window expiration, it's not a huge deal,
		 * though.
		 */
		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
					int_sqrt((rqd->scale_step + 1) << 8));
	} else {
		/*
		 * For step < 0, we don't want to increase/decrease the
		 * window size.
		 */
		rwb->cur_win_nsec = rwb->win_nsec;
	}

	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}

static void wb_timer_fn(struct blk_stat_callback *cb)
{
	struct rq_wb *rwb = cb->data;
	struct rq_depth *rqd = &rwb->rq_depth;
	unsigned int inflight = wbt_inflight(rwb);
	int status;

	if (!rwb->rqos.disk)
		return;

	status = latency_exceeded(rwb, cb->stat);

	trace_wbt_timer(rwb->rqos.disk->bdi, status, rqd->scale_step, inflight);

	/*
	 * If we exceeded the latency target, step down. If we did not,
	 * step one level up. If we don't know enough to say either exceeded
	 * or ok, then don't do anything.
	 */
	switch (status) {
	case LAT_EXCEEDED:
		scale_down(rwb, true);
		break;
	case LAT_OK:
		scale_up(rwb);
		break;
	case LAT_UNKNOWN_WRITES:
		/*
		 * We started at the center step, but don't have a valid
		 * read/write sample, but we do have writes going on.
		 * Allow step to go negative, to increase write perf.
		 */
		scale_up(rwb);
		break;
	case LAT_UNKNOWN:
		if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
			break;
		/*
		 * We get here when we previously scaled down the depth, and
		 * we currently don't have a valid read/write sample. For that
		 * case, slowly return to center state (step == 0).
		 */
		if (rqd->scale_step > 0)
			scale_up(rwb);
		else if (rqd->scale_step < 0)
			scale_down(rwb, false);
		break;
	default:
		break;
	}

	/*
	 * Re-arm timer, if we have IO in flight
	 */
	if (rqd->scale_step || inflight)
		rwb_arm_timer(rwb);
}

static void wbt_update_limits(struct rq_wb *rwb)
{
	struct rq_depth *rqd = &rwb->rq_depth;

	rqd->scale_step = 0;
	rqd->scaled_max = false;

	rq_depth_calc_max_depth(rqd);
	calc_wb_limits(rwb);

	rwb_wake_all(rwb);
}

bool wbt_disabled(struct request_queue *q)
{
	struct rq_qos *rqos = wbt_rq_qos(q);

	return !rqos || !rwb_enabled(RQWB(rqos));
}

u64 wbt_get_min_lat(struct request_queue *q)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	if (!rqos)
		return 0;
	return RQWB(rqos)->min_lat_nsec;
}

void wbt_set_min_lat(struct request_queue *q, u64 val)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	if (!rqos)
		return;

	RQWB(rqos)->min_lat_nsec = val;
	if (val)
		RQWB(rqos)->enable_state = WBT_STATE_ON_MANUAL;
	else
		RQWB(rqos)->enable_state = WBT_STATE_OFF_MANUAL;

	wbt_update_limits(RQWB(rqos));
}


static bool close_io(struct rq_wb *rwb)
{
	const unsigned long now = jiffies;

	return time_before(now, rwb->last_issue + HZ / 10) ||
		time_before(now, rwb->last_comp + HZ / 10);
}

#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)

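/*
 * Pick the inflight limit that applies to this IO: discards always get the
 * background limit, high priority or kswapd writes may use the full max
 * depth, background writeback gets the background limit, and everything
 * else gets the normal limit.
 */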
static inline unsigned int get_limit(struct rq_wb *rwb, blk_opf_t opf)
{
	unsigned int limit;

	if ((opf & REQ_OP_MASK) == REQ_OP_DISCARD)
		return rwb->wb_background;

	/*
	 * At this point we know it's a buffered write. If this is
	 * kswapd trying to free memory, or REQ_SYNC is set, then
	 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
	 * that. If the write is marked as a background write, then use
	 * the idle limit, or go to normal if we haven't had competing
	 * IO for a bit.
	 */
	if ((opf & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
		limit = rwb->rq_depth.max_depth;
	else if ((opf & REQ_BACKGROUND) || close_io(rwb)) {
		/*
		 * If less than 100ms since we completed unrelated IO,
		 * limit us to half the depth for background writeback.
		 */
		limit = rwb->wb_background;
	} else
		limit = rwb->wb_normal;

	return limit;
}

struct wbt_wait_data {
	struct rq_wb *rwb;
	enum wbt_flags wb_acct;
	blk_opf_t opf;
};

static bool wbt_inflight_cb(struct rq_wait *rqw, void *private_data)
{
	struct wbt_wait_data *data = private_data;
	return rq_wait_inc_below(rqw, get_limit(data->rwb, data->opf));
}

static void wbt_cleanup_cb(struct rq_wait *rqw, void *private_data)
{
	struct wbt_wait_data *data = private_data;
	wbt_rqw_done(data->rwb, rqw, data->wb_acct);
}

/*
 * Block if we will exceed our limit, or if we are currently waiting for
 * the timer to kick off queuing again.
 */
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
		       blk_opf_t opf)
{
	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
	struct wbt_wait_data data = {
		.rwb = rwb,
		.wb_acct = wb_acct,
		.opf = opf,
	};

	rq_qos_wait(rqw, &data, wbt_inflight_cb, wbt_cleanup_cb);
}

static inline bool wbt_should_throttle(struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		/*
		 * Don't throttle WRITE_ODIRECT
		 */
		if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
		    (REQ_SYNC | REQ_IDLE))
			return false;
		fallthrough;
	case REQ_OP_DISCARD:
		return true;
	default:
		return false;
	}
}

static enum wbt_flags bio_to_wbt_flags(struct rq_wb *rwb, struct bio *bio)
{
	enum wbt_flags flags = 0;

	if (!rwb_enabled(rwb))
		return 0;

	if (bio_op(bio) == REQ_OP_READ) {
		flags = WBT_READ;
	} else if (wbt_should_throttle(bio)) {
		if (current_is_kswapd())
			flags |= WBT_KSWAPD;
		if (bio_op(bio) == REQ_OP_DISCARD)
			flags |= WBT_DISCARD;
		flags |= WBT_TRACKED;
	}
	return flags;
}

static void wbt_cleanup(struct rq_qos *rqos, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);
	enum wbt_flags flags = bio_to_wbt_flags(rwb, bio);
	__wbt_done(rqos, flags);
}

/*
 * May sleep, if we have exceeded the writeback limits. Caller can pass
 * in an irq held spinlock, if it holds one when calling this function.
 * If we do sleep, we'll release and re-grab it.
 */
static void wbt_wait(struct rq_qos *rqos, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);
	enum wbt_flags flags;

	flags = bio_to_wbt_flags(rwb, bio);
	if (!(flags & WBT_TRACKED)) {
		if (flags & WBT_READ)
			wb_timestamp(rwb, &rwb->last_issue);
		return;
	}

	__wbt_wait(rwb, flags, bio->bi_opf);

	if (!blk_stat_is_active(rwb->cb))
		rwb_arm_timer(rwb);
}

static void wbt_track(struct rq_qos *rqos, struct request *rq, struct bio *bio)
{
	struct rq_wb *rwb = RQWB(rqos);
	rq->wbt_flags |= bio_to_wbt_flags(rwb, bio);
}

static void wbt_issue(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);

	if (!rwb_enabled(rwb))
		return;

	/*
	 * Track the issue time of a sync request, so we can react more quickly
	 * if it takes a long time to complete. Note that this is just a hint.
	 * The request can go away when it completes, so it's important we
	 * never dereference it. We only use the address to compare with,
	 * which is why we store the sync_issue time locally.
	 */
	if (wbt_is_read(rq) && !rwb->sync_issue) {
		rwb->sync_cookie = rq;
		rwb->sync_issue = rq->io_start_time_ns;
	}
}

static void wbt_requeue(struct rq_qos *rqos, struct request *rq)
{
	struct rq_wb *rwb = RQWB(rqos);
	if (!rwb_enabled(rwb))
		return;
	if (rq == rwb->sync_cookie) {
		rwb->sync_issue = 0;
		rwb->sync_cookie = NULL;
	}
}

void wbt_set_write_cache(struct request_queue *q, bool write_cache_on)
{
	struct rq_qos *rqos = wbt_rq_qos(q);
	if (rqos)
		RQWB(rqos)->wc = write_cache_on;
}

/*
 * Enable wbt if defaults are configured that way
 */
void wbt_enable_default(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct rq_qos *rqos;
	bool enable = IS_ENABLED(CONFIG_BLK_WBT_MQ);

	if (q->elevator &&
	    test_bit(ELEVATOR_FLAG_DISABLE_WBT, &q->elevator->flags))
		enable = false;

	/* Throttling already enabled? */
	rqos = wbt_rq_qos(q);
	if (rqos) {
		if (enable && RQWB(rqos)->enable_state == WBT_STATE_OFF_DEFAULT)
			RQWB(rqos)->enable_state = WBT_STATE_ON_DEFAULT;
		return;
	}

	/* Queue not registered? Maybe shutting down... */
	if (!blk_queue_registered(q))
		return;

	if (queue_is_mq(q) && enable)
		wbt_init(disk);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);

u64 wbt_default_latency_nsec(struct request_queue *q)
{
	/*
	 * We default to 2msec for non-rotational storage, and 75msec
	 * for rotational storage.
	 */
	if (blk_queue_nonrot(q))
		return 2000000ULL;
	else
		return 75000000ULL;
}

static int wbt_data_dir(const struct request *rq)
{
	const enum req_op op = req_op(rq);

	if (op == REQ_OP_READ)
		return READ;
	else if (op_is_write(op))
		return WRITE;

	/* don't account */
	return -1;
}

static void wbt_queue_depth_changed(struct rq_qos *rqos)
{
	RQWB(rqos)->rq_depth.queue_depth = blk_queue_depth(rqos->disk->queue);
	wbt_update_limits(RQWB(rqos));
}

static void wbt_exit(struct rq_qos *rqos)
{
	struct rq_wb *rwb = RQWB(rqos);

	blk_stat_remove_callback(rqos->disk->queue, rwb->cb);
	blk_stat_free_callback(rwb->cb);
	kfree(rwb);
}

/*
 * Disable wbt, if enabled by default.
 */
void wbt_disable_default(struct gendisk *disk)
{
	struct rq_qos *rqos = wbt_rq_qos(disk->queue);
	struct rq_wb *rwb;
	if (!rqos)
		return;
	rwb = RQWB(rqos);
	if (rwb->enable_state == WBT_STATE_ON_DEFAULT) {
		blk_stat_deactivate(rwb->cb);
		rwb->enable_state = WBT_STATE_OFF_DEFAULT;
	}
}
EXPORT_SYMBOL_GPL(wbt_disable_default);

#ifdef CONFIG_BLK_DEBUG_FS
static int wbt_curr_win_nsec_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%llu\n", rwb->cur_win_nsec);
	return 0;
}

static int wbt_enabled_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%d\n", rwb->enable_state);
	return 0;
}

static int wbt_id_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;

	seq_printf(m, "%u\n", rqos->id);
	return 0;
}

static int wbt_inflight_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);
	int i;

	for (i = 0; i < WBT_NUM_RWQ; i++)
		seq_printf(m, "%d: inflight %d\n", i,
			   atomic_read(&rwb->rq_wait[i].inflight));
	return 0;
}

static int wbt_min_lat_nsec_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%lu\n", rwb->min_lat_nsec);
	return 0;
}

static int wbt_unknown_cnt_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%u\n", rwb->unknown_cnt);
	return 0;
}

static int wbt_normal_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%u\n", rwb->wb_normal);
	return 0;
}

static int wbt_background_show(void *data, struct seq_file *m)
{
	struct rq_qos *rqos = data;
	struct rq_wb *rwb = RQWB(rqos);

	seq_printf(m, "%u\n", rwb->wb_background);
	return 0;
}

static const struct blk_mq_debugfs_attr wbt_debugfs_attrs[] = {
	{"curr_win_nsec", 0400, wbt_curr_win_nsec_show},
	{"enabled", 0400, wbt_enabled_show},
	{"id", 0400, wbt_id_show},
	{"inflight", 0400, wbt_inflight_show},
	{"min_lat_nsec", 0400, wbt_min_lat_nsec_show},
	{"unknown_cnt", 0400, wbt_unknown_cnt_show},
	{"wb_normal", 0400, wbt_normal_show},
	{"wb_background", 0400, wbt_background_show},
	{},
};
#endif

static const struct rq_qos_ops wbt_rqos_ops = {
	.throttle = wbt_wait,
	.issue = wbt_issue,
	.track = wbt_track,
	.requeue = wbt_requeue,
	.done = wbt_done,
	.cleanup = wbt_cleanup,
	.queue_depth_changed = wbt_queue_depth_changed,
	.exit = wbt_exit,
#ifdef CONFIG_BLK_DEBUG_FS
	.debugfs_attrs = wbt_debugfs_attrs,
#endif
};

int wbt_init(struct gendisk *disk)
{
	struct request_queue *q = disk->queue;
	struct rq_wb *rwb;
	int i;
	int ret;

	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
	if (!rwb)
		return -ENOMEM;

	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
	if (!rwb->cb) {
		kfree(rwb);
		return -ENOMEM;
	}

	for (i = 0; i < WBT_NUM_RWQ; i++)
		rq_wait_init(&rwb->rq_wait[i]);

	rwb->last_comp = rwb->last_issue = jiffies;
	rwb->win_nsec = RWB_WINDOW_NSEC;
	rwb->enable_state = WBT_STATE_ON_DEFAULT;
	rwb->wc = test_bit(QUEUE_FLAG_WC, &q->queue_flags);
	rwb->rq_depth.default_depth = RWB_DEF_DEPTH;
	rwb->min_lat_nsec = wbt_default_latency_nsec(q);
	rwb->rq_depth.queue_depth = blk_queue_depth(q);
	wbt_update_limits(rwb);

	/*
	 * Assign rwb and add the stats callback.
	 */
	mutex_lock(&q->rq_qos_mutex);
	ret = rq_qos_add(&rwb->rqos, disk, RQ_QOS_WBT, &wbt_rqos_ops);
	mutex_unlock(&q->rq_qos_mutex);
	if (ret)
		goto err_free;

	blk_stat_add_callback(q, rwb->cb);

	return 0;

err_free:
	blk_stat_free_callback(rwb->cb);
	kfree(rwb);
	return ret;
}