/*
 * buffered writeback throttling. loosely based on CoDel. We can't drop
 * packets for IO scheduling, so the logic is something like this:
 *
 * - Monitor latencies in a defined window of time.
 * - If the minimum latency in the above window exceeds some target, increment
 *   scaling step and scale down queue depth by a factor of 2x. The monitoring
 *   window is then shrunk to 100 / sqrt(scaling step + 1).
 * - For any window where we don't have solid data on what the latencies
 *   look like, retain status quo.
 * - If latencies look good, decrement scaling step.
 * - If we're only doing writes, allow the scaling step to go negative. This
 *   will temporarily boost write performance, snapping back to a stable
 *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
 *   positive scaling steps where we shrink the monitoring window, a negative
 *   scaling step retains the default step==0 window size.
 *
 * Copyright (C) 2016 Jens Axboe
 *
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>

#include "blk-wbt.h"

#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>

enum {
	/*
	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
	 * from here depending on device stats
	 */
	RWB_DEF_DEPTH	= 16,

	/*
	 * 100msec window
	 */
	RWB_WINDOW_NSEC		= 100 * 1000 * 1000ULL,

	/*
	 * Disregard stats, if we don't meet this minimum
	 */
	RWB_MIN_WRITE_SAMPLES	= 3,

	/*
	 * If we have this number of consecutive windows with not enough
	 * information to scale up or down, scale up.
	 */
	RWB_UNKNOWN_BUMP	= 5,
};
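
/*
 * Worked example of the scaling rules above (illustrative numbers, assuming
 * a device queue depth of at least 16): at scale_step == 0 the allowed write
 * depth is RWB_DEF_DEPTH (16) and the monitoring window is RWB_WINDOW_NSEC
 * (100ms). At scale_step == 2, calc_wb_limits() allows 1 + ((16 - 1) >> 2)
 * == 4 requests and rwb_arm_timer() shrinks the window to roughly
 * 100 / sqrt(3) ~= 58ms. A negative step widens the allowed depth instead
 * (capped at 3/4 of the device queue depth) while keeping the 100ms window.
 */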

static inline bool rwb_enabled(struct rq_wb *rwb)
{
	return rwb && rwb->wb_normal != 0;
}

/*
 * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
 * false if 'v' + 1 would be bigger than 'below'.
 */
static bool atomic_inc_below(atomic_t *v, int below)
{
	int cur = atomic_read(v);

	for (;;) {
		int old;

		if (cur >= below)
			return false;
		old = atomic_cmpxchg(v, cur, cur + 1);
		if (old == cur)
			break;
		cur = old;
	}

	return true;
}
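
/*
 * atomic_inc_below() is the slot-taking primitive for the per-queue inflight
 * counters: may_queue() below uses it so that a throttled bio only grabs an
 * inflight slot while the count is still under the limit computed by
 * get_limit().
 */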

static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
	if (rwb_enabled(rwb)) {
		const unsigned long cur = jiffies;

		if (cur != *var)
			*var = cur;
	}
}

/*
 * If a task was rate throttled in balance_dirty_pages() within the last
 * second or so, use that to indicate a higher cleaning rate.
 */
static bool wb_recent_wait(struct rq_wb *rwb)
{
	struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;

	return time_before(jiffies, wb->dirty_sleep + HZ);
}

static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb,
					  enum wbt_flags wb_acct)
{
	if (wb_acct & WBT_KSWAPD)
		return &rwb->rq_wait[WBT_RWQ_KSWAPD];
	else if (wb_acct & WBT_DISCARD)
		return &rwb->rq_wait[WBT_RWQ_DISCARD];

	return &rwb->rq_wait[WBT_RWQ_BG];
}

static void rwb_wake_all(struct rq_wb *rwb)
{
	int i;

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		struct rq_wait *rqw = &rwb->rq_wait[i];

		if (waitqueue_active(&rqw->wait))
			wake_up_all(&rqw->wait);
	}
}

void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
{
	struct rq_wait *rqw;
	int inflight, limit;

	if (!(wb_acct & WBT_TRACKED))
		return;

	rqw = get_rq_wait(rwb, wb_acct);
	inflight = atomic_dec_return(&rqw->inflight);

	/*
	 * wbt got disabled with IO in flight. Wake up any potential
	 * waiters, we don't have to do more than that.
	 */
	if (unlikely(!rwb_enabled(rwb))) {
		rwb_wake_all(rwb);
		return;
	}

	/*
	 * For discards, our limit is always the background. For writes, if
	 * the device does write back caching, drop further down before we
	 * wake people up.
	 */
	if (wb_acct & WBT_DISCARD)
		limit = rwb->wb_background;
	else if (rwb->wc && !wb_recent_wait(rwb))
		limit = 0;
	else
		limit = rwb->wb_normal;

	/*
	 * Don't wake anyone up if we are above the normal limit.
	 */
	if (inflight && inflight >= limit)
		return;

	if (waitqueue_active(&rqw->wait)) {
		int diff = limit - inflight;

		if (!inflight || diff >= rwb->wb_background / 2)
			wake_up_all(&rqw->wait);
	}
}
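
/*
 * Illustrative numbers for the wake-up check above: with a limit of
 * wb_normal == 8 and wb_background == 4, a completing tracked write wakes
 * waiters once the inflight count drops to 6 or below (at least
 * wb_background / 2 == 2 slots free), or when the queue drains completely.
 */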

/*
 * Called on completion of a request. Note that it's also called when
 * a request is merged, at the point where the request gets freed.
 */
void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb)
		return;

	if (!wbt_is_tracked(stat)) {
		if (rwb->sync_cookie == stat) {
			rwb->sync_issue = 0;
			rwb->sync_cookie = NULL;
		}

		if (wbt_is_read(stat))
			wb_timestamp(rwb, &rwb->last_comp);
	} else {
		WARN_ON_ONCE(stat == rwb->sync_cookie);
		__wbt_done(rwb, wbt_stat_to_mask(stat));
	}
	wbt_clear_state(stat);
}

/*
 * Return true, if we can't increase the depth further by scaling
 */
static bool calc_wb_limits(struct rq_wb *rwb)
{
	unsigned int depth;
	bool ret = false;

	if (!rwb->min_lat_nsec) {
		rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
		return false;
	}

	/*
	 * For QD=1 devices, this is a special case. It's important for those
	 * to have one request ready when one completes, so force a depth of
	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
	 * since the device can't have more than that in flight. If we're
	 * scaling down, then keep a setting of 1/1/1.
	 */
	if (rwb->queue_depth == 1) {
		if (rwb->scale_step > 0)
			rwb->wb_max = rwb->wb_normal = 1;
		else {
			rwb->wb_max = rwb->wb_normal = 2;
			ret = true;
		}
		rwb->wb_background = 1;
	} else {
		/*
		 * scale_step == 0 is our default state. If we have suffered
		 * latency spikes, step will be > 0, and we shrink the
		 * allowed write depths. If step is < 0, we're only doing
		 * writes, and we allow a temporarily higher depth to
		 * increase performance.
		 */
		depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
		if (rwb->scale_step > 0)
			depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
		else if (rwb->scale_step < 0) {
			unsigned int maxd = 3 * rwb->queue_depth / 4;

			depth = 1 + ((depth - 1) << -rwb->scale_step);
			if (depth > maxd) {
				depth = maxd;
				ret = true;
			}
		}

		/*
		 * Set our max/normal/bg queue depths based on how far
		 * we have scaled down (->scale_step).
		 */
		rwb->wb_max = depth;
		rwb->wb_normal = (rwb->wb_max + 1) / 2;
		rwb->wb_background = (rwb->wb_max + 3) / 4;
	}

	return ret;
}
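
/*
 * Example of the resulting limits (illustrative, queue_depth >= 16): at
 * scale_step == 0 the max/normal/background depths work out to 16/8/4, at
 * scale_step == 1 to 8/4/2, and so on down to 1/1/1.
 */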

static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
	/*
	 * We need at least one read sample, and a minimum of
	 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
	 * that it's writes impacting us, and not just some sole read on
	 * a device that is in a lower power state.
	 */
	return (stat[READ].nr_samples >= 1 &&
		stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}

static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
	u64 now, issue = READ_ONCE(rwb->sync_issue);

	if (!issue || !rwb->sync_cookie)
		return 0;

	now = ktime_to_ns(ktime_get());
	return now - issue;
}

enum {
	LAT_OK = 1,
	LAT_UNKNOWN,
	LAT_UNKNOWN_WRITES,
	LAT_EXCEEDED,
};

static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
	u64 thislat;

	/*
	 * If our stored sync issue exceeds the window size, or it
	 * exceeds our min target AND we haven't logged any entries,
	 * flag the latency as exceeded. wbt works off completion latencies,
	 * but for a flooded device, a single sync IO can take a long time
	 * to complete after being issued. If this time exceeds our
	 * monitoring window AND we didn't see any other completions in that
	 * window, then count that sync IO as a violation of the latency.
	 */
	thislat = rwb_sync_issue_lat(rwb);
	if (thislat > rwb->cur_win_nsec ||
	    (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
		trace_wbt_lat(bdi, thislat);
		return LAT_EXCEEDED;
	}

	/*
	 * No read/write mix, if stat isn't valid
	 */
	if (!stat_sample_valid(stat)) {
		/*
		 * If we had writes in this stat window and the window is
		 * current, we're only doing writes. If a task recently
		 * waited or still has writes in flight, consider us doing
		 * just writes as well.
		 */
		if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
		    wbt_inflight(rwb))
			return LAT_UNKNOWN_WRITES;
		return LAT_UNKNOWN;
	}

	/*
	 * If the 'min' latency exceeds our target, step down.
	 */
	if (stat[READ].min > rwb->min_lat_nsec) {
		trace_wbt_lat(bdi, stat[READ].min);
		trace_wbt_stat(bdi, stat);
		return LAT_EXCEEDED;
	}

	if (rwb->scale_step)
		trace_wbt_stat(bdi, stat);

	return LAT_OK;
}

static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;

	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
			rwb->wb_background, rwb->wb_normal, rwb->wb_max);
}

static void scale_up(struct rq_wb *rwb)
{
	/*
	 * Hit max in previous round, stop here
	 */
	if (rwb->scaled_max)
		return;

	rwb->scale_step--;
	rwb->unknown_cnt = 0;

	rwb->scaled_max = calc_wb_limits(rwb);

	rwb_wake_all(rwb);

	rwb_trace_step(rwb, "step up");
}

/*
 * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
 * had a latency violation.
 */
static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
	/*
	 * Stop scaling down when we've hit the limit. This also prevents
	 * ->scale_step from going to crazy values, if the device can't
	 * keep up.
	 */
	if (rwb->wb_max == 1)
		return;

	if (rwb->scale_step < 0 && hard_throttle)
		rwb->scale_step = 0;
	else
		rwb->scale_step++;

	rwb->scaled_max = false;
	rwb->unknown_cnt = 0;
	calc_wb_limits(rwb);
	rwb_trace_step(rwb, "step down");
}

static void rwb_arm_timer(struct rq_wb *rwb)
{
	if (rwb->scale_step > 0) {
		/*
		 * We should speed this up, using some variant of a fast
		 * integer inverse square root calculation. Since we only do
		 * this for every window expiration, it's not a huge deal,
		 * though.
		 */
		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
					int_sqrt((rwb->scale_step + 1) << 8));
	} else {
		/*
		 * For step < 0, we don't want to increase/decrease the
		 * window size.
		 */
		rwb->cur_win_nsec = rwb->win_nsec;
	}

	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}
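
/*
 * The window calculation above implements win_nsec / sqrt(scale_step + 1)
 * with int_sqrt() in fixed point: the numerator is scaled by 16 (<< 4) and
 * the square root argument by 256 (<< 8), so the scale factors cancel. For
 * example, at scale_step == 3 a 100ms window becomes 100 / sqrt(4) == 50ms.
 */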

static void wb_timer_fn(struct blk_stat_callback *cb)
{
	struct rq_wb *rwb = cb->data;
	unsigned int inflight = wbt_inflight(rwb);
	int status;

	status = latency_exceeded(rwb, cb->stat);

	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
			inflight);

	/*
	 * If we exceeded the latency target, step down. If we did not,
	 * step one level up. If we don't know enough to say either exceeded
	 * or ok, then don't do anything.
	 */
	switch (status) {
	case LAT_EXCEEDED:
		scale_down(rwb, true);
		break;
	case LAT_OK:
		scale_up(rwb);
		break;
	case LAT_UNKNOWN_WRITES:
		/*
		 * We started at the center step, but don't have a valid
		 * read/write sample, but we do have writes going on.
		 * Allow step to go negative, to increase write perf.
		 */
		scale_up(rwb);
		break;
	case LAT_UNKNOWN:
		if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
			break;
		/*
		 * We get here when we previously scaled away from the
		 * default step, and we currently don't have a valid
		 * read/write sample. For that case, slowly return to the
		 * center state (step == 0).
		 */
		if (rwb->scale_step > 0)
			scale_up(rwb);
		else if (rwb->scale_step < 0)
			scale_down(rwb, false);
		break;
	default:
		break;
	}

	/*
	 * Re-arm timer, if we have IO in flight
	 */
	if (rwb->scale_step || inflight)
		rwb_arm_timer(rwb);
}

void wbt_update_limits(struct rq_wb *rwb)
{
	rwb->scale_step = 0;
	rwb->scaled_max = false;
	calc_wb_limits(rwb);

	rwb_wake_all(rwb);
}

static bool close_io(struct rq_wb *rwb)
{
	const unsigned long now = jiffies;

	return time_before(now, rwb->last_issue + HZ / 10) ||
		time_before(now, rwb->last_comp + HZ / 10);
}

#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)

static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
{
	unsigned int limit;

	if ((rw & REQ_OP_MASK) == REQ_OP_DISCARD)
		return rwb->wb_background;

	/*
	 * At this point we know it's a buffered write. If this is
	 * kswapd trying to free memory, or REQ_SYNC is set, then
	 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
	 * that. If the write is marked as a background write, then use
	 * the idle limit, or go to normal if we haven't had competing
	 * IO for a bit.
	 */
	if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
		limit = rwb->wb_max;
	else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
		/*
		 * If less than 100ms since we completed unrelated IO,
		 * limit us to half the depth for background writeback.
		 */
		limit = rwb->wb_background;
	} else
		limit = rwb->wb_normal;

	return limit;
}
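
/*
 * Illustrative mapping of the rules above, with wb_max/wb_normal/
 * wb_background at 16/8/4: discards always get the background limit (4);
 * kswapd, REQ_SYNC/REQ_META/REQ_PRIO writes, and writes following a recent
 * dirty-page wait may use up to wb_max (16); background writeback, or any
 * write issued within ~100ms of unrelated IO, is held to 4; everything
 * else gets the normal limit of 8.
 */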

static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
			     wait_queue_entry_t *wait, unsigned long rw)
{
	/*
	 * inc it here even if disabled, since we'll dec it at completion.
	 * this only happens if the task was sleeping in __wbt_wait(),
	 * and someone turned it off at the same time.
	 */
	if (!rwb_enabled(rwb)) {
		atomic_inc(&rqw->inflight);
		return true;
	}

	/*
	 * If the waitqueue is already active and we are not the next
	 * in line to be woken up, wait for our turn.
	 */
	if (waitqueue_active(&rqw->wait) &&
	    rqw->wait.head.next != &wait->entry)
		return false;

	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
}

/*
 * Block if we will exceed our limit, or if we are currently waiting for
 * the timer to kick off queuing again.
 */
static void __wbt_wait(struct rq_wb *rwb, enum wbt_flags wb_acct,
		       unsigned long rw, spinlock_t *lock)
	__releases(lock)
	__acquires(lock)
{
	struct rq_wait *rqw = get_rq_wait(rwb, wb_acct);
	DEFINE_WAIT(wait);

	if (may_queue(rwb, rqw, &wait, rw))
		return;

	do {
		prepare_to_wait_exclusive(&rqw->wait, &wait,
						TASK_UNINTERRUPTIBLE);

		if (may_queue(rwb, rqw, &wait, rw))
			break;

		if (lock) {
			spin_unlock_irq(lock);
			io_schedule();
			spin_lock_irq(lock);
		} else
			io_schedule();
	} while (1);

	finish_wait(&rqw->wait, &wait);
}

static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
{
	switch (bio_op(bio)) {
	case REQ_OP_WRITE:
		/*
		 * Don't throttle WRITE_ODIRECT
		 */
		if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) ==
		    (REQ_SYNC | REQ_IDLE))
			return false;
		/* fallthrough */
	case REQ_OP_DISCARD:
		return true;
	default:
		return false;
	}
}
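
/*
 * In short: buffered and background writes and discards are throttled,
 * while reads and O_DIRECT style writes (REQ_SYNC | REQ_IDLE both set) are
 * not. Reads are still timestamped in wbt_wait() below, so close_io() can
 * detect recently issued IO.
 */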

/*
 * Returns true if the IO request should be accounted, false if not.
 * May sleep, if we have exceeded the writeback limits. Caller can pass
 * in an irq-held spinlock, if it holds one when calling this function.
 * If we do sleep, we'll release and re-grab it.
 */
enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
{
	enum wbt_flags ret = 0;

	if (!rwb_enabled(rwb))
		return 0;

	if (bio_op(bio) == REQ_OP_READ)
		ret = WBT_READ;

	if (!wbt_should_throttle(rwb, bio)) {
		if (ret & WBT_READ)
			wb_timestamp(rwb, &rwb->last_issue);
		return ret;
	}

	if (current_is_kswapd())
		ret |= WBT_KSWAPD;
	if (bio_op(bio) == REQ_OP_DISCARD)
		ret |= WBT_DISCARD;

	__wbt_wait(rwb, ret, bio->bi_opf, lock);

	if (!blk_stat_is_active(rwb->cb))
		rwb_arm_timer(rwb);

	return ret | WBT_TRACKED;
}

void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb_enabled(rwb))
		return;

	/*
	 * Track the sync issue time, so we can react more quickly if a sync
	 * IO takes a long time to complete. Note that this is just a hint.
	 * 'stat' can go away when the request completes, so it's important
	 * we never dereference it. We only use the address to compare with,
	 * which is why we store the sync_issue time locally.
	 */
	if (wbt_is_read(stat) && !rwb->sync_issue) {
		rwb->sync_cookie = stat;
		rwb->sync_issue = blk_stat_time(stat);
	}
}

void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb_enabled(rwb))
		return;
	if (stat == rwb->sync_cookie) {
		rwb->sync_issue = 0;
		rwb->sync_cookie = NULL;
	}
}

void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
{
	if (rwb) {
		rwb->queue_depth = depth;
		wbt_update_limits(rwb);
	}
}

void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
{
	if (rwb)
		rwb->wc = write_cache_on;
}
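
/*
 * Rough lifecycle as driven by the block layer (a sketch, not a contract):
 * wbt_wait() runs at submission and may sleep, wbt_issue() when the request
 * is started on the device, wbt_requeue() if it bounces back, and wbt_done()
 * on completion or free, which releases the inflight slot taken in
 * __wbt_wait().
 */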

/*
 * Disable wbt, if enabled by default.
 */
void wbt_disable_default(struct request_queue *q)
{
	struct rq_wb *rwb = q->rq_wb;

	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
		wbt_exit(q);
}
EXPORT_SYMBOL_GPL(wbt_disable_default);

/*
 * Enable wbt if defaults are configured that way
 */
void wbt_enable_default(struct request_queue *q)
{
	/* Throttling already enabled? */
	if (q->rq_wb)
		return;

	/* Queue not registered? Maybe shutting down... */
	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
		return;

	if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
	    (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
		wbt_init(q);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);

u64 wbt_default_latency_nsec(struct request_queue *q)
{
	/*
	 * We default to 2msec for non-rotational storage, and 75msec
	 * for rotational storage.
	 */
	if (blk_queue_nonrot(q))
		return 2000000ULL;
	else
		return 75000000ULL;
}

static int wbt_data_dir(const struct request *rq)
{
	const int op = req_op(rq);

	if (op == REQ_OP_READ)
		return READ;
	else if (op_is_write(op))
		return WRITE;

	/* don't account */
	return -1;
}

int wbt_init(struct request_queue *q)
{
	struct rq_wb *rwb;
	int i;

	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);

	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
	if (!rwb)
		return -ENOMEM;

	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
	if (!rwb->cb) {
		kfree(rwb);
		return -ENOMEM;
	}

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		atomic_set(&rwb->rq_wait[i].inflight, 0);
		init_waitqueue_head(&rwb->rq_wait[i].wait);
	}

	rwb->last_comp = rwb->last_issue = jiffies;
	rwb->queue = q;
	rwb->win_nsec = RWB_WINDOW_NSEC;
	rwb->enable_state = WBT_STATE_ON_DEFAULT;
	wbt_update_limits(rwb);

	/*
	 * Assign rwb and add the stats callback.
	 */
	q->rq_wb = rwb;
	blk_stat_add_callback(q, rwb->cb);

	rwb->min_lat_nsec = wbt_default_latency_nsec(q);

	wbt_set_queue_depth(rwb, blk_queue_depth(q));
	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));

	return 0;
}

void wbt_exit(struct request_queue *q)
{
	struct rq_wb *rwb = q->rq_wb;

	if (rwb) {
		blk_stat_remove_callback(q, rwb->cb);
		blk_stat_free_callback(rwb->cb);
		q->rq_wb = NULL;
		kfree(rwb);
	}
}