/*
 * buffered writeback throttling. loosely based on CoDel. We can't drop
 * packets for IO scheduling, so the logic is something like this:
 *
 * - Monitor latencies in a defined window of time.
 * - If the minimum latency in the above window exceeds some target, increment
 *   scaling step and scale down queue depth by a factor of 2x. The monitoring
 *   window is then shrunk to 100 / sqrt(scaling step + 1).
 * - For any window where we don't have solid data on what the latencies
 *   look like, retain status quo.
 * - If latencies look good, decrement scaling step.
 * - If we're only doing writes, allow the scaling step to go negative. This
 *   will temporarily boost write performance, snapping back to a stable
 *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
 *   positive scaling steps where we shrink the monitoring window, a negative
 *   scaling step retains the default step==0 window size.
 *
 * Copyright (C) 2016 Jens Axboe
 *
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>

#include "blk-wbt.h"

#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>

enum {
	/*
	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
	 * from here depending on device stats
	 */
	RWB_DEF_DEPTH	= 16,

	/*
	 * 100msec window
	 */
	RWB_WINDOW_NSEC	= 100 * 1000 * 1000ULL,

	/*
	 * Disregard stats, if we don't meet this minimum
	 */
	RWB_MIN_WRITE_SAMPLES	= 3,

	/*
	 * If we have this number of consecutive windows with not enough
	 * information to scale up or down, scale up.
	 */
	RWB_UNKNOWN_BUMP	= 5,
};

static inline bool rwb_enabled(struct rq_wb *rwb)
{
	return rwb && rwb->wb_normal != 0;
}

/*
 * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
 * false if 'v' + 1 would be bigger than 'below'.
 */
static bool atomic_inc_below(atomic_t *v, int below)
{
	int cur = atomic_read(v);

	for (;;) {
		int old;

		if (cur >= below)
			return false;
		old = atomic_cmpxchg(v, cur, cur + 1);
		if (old == cur)
			break;
		cur = old;
	}

	return true;
}
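/*
 * For example, with a limit of 8: if the counter is currently 7,
 * atomic_inc_below() advances it to 8 and returns true, admitting the
 * IO; if it is already at 8, it returns false without touching the
 * counter and the caller must wait. The cmpxchg loop only retries if
 * another task raced an increment or decrement in between.
 */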
static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
	if (rwb_enabled(rwb)) {
		const unsigned long cur = jiffies;

		if (cur != *var)
			*var = cur;
	}
}

/*
 * If a task was rate throttled in balance_dirty_pages() within the last
 * second or so, use that to indicate a higher cleaning rate.
 */
static bool wb_recent_wait(struct rq_wb *rwb)
{
	struct bdi_writeback *wb = &rwb->queue->backing_dev_info.wb;

	return time_before(jiffies, wb->dirty_sleep + HZ);
}

static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
{
	return &rwb->rq_wait[is_kswapd];
}

static void rwb_wake_all(struct rq_wb *rwb)
{
	int i;

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		struct rq_wait *rqw = &rwb->rq_wait[i];

		if (waitqueue_active(&rqw->wait))
			wake_up_all(&rqw->wait);
	}
}

void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
{
	struct rq_wait *rqw;
	int inflight, limit;

	if (!(wb_acct & WBT_TRACKED))
		return;

	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
	inflight = atomic_dec_return(&rqw->inflight);

	/*
	 * wbt got disabled with IO in flight. Wake up any potential
	 * waiters, we don't have to do more than that.
	 */
	if (unlikely(!rwb_enabled(rwb))) {
		rwb_wake_all(rwb);
		return;
	}

	/*
	 * If the device does write back caching, drop further down
	 * before we wake people up.
	 */
	if (rwb->wc && !wb_recent_wait(rwb))
		limit = 0;
	else
		limit = rwb->wb_normal;

	/*
	 * Don't wake anyone up if we are above the normal limit.
	 */
	if (inflight && inflight >= limit)
		return;

	if (waitqueue_active(&rqw->wait)) {
		int diff = limit - inflight;

		if (!inflight || diff >= rwb->wb_background / 2)
			wake_up_all(&rqw->wait);
	}
}
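/*
 * A worked example of the wake heuristic above, using the default
 * limits (wb_max/wb_normal/wb_background = 16/8/4): on a write back
 * caching device with no recent dirty-page waiter, limit is 0, so
 * waiters are only woken once inflight drains all the way to zero.
 * Otherwise limit = wb_normal = 8; a completion dropping inflight from
 * 8 to 7 gives diff = 1, below wb_background / 2 = 2, so we hold off.
 * Once inflight reaches 6, diff = 2 and the waiters are woken.
 */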
/*
 * Called on completion of a request. Note that it's also called when
 * a request is merged, at which point the request gets freed.
 */
void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb)
		return;

	if (!wbt_is_tracked(stat)) {
		if (rwb->sync_cookie == stat) {
			rwb->sync_issue = 0;
			rwb->sync_cookie = NULL;
		}

		if (wbt_is_read(stat))
			wb_timestamp(rwb, &rwb->last_comp);
		wbt_clear_state(stat);
	} else {
		WARN_ON_ONCE(stat == rwb->sync_cookie);
		__wbt_done(rwb, wbt_stat_to_mask(stat));
		wbt_clear_state(stat);
	}
}

/*
 * Return true, if we can't increase the depth further by scaling
 */
static bool calc_wb_limits(struct rq_wb *rwb)
{
	unsigned int depth;
	bool ret = false;

	if (!rwb->min_lat_nsec) {
		rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
		return false;
	}

	/*
	 * For QD=1 devices, this is a special case. It's important for those
	 * to have one request ready when one completes, so force a depth of
	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
	 * since the device can't have more than that in flight. If we're
	 * scaling down, then keep a setting of 1/1/1.
	 */
	if (rwb->queue_depth == 1) {
		if (rwb->scale_step > 0)
			rwb->wb_max = rwb->wb_normal = 1;
		else {
			rwb->wb_max = rwb->wb_normal = 2;
			ret = true;
		}
		rwb->wb_background = 1;
	} else {
		/*
		 * scale_step == 0 is our default state. If we have suffered
		 * latency spikes, step will be > 0, and we shrink the
		 * allowed write depths. If step is < 0, we're only doing
		 * writes, and we allow a temporarily higher depth to
		 * increase performance.
		 */
		depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
		if (rwb->scale_step > 0)
			depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
		else if (rwb->scale_step < 0) {
			unsigned int maxd = 3 * rwb->queue_depth / 4;

			depth = 1 + ((depth - 1) << -rwb->scale_step);
			if (depth > maxd) {
				depth = maxd;
				ret = true;
			}
		}

		/*
		 * Set our max/normal/bg queue depths based on how far
		 * we have scaled down (->scale_step).
		 */
		rwb->wb_max = depth;
		rwb->wb_normal = (rwb->wb_max + 1) / 2;
		rwb->wb_background = (rwb->wb_max + 3) / 4;
	}

	return ret;
}
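/*
 * A worked example of the scaling math above, for a device with
 * queue_depth = 32 (starting depth = min(16, 32) = 16):
 *
 *	step  2: depth = 1 + (15 >> 2) =  4 -> max/normal/bg =  4/2/1
 *	step  1: depth = 1 + (15 >> 1) =  8 -> max/normal/bg =  8/4/2
 *	step  0: depth = 16                 -> max/normal/bg = 16/8/4
 *	step -1: depth = 1 + (15 << 1) = 31, capped at 3 * 32 / 4 = 24
 *	                                    -> max/normal/bg = 24/12/6
 *
 * Hitting the 75% cap on a negative step is what makes this function
 * return true, which in turn sets ->scaled_max in scale_up().
 */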
static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
	/*
	 * We need at least one read sample, and a minimum of
	 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
	 * that it's writes impacting us, and not just some sole read on
	 * a device that is in a lower power state.
	 */
	return stat[BLK_STAT_READ].nr_samples >= 1 &&
		stat[BLK_STAT_WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES;
}

static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
	u64 now, issue = ACCESS_ONCE(rwb->sync_issue);

	if (!issue || !rwb->sync_cookie)
		return 0;

	now = ktime_to_ns(ktime_get());
	return now - issue;
}

enum {
	LAT_OK = 1,
	LAT_UNKNOWN,
	LAT_UNKNOWN_WRITES,
	LAT_EXCEEDED,
};

static int __latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
	struct backing_dev_info *bdi = &rwb->queue->backing_dev_info;
	u64 thislat;

	/*
	 * If our stored sync issue exceeds the window size, or it
	 * exceeds our min target AND we haven't logged any entries,
	 * flag the latency as exceeded. wbt works off completion latencies,
	 * but for a flooded device, a single sync IO can take a long time
	 * to complete after being issued. If this time exceeds our
	 * monitoring window AND we didn't see any other completions in that
	 * window, then count that sync IO as a violation of the latency.
	 */
	thislat = rwb_sync_issue_lat(rwb);
	if (thislat > rwb->cur_win_nsec ||
	    (thislat > rwb->min_lat_nsec && !stat[BLK_STAT_READ].nr_samples)) {
		trace_wbt_lat(bdi, thislat);
		return LAT_EXCEEDED;
	}

	/*
	 * No read/write mix, if stat isn't valid
	 */
	if (!stat_sample_valid(stat)) {
		/*
		 * If we had writes in this stat window and the window is
		 * current, we're only doing writes. If a task recently
		 * waited or still has writes in flight, consider us doing
		 * just writes as well.
		 */
		if ((stat[BLK_STAT_WRITE].nr_samples && blk_stat_is_current(stat)) ||
		    wb_recent_wait(rwb) || wbt_inflight(rwb))
			return LAT_UNKNOWN_WRITES;
		return LAT_UNKNOWN;
	}

	/*
	 * If the 'min' latency exceeds our target, step down.
	 */
	if (stat[BLK_STAT_READ].min > rwb->min_lat_nsec) {
		trace_wbt_lat(bdi, stat[BLK_STAT_READ].min);
		trace_wbt_stat(bdi, stat);
		return LAT_EXCEEDED;
	}

	if (rwb->scale_step)
		trace_wbt_stat(bdi, stat);

	return LAT_OK;
}
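/*
 * To summarize, the checks above resolve as follows:
 *
 *	- sync read outstanding past the window, or past the target with
 *	  no other read completions logged	-> LAT_EXCEEDED
 *	- too few samples, but writes in the current window, a recent
 *	  dirty-page waiter, or IO in flight	-> LAT_UNKNOWN_WRITES
 *	- too few samples otherwise		-> LAT_UNKNOWN
 *	- minimum read latency over the target	-> LAT_EXCEEDED
 *	- a valid sample under the target	-> LAT_OK
 */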
static int latency_exceeded(struct rq_wb *rwb)
{
	struct blk_rq_stat stat[2];

	blk_queue_stat_get(rwb->queue, stat);
	return __latency_exceeded(rwb, stat);
}

static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
	struct backing_dev_info *bdi = &rwb->queue->backing_dev_info;

	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
			rwb->wb_background, rwb->wb_normal, rwb->wb_max);
}

static void scale_up(struct rq_wb *rwb)
{
	/*
	 * Hit max in previous round, stop here
	 */
	if (rwb->scaled_max)
		return;

	rwb->scale_step--;
	rwb->unknown_cnt = 0;
	blk_stat_clear(rwb->queue);

	rwb->scaled_max = calc_wb_limits(rwb);

	rwb_wake_all(rwb);

	rwb_trace_step(rwb, "step up");
}

/*
 * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
 * had a latency violation.
 */
static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
	/*
	 * Stop scaling down when we've hit the limit. This also prevents
	 * ->scale_step from going to crazy values, if the device can't
	 * keep up.
	 */
	if (rwb->wb_max == 1)
		return;

	if (rwb->scale_step < 0 && hard_throttle)
		rwb->scale_step = 0;
	else
		rwb->scale_step++;

	rwb->scaled_max = false;
	rwb->unknown_cnt = 0;
	blk_stat_clear(rwb->queue);
	calc_wb_limits(rwb);
	rwb_trace_step(rwb, "step down");
}
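/*
 * rwb_arm_timer() below sizes the next window as roughly
 * win_nsec / sqrt(scale_step + 1), computed in fixed point as
 * (win_nsec << 4) / int_sqrt((scale_step + 1) << 8). For the default
 * 100msec window: step 1 gives 1600 / int_sqrt(512) = 1600 / 22, about
 * 72msec; step 3 gives 1600 / int_sqrt(1024) = 1600 / 32 = 50msec.
 * Steps <= 0 keep the full 100msec window.
 */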
static void rwb_arm_timer(struct rq_wb *rwb)
{
	unsigned long expires;

	if (rwb->scale_step > 0) {
		/*
		 * We should speed this up, using some variant of a fast
		 * integer inverse square root calculation. Since we only do
		 * this for every window expiration, it's not a huge deal,
		 * though.
		 */
		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
					int_sqrt((rwb->scale_step + 1) << 8));
	} else {
		/*
		 * For step < 0, we don't want to increase/decrease the
		 * window size.
		 */
		rwb->cur_win_nsec = rwb->win_nsec;
	}

	expires = jiffies + nsecs_to_jiffies(rwb->cur_win_nsec);
	mod_timer(&rwb->window_timer, expires);
}

static void wb_timer_fn(unsigned long data)
{
	struct rq_wb *rwb = (struct rq_wb *) data;
	unsigned int inflight = wbt_inflight(rwb);
	int status;

	status = latency_exceeded(rwb);

	trace_wbt_timer(&rwb->queue->backing_dev_info, status, rwb->scale_step,
			inflight);

	/*
	 * If we exceeded the latency target, step down. If we did not,
	 * step one level up. If we don't know enough to say either exceeded
	 * or ok, then don't do anything.
	 */
	switch (status) {
	case LAT_EXCEEDED:
		scale_down(rwb, true);
		break;
	case LAT_OK:
		scale_up(rwb);
		break;
	case LAT_UNKNOWN_WRITES:
		/*
		 * We started at the center step, and while we don't have a
		 * valid read/write sample, we do have writes going on.
		 * Allow the step to go negative, to increase write perf.
		 */
		scale_up(rwb);
		break;
	case LAT_UNKNOWN:
		if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
			break;
		/*
		 * We get here when we previously scaled the depth up or
		 * down, and we currently don't have a valid read/write
		 * sample. For that case, slowly return to center state
		 * (step == 0).
		 */
		if (rwb->scale_step > 0)
			scale_up(rwb);
		else if (rwb->scale_step < 0)
			scale_down(rwb, false);
		break;
	default:
		break;
	}

	/*
	 * Re-arm timer, if we have IO in flight
	 */
	if (rwb->scale_step || inflight)
		rwb_arm_timer(rwb);
}
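/*
 * An illustrative feedback cycle on rotational storage (75msec target):
 * a window closes with a minimum read latency of 90msec, so the timer
 * sees LAT_EXCEEDED and calls scale_down(rwb, true); the write depth
 * halves (e.g. 16 -> 8) and the next window shrinks to ~72msec. If the
 * following window completes with reads under the target, LAT_OK scales
 * back up toward step 0. RWB_UNKNOWN_BUMP (5) consecutive LAT_UNKNOWN
 * windows likewise nudge the step back toward the center.
 */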
void wbt_update_limits(struct rq_wb *rwb)
{
	rwb->scale_step = 0;
	rwb->scaled_max = false;
	calc_wb_limits(rwb);

	rwb_wake_all(rwb);
}

static bool close_io(struct rq_wb *rwb)
{
	const unsigned long now = jiffies;

	return time_before(now, rwb->last_issue + HZ / 10) ||
		time_before(now, rwb->last_comp + HZ / 10);
}

#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)

static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
{
	unsigned int limit;

	/*
	 * At this point we know it's a buffered write. If this is
	 * kswapd trying to free memory, or REQ_SYNC is set, then
	 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
	 * that. If the write is marked as a background write, then use
	 * the idle limit, or go to normal if we haven't had competing
	 * IO for a bit.
	 */
	if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
		limit = rwb->wb_max;
	else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
		/*
		 * If less than 100ms since we completed unrelated IO,
		 * limit us to half the normal depth for background
		 * writeback.
		 */
		limit = rwb->wb_background;
	} else
		limit = rwb->wb_normal;

	return limit;
}
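/*
 * With the default limits (max/normal/background = 16/8/4), the
 * selection above works out to: kswapd, REQ_SYNC/REQ_META/REQ_PRIO
 * writes, and writes following a recent balance_dirty_pages() wait may
 * use all 16; REQ_BACKGROUND writeback, or any write within 100msec of
 * unrelated IO, is held to 4; everything else gets the normal limit of 8.
 */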
static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
			     wait_queue_t *wait, unsigned long rw)
{
	/*
	 * inc it here even if disabled, since we'll dec it at completion.
	 * this only happens if the task was sleeping in __wbt_wait(),
	 * and someone turned it off at the same time.
	 */
	if (!rwb_enabled(rwb)) {
		atomic_inc(&rqw->inflight);
		return true;
	}

	/*
	 * If the waitqueue is already active and we are not the next
	 * in line to be woken up, wait for our turn.
	 */
	if (waitqueue_active(&rqw->wait) &&
	    rqw->wait.task_list.next != &wait->task_list)
		return false;

	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
}

/*
 * Block if we will exceed our limit, or if we are currently waiting for
 * the timer to kick off queuing again.
 */
static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
{
	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
	DEFINE_WAIT(wait);

	if (may_queue(rwb, rqw, &wait, rw))
		return;

	do {
		prepare_to_wait_exclusive(&rqw->wait, &wait,
						TASK_UNINTERRUPTIBLE);

		if (may_queue(rwb, rqw, &wait, rw))
			break;

		if (lock)
			spin_unlock_irq(lock);

		io_schedule();

		if (lock)
			spin_lock_irq(lock);
	} while (1);

	finish_wait(&rqw->wait, &wait);
}

static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
{
	const int op = bio_op(bio);

	/*
	 * If not a WRITE, do nothing
	 */
	if (op != REQ_OP_WRITE)
		return false;

	/*
	 * Don't throttle WRITE_ODIRECT
	 */
	if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE))
		return false;

	return true;
}
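/*
 * For example, an O_DIRECT write arrives as REQ_OP_WRITE with both
 * REQ_SYNC and REQ_IDLE set (WRITE_ODIRECT), so it passes through
 * unthrottled, while plain buffered writeback (REQ_OP_WRITE without
 * that flag combination) is subject to throttling. Reads and discards
 * are never throttled here.
 */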
/*
 * Returns a set of wbt flags if the IO request should be accounted,
 * 0 if not. May sleep, if we have exceeded the writeback limits.
 * Caller can pass in an irq-held spinlock, if it holds one when
 * calling this function. If we do sleep, we'll release and re-grab it.
 */
unsigned int wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
{
	unsigned int ret = 0;

	if (!rwb_enabled(rwb))
		return 0;

	if (bio_op(bio) == REQ_OP_READ)
		ret = WBT_READ;

	if (!wbt_should_throttle(rwb, bio)) {
		if (ret & WBT_READ)
			wb_timestamp(rwb, &rwb->last_issue);
		return ret;
	}

	__wbt_wait(rwb, bio->bi_opf, lock);

	if (!timer_pending(&rwb->window_timer))
		rwb_arm_timer(rwb);

	if (current_is_kswapd())
		ret |= WBT_KSWAPD;

	return ret | WBT_TRACKED;
}

void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb_enabled(rwb))
		return;

	/*
	 * Track sync issue, in case it takes a long time to complete. Allows
	 * us to react quicker, if a sync IO takes a long time to complete.
	 * Note that this is just a hint. 'stat' can go away when the
	 * request completes, so it's important we never dereference it. We
	 * only use the address to compare with, which is why we store the
	 * sync_issue time locally.
	 */
	if (wbt_is_read(stat) && !rwb->sync_issue) {
		rwb->sync_cookie = stat;
		rwb->sync_issue = blk_stat_time(stat);
	}
}

void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb_enabled(rwb))
		return;
	if (stat == rwb->sync_cookie) {
		rwb->sync_issue = 0;
		rwb->sync_cookie = NULL;
	}
}

void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
{
	if (rwb) {
		rwb->queue_depth = depth;
		wbt_update_limits(rwb);
	}
}

void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
{
	if (rwb)
		rwb->wc = write_cache_on;
}
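/*
 * A minimal sketch of how a submission path is expected to drive this
 * API (hypothetical caller; the real call sites live in the block core,
 * not in this file):
 *
 *	unsigned int wb_acct;
 *
 *	wb_acct = wbt_wait(q->rq_wb, bio, q->queue_lock);
 *	... allocate the request and record wb_acct in its issue stat ...
 *	wbt_issue(q->rq_wb, &rq->issue_stat);
 *	... on completion, or when the request is merged away ...
 *	wbt_done(q->rq_wb, &rq->issue_stat);
 *
 * A requeue should go through wbt_requeue(), so that a tracked sync
 * cookie is dropped rather than counted as a latency violation.
 */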
/*
 * Disable wbt, if enabled by default. Only called from CFQ, if we have
 * cgroups enabled
 */
void wbt_disable_default(struct request_queue *q)
{
	struct rq_wb *rwb = q->rq_wb;

	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT) {
		del_timer_sync(&rwb->window_timer);
		rwb->win_nsec = rwb->min_lat_nsec = 0;
		wbt_update_limits(rwb);
	}
}
EXPORT_SYMBOL_GPL(wbt_disable_default);

u64 wbt_default_latency_nsec(struct request_queue *q)
{
	/*
	 * We default to 2msec for non-rotational storage, and 75msec
	 * for rotational storage.
	 */
	if (blk_queue_nonrot(q))
		return 2000000ULL;
	else
		return 75000000ULL;
}

int wbt_init(struct request_queue *q)
{
	struct rq_wb *rwb;
	int i;

	/*
	 * For now, we depend on the stats window being larger than
	 * our monitoring window. Ensure that this isn't inadvertently
	 * violated.
	 */
	BUILD_BUG_ON(RWB_WINDOW_NSEC > BLK_STAT_NSEC);
	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);

	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
	if (!rwb)
		return -ENOMEM;

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		atomic_set(&rwb->rq_wait[i].inflight, 0);
		init_waitqueue_head(&rwb->rq_wait[i].wait);
	}

	setup_timer(&rwb->window_timer, wb_timer_fn, (unsigned long) rwb);
	rwb->wc = 1;
	rwb->queue_depth = RWB_DEF_DEPTH;
	rwb->last_comp = rwb->last_issue = jiffies;
	rwb->queue = q;
	rwb->win_nsec = RWB_WINDOW_NSEC;
	rwb->enable_state = WBT_STATE_ON_DEFAULT;
	wbt_update_limits(rwb);

	/*
	 * Assign rwb, and turn on stats tracking for this queue
	 */
	q->rq_wb = rwb;
	blk_stat_enable(q);

	rwb->min_lat_nsec = wbt_default_latency_nsec(q);

	wbt_set_queue_depth(rwb, blk_queue_depth(q));
	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));

	return 0;
}

void wbt_exit(struct request_queue *q)
{
	struct rq_wb *rwb = q->rq_wb;

	if (rwb) {
		del_timer_sync(&rwb->window_timer);
		q->rq_wb = NULL;
		kfree(rwb);
	}
}