/*
 * buffered writeback throttling. loosely based on CoDel. We can't drop
 * packets for IO scheduling, so the logic is something like this:
 *
 * - Monitor latencies in a defined window of time.
 * - If the minimum latency in the above window exceeds some target, increment
 *   scaling step and scale down queue depth by a factor of 2x. The monitoring
 *   window is then shrunk to 100 / sqrt(scaling step + 1).
 * - For any window where we don't have solid data on what the latencies
 *   look like, retain status quo.
 * - If latencies look good, decrement scaling step.
 * - If we're only doing writes, allow the scaling step to go negative. This
 *   will temporarily boost write performance, snapping back to a stable
 *   scaling step of 0 if reads show up or the heavy writers finish. Unlike
 *   positive scaling steps where we shrink the monitoring window, a negative
 *   scaling step retains the default step==0 window size.
 *
 * Copyright (C) 2016 Jens Axboe
 *
 */
#include <linux/kernel.h>
#include <linux/blk_types.h>
#include <linux/slab.h>
#include <linux/backing-dev.h>
#include <linux/swap.h>

#include "blk-wbt.h"

#define CREATE_TRACE_POINTS
#include <trace/events/wbt.h>

enum {
	/*
	 * Default setting, we'll scale up (to 75% of QD max) or down (min 1)
	 * from here depending on device stats
	 */
	RWB_DEF_DEPTH = 16,

	/*
	 * 100msec window
	 */
	RWB_WINDOW_NSEC = 100 * 1000 * 1000ULL,

	/*
	 * Disregard stats, if we don't meet this minimum
	 */
	RWB_MIN_WRITE_SAMPLES = 3,

	/*
	 * If we have this number of consecutive windows with not enough
	 * information to scale up or down, scale up.
	 */
	RWB_UNKNOWN_BUMP = 5,
};

/*
 * Throttling is considered enabled only when a non-zero normal depth has
 * been computed (calc_wb_limits() zeroes all depths when min_lat_nsec == 0).
 */
static inline bool rwb_enabled(struct rq_wb *rwb)
{
	return rwb && rwb->wb_normal != 0;
}

/*
 * Increment 'v', if 'v' is below 'below'. Returns true if we succeeded,
 * false if 'v' + 1 would be bigger than 'below'.
 */
static bool atomic_inc_below(atomic_t *v, int below)
{
	int cur = atomic_read(v);

	for (;;) {
		int old;

		if (cur >= below)
			return false;
		/* cmpxchg returns the prior value; a mismatch means we raced */
		old = atomic_cmpxchg(v, cur, cur + 1);
		if (old == cur)
			break;
		/* lost the race, retry with the freshly observed value */
		cur = old;
	}

	return true;
}

/*
 * Record "now" (in jiffies) in *var, avoiding a redundant store if the
 * jiffies value hasn't changed since the last caller.
 */
static void wb_timestamp(struct rq_wb *rwb, unsigned long *var)
{
	if (rwb_enabled(rwb)) {
		const unsigned long cur = jiffies;

		if (cur != *var)
			*var = cur;
	}
}

/*
 * If a task was rate throttled in balance_dirty_pages() within the last
 * second or so, use that to indicate a higher cleaning rate.
 */
static bool wb_recent_wait(struct rq_wb *rwb)
{
	struct bdi_writeback *wb = &rwb->queue->backing_dev_info->wb;

	return time_before(jiffies, wb->dirty_sleep + HZ);
}

/* Select the wait queue: index 1 for kswapd, 0 for everyone else. */
static inline struct rq_wait *get_rq_wait(struct rq_wb *rwb, bool is_kswapd)
{
	return &rwb->rq_wait[is_kswapd];
}

/* Wake all sleepers on every throttle queue (used on limit changes/disable). */
static void rwb_wake_all(struct rq_wb *rwb)
{
	int i;

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		struct rq_wait *rqw = &rwb->rq_wait[i];

		if (waitqueue_active(&rqw->wait))
			wake_up_all(&rqw->wait);
	}
}

/*
 * Drop the inflight count for a tracked request and wake waiters if we
 * fell below the applicable depth limit.
 */
void __wbt_done(struct rq_wb *rwb, enum wbt_flags wb_acct)
{
	struct rq_wait *rqw;
	int inflight, limit;

	if (!(wb_acct & WBT_TRACKED))
		return;

	rqw = get_rq_wait(rwb, wb_acct & WBT_KSWAPD);
	inflight = atomic_dec_return(&rqw->inflight);

	/*
	 * wbt got disabled with IO in flight. Wake up any potential
	 * waiters, we don't have to do more than that.
	 */
	if (unlikely(!rwb_enabled(rwb))) {
		rwb_wake_all(rwb);
		return;
	}

	/*
	 * If the device does write back caching, drop further down
	 * before we wake people up.
	 */
	if (rwb->wc && !wb_recent_wait(rwb))
		limit = 0;
	else
		limit = rwb->wb_normal;

	/*
	 * Don't wake anyone up if we are above the normal limit.
	 */
	if (inflight && inflight >= limit)
		return;

	if (waitqueue_active(&rqw->wait)) {
		int diff = limit - inflight;

		/* only wake when we've freed a meaningful chunk of depth */
		if (!inflight || diff >= rwb->wb_background / 2)
			wake_up_all(&rqw->wait);
	}
}

/*
 * Called on completion of a request. Note that it's also called when
 * a request is merged, when the request gets freed.
 */
void wbt_done(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb)
		return;

	if (!wbt_is_tracked(stat)) {
		/* untracked request: clear the sync cookie if it was ours */
		if (rwb->sync_cookie == stat) {
			rwb->sync_issue = 0;
			rwb->sync_cookie = NULL;
		}

		if (wbt_is_read(stat))
			wb_timestamp(rwb, &rwb->last_comp);
		wbt_clear_state(stat);
	} else {
		/* tracked requests should never have been the sync cookie */
		WARN_ON_ONCE(stat == rwb->sync_cookie);
		__wbt_done(rwb, wbt_stat_to_mask(stat));
		wbt_clear_state(stat);
	}
}

/*
 * Return true, if we can't increase the depth further by scaling
 */
static bool calc_wb_limits(struct rq_wb *rwb)
{
	unsigned int depth;
	bool ret = false;

	/* no latency target means throttling is off: zero all depths */
	if (!rwb->min_lat_nsec) {
		rwb->wb_max = rwb->wb_normal = rwb->wb_background = 0;
		return false;
	}

	/*
	 * For QD=1 devices, this is a special case. It's important for those
	 * to have one request ready when one completes, so force a depth of
	 * 2 for those devices. On the backend, it'll be a depth of 1 anyway,
	 * since the device can't have more than that in flight. If we're
	 * scaling down, then keep a setting of 1/1/1.
	 */
	if (rwb->queue_depth == 1) {
		if (rwb->scale_step > 0)
			rwb->wb_max = rwb->wb_normal = 1;
		else {
			rwb->wb_max = rwb->wb_normal = 2;
			ret = true;
		}
		rwb->wb_background = 1;
	} else {
		/*
		 * scale_step == 0 is our default state. If we have suffered
		 * latency spikes, step will be > 0, and we shrink the
		 * allowed write depths. If step is < 0, we're only doing
		 * writes, and we allow a temporarily higher depth to
		 * increase performance.
		 */
		depth = min_t(unsigned int, RWB_DEF_DEPTH, rwb->queue_depth);
		if (rwb->scale_step > 0)
			/* halve per step; cap shift at 31 to avoid UB */
			depth = 1 + ((depth - 1) >> min(31, rwb->scale_step));
		else if (rwb->scale_step < 0) {
			unsigned int maxd = 3 * rwb->queue_depth / 4;

			/* boost: double per negative step, cap at 75% of QD */
			depth = 1 + ((depth - 1) << -rwb->scale_step);
			if (depth > maxd) {
				depth = maxd;
				ret = true;
			}
		}

		/*
		 * Set our max/normal/bg queue depths based on how far
		 * we have scaled down (->scale_step).
		 */
		rwb->wb_max = depth;
		rwb->wb_normal = (rwb->wb_max + 1) / 2;
		rwb->wb_background = (rwb->wb_max + 3) / 4;
	}

	return ret;
}

static inline bool stat_sample_valid(struct blk_rq_stat *stat)
{
	/*
	 * We need at least one read sample, and a minimum of
	 * RWB_MIN_WRITE_SAMPLES. We require some write samples to know
	 * that it's writes impacting us, and not just some sole read on
	 * a device that is in a lower power state.
	 */
	return (stat[READ].nr_samples >= 1 &&
		stat[WRITE].nr_samples >= RWB_MIN_WRITE_SAMPLES);
}

/*
 * How long the oldest outstanding tracked sync IO has been in flight, in
 * nanoseconds. Returns 0 if nothing is being tracked.
 */
static u64 rwb_sync_issue_lat(struct rq_wb *rwb)
{
	u64 now, issue = ACCESS_ONCE(rwb->sync_issue);

	if (!issue || !rwb->sync_cookie)
		return 0;

	now = ktime_to_ns(ktime_get());
	return now - issue;
}

enum {
	LAT_OK = 1,
	LAT_UNKNOWN,
	LAT_UNKNOWN_WRITES,
	LAT_EXCEEDED,
};

static int latency_exceeded(struct rq_wb *rwb, struct blk_rq_stat *stat)
{
	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;
	u64 thislat;

	/*
	 * If our stored sync issue exceeds the window size, or it
	 * exceeds our min target AND we haven't logged any entries,
	 * flag the latency as exceeded. wbt works off completion latencies,
	 * but for a flooded device, a single sync IO can take a long time
	 * to complete after being issued. If this time exceeds our
	 * monitoring window AND we didn't see any other completions in that
	 * window, then count that sync IO as a violation of the latency.
	 */
	thislat = rwb_sync_issue_lat(rwb);
	if (thislat > rwb->cur_win_nsec ||
	    (thislat > rwb->min_lat_nsec && !stat[READ].nr_samples)) {
		trace_wbt_lat(bdi, thislat);
		return LAT_EXCEEDED;
	}

	/*
	 * No read/write mix, if stat isn't valid
	 */
	if (!stat_sample_valid(stat)) {
		/*
		 * If we had writes in this stat window and the window is
		 * current, we're only doing writes. If a task recently
		 * waited or still has writes in flight, consider us doing
		 * just writes as well.
		 */
		if (stat[WRITE].nr_samples || wb_recent_wait(rwb) ||
		    wbt_inflight(rwb))
			return LAT_UNKNOWN_WRITES;
		return LAT_UNKNOWN;
	}

	/*
	 * If the 'min' latency exceeds our target, step down.
	 */
	if (stat[READ].min > rwb->min_lat_nsec) {
		trace_wbt_lat(bdi, stat[READ].min);
		trace_wbt_stat(bdi, stat);
		return LAT_EXCEEDED;
	}

	/* only trace stats while we're off the default step */
	if (rwb->scale_step)
		trace_wbt_stat(bdi, stat);

	return LAT_OK;
}

/* Emit a tracepoint describing the current scaling state. */
static void rwb_trace_step(struct rq_wb *rwb, const char *msg)
{
	struct backing_dev_info *bdi = rwb->queue->backing_dev_info;

	trace_wbt_step(bdi, msg, rwb->scale_step, rwb->cur_win_nsec,
			rwb->wb_background, rwb->wb_normal, rwb->wb_max);
}

/* Increase allowed depth by one step and wake anyone throttled. */
static void scale_up(struct rq_wb *rwb)
{
	/*
	 * Hit max in previous round, stop here
	 */
	if (rwb->scaled_max)
		return;

	rwb->scale_step--;
	rwb->unknown_cnt = 0;

	rwb->scaled_max = calc_wb_limits(rwb);

	rwb_wake_all(rwb);

	rwb_trace_step(rwb, "step up");
}

/*
 * Scale rwb down. If 'hard_throttle' is set, do it quicker, since we
 * had a latency violation.
 */
static void scale_down(struct rq_wb *rwb, bool hard_throttle)
{
	/*
	 * Stop scaling down when we've hit the limit. This also prevents
	 * ->scale_step from going to crazy values, if the device can't
	 * keep up.
	 */
	if (rwb->wb_max == 1)
		return;

	/* on a hard violation, snap a write-boosted (negative) step to 0 */
	if (rwb->scale_step < 0 && hard_throttle)
		rwb->scale_step = 0;
	else
		rwb->scale_step++;

	rwb->scaled_max = false;
	rwb->unknown_cnt = 0;
	calc_wb_limits(rwb);
	rwb_trace_step(rwb, "step down");
}

/*
 * Compute the next monitoring window length (shrinks as scale_step grows)
 * and arm the stats callback for it.
 */
static void rwb_arm_timer(struct rq_wb *rwb)
{
	if (rwb->scale_step > 0) {
		/*
		 * We should speed this up, using some variant of a fast
		 * integer inverse square root calculation. Since we only do
		 * this for every window expiration, it's not a huge deal,
		 * though.
		 */
		rwb->cur_win_nsec = div_u64(rwb->win_nsec << 4,
					int_sqrt((rwb->scale_step + 1) << 8));
	} else {
		/*
		 * For step < 0, we don't want to increase/decrease the
		 * window size.
		 */
		rwb->cur_win_nsec = rwb->win_nsec;
	}

	blk_stat_activate_nsecs(rwb->cb, rwb->cur_win_nsec);
}

/*
 * Stats-window expiry callback: classify the window's latencies and adjust
 * the scaling step accordingly.
 */
static void wb_timer_fn(struct blk_stat_callback *cb)
{
	struct rq_wb *rwb = cb->data;
	unsigned int inflight = wbt_inflight(rwb);
	int status;

	status = latency_exceeded(rwb, cb->stat);

	trace_wbt_timer(rwb->queue->backing_dev_info, status, rwb->scale_step,
			inflight);

	/*
	 * If we exceeded the latency target, step down. If we did not,
	 * step one level up. If we don't know enough to say either exceeded
	 * or ok, then don't do anything.
	 */
	switch (status) {
	case LAT_EXCEEDED:
		scale_down(rwb, true);
		break;
	case LAT_OK:
		scale_up(rwb);
		break;
	case LAT_UNKNOWN_WRITES:
		/*
		 * We started at the center step, but don't have a valid
		 * read/write sample, but we do have writes going on.
		 * Allow step to go negative, to increase write perf.
		 */
		scale_up(rwb);
		break;
	case LAT_UNKNOWN:
		if (++rwb->unknown_cnt < RWB_UNKNOWN_BUMP)
			break;
		/*
		 * We get here when previously scaled reduced depth, and we
		 * currently don't have a valid read/write sample. For that
		 * case, slowly return to center state (step == 0).
		 */
		if (rwb->scale_step > 0)
			scale_up(rwb);
		else if (rwb->scale_step < 0)
			scale_down(rwb, false);
		break;
	default:
		break;
	}

	/*
	 * Re-arm timer, if we have IO in flight
	 */
	if (rwb->scale_step || inflight)
		rwb_arm_timer(rwb);
}

/* Reset scaling to the default (step 0) state and recompute depths. */
void wbt_update_limits(struct rq_wb *rwb)
{
	rwb->scale_step = 0;
	rwb->scaled_max = false;
	calc_wb_limits(rwb);

	rwb_wake_all(rwb);
}

/* True if unrelated IO was issued or completed within the last ~100ms. */
static bool close_io(struct rq_wb *rwb)
{
	const unsigned long now = jiffies;

	return time_before(now, rwb->last_issue + HZ / 10) ||
		time_before(now, rwb->last_comp + HZ / 10);
}

#define REQ_HIPRIO	(REQ_SYNC | REQ_META | REQ_PRIO)

/*
 * Pick the inflight depth limit that applies to this write, based on its
 * priority and recent IO activity.
 */
static inline unsigned int get_limit(struct rq_wb *rwb, unsigned long rw)
{
	unsigned int limit;

	/*
	 * At this point we know it's a buffered write. If this is
	 * kswapd trying to free memory, or REQ_SYNC is set, then
	 * it's WB_SYNC_ALL writeback, and we'll use the max limit for
	 * that. If the write is marked as a background write, then use
	 * the idle limit, or go to normal if we haven't had competing
	 * IO for a bit.
	 */
	if ((rw & REQ_HIPRIO) || wb_recent_wait(rwb) || current_is_kswapd())
		limit = rwb->wb_max;
	else if ((rw & REQ_BACKGROUND) || close_io(rwb)) {
		/*
		 * If less than 100ms since we completed unrelated IO,
		 * limit us to half the depth for background writeback.
		 */
		limit = rwb->wb_background;
	} else
		limit = rwb->wb_normal;

	return limit;
}

/*
 * Try to grab an inflight slot. Returns true on success; false means the
 * caller must sleep and retry.
 */
static inline bool may_queue(struct rq_wb *rwb, struct rq_wait *rqw,
			     wait_queue_entry_t *wait, unsigned long rw)
{
	/*
	 * inc it here even if disabled, since we'll dec it at completion.
	 * this only happens if the task was sleeping in __wbt_wait(),
	 * and someone turned it off at the same time.
	 */
	if (!rwb_enabled(rwb)) {
		atomic_inc(&rqw->inflight);
		return true;
	}

	/*
	 * If the waitqueue is already active and we are not the next
	 * in line to be woken up, wait for our turn.
	 */
	if (waitqueue_active(&rqw->wait) &&
	    rqw->wait.task_list.next != &wait->task_list)
		return false;

	return atomic_inc_below(&rqw->inflight, get_limit(rwb, rw));
}

/*
 * Block if we will exceed our limit, or if we are currently waiting for
 * the timer to kick off queuing again.
 */
static void __wbt_wait(struct rq_wb *rwb, unsigned long rw, spinlock_t *lock)
	__releases(lock)
	__acquires(lock)
{
	struct rq_wait *rqw = get_rq_wait(rwb, current_is_kswapd());
	DEFINE_WAIT(wait);

	/* fast path: got a slot without queueing on the waitqueue */
	if (may_queue(rwb, rqw, &wait, rw))
		return;

	do {
		prepare_to_wait_exclusive(&rqw->wait, &wait,
						TASK_UNINTERRUPTIBLE);

		/* re-check after queueing, to avoid a missed wakeup */
		if (may_queue(rwb, rqw, &wait, rw))
			break;

		if (lock) {
			/* caller held an irq lock; drop it across the sleep */
			spin_unlock_irq(lock);
			io_schedule();
			spin_lock_irq(lock);
		} else
			io_schedule();
	} while (1);

	finish_wait(&rqw->wait, &wait);
}

static inline bool wbt_should_throttle(struct rq_wb *rwb, struct bio *bio)
{
	const int op = bio_op(bio);

	/*
	 * If not a WRITE, do nothing
	 */
	if (op != REQ_OP_WRITE)
		return false;

	/*
	 * Don't throttle WRITE_ODIRECT
	 */
	if ((bio->bi_opf & (REQ_SYNC | REQ_IDLE)) == (REQ_SYNC | REQ_IDLE))
		return false;

	return true;
}

/*
 * Returns true if the IO request should be accounted, false if not.
 * May sleep, if we have exceeded the writeback limits. Caller can pass
 * in an irq held spinlock, if it holds one when calling this function.
 * If we do sleep, we'll release and re-grab it.
 */
enum wbt_flags wbt_wait(struct rq_wb *rwb, struct bio *bio, spinlock_t *lock)
{
	unsigned int ret = 0;

	if (!rwb_enabled(rwb))
		return 0;

	if (bio_op(bio) == REQ_OP_READ)
		ret = WBT_READ;

	if (!wbt_should_throttle(rwb, bio)) {
		/* not throttled; still timestamp reads for close_io() */
		if (ret & WBT_READ)
			wb_timestamp(rwb, &rwb->last_issue);
		return ret;
	}

	__wbt_wait(rwb, bio->bi_opf, lock);

	/* make sure a monitoring window is running now that IO is tracked */
	if (!blk_stat_is_active(rwb->cb))
		rwb_arm_timer(rwb);

	if (current_is_kswapd())
		ret |= WBT_KSWAPD;

	return ret | WBT_TRACKED;
}

void wbt_issue(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb_enabled(rwb))
		return;

	/*
	 * Track sync issue, in case it takes a long time to complete. Allows
	 * us to react quicker, if a sync IO takes a long time to complete.
	 * Note that this is just a hint. 'stat' can go away when the
	 * request completes, so it's important we never dereference it. We
	 * only use the address to compare with, which is why we store the
	 * sync_issue time locally.
	 */
	if (wbt_is_read(stat) && !rwb->sync_issue) {
		rwb->sync_cookie = stat;
		rwb->sync_issue = blk_stat_time(stat);
	}
}

/* A requeued request is no longer a useful sync-latency sample; drop it. */
void wbt_requeue(struct rq_wb *rwb, struct blk_issue_stat *stat)
{
	if (!rwb_enabled(rwb))
		return;
	if (stat == rwb->sync_cookie) {
		rwb->sync_issue = 0;
		rwb->sync_cookie = NULL;
	}
}

/* Update the device queue depth and recompute the throttle limits. */
void wbt_set_queue_depth(struct rq_wb *rwb, unsigned int depth)
{
	if (rwb) {
		rwb->queue_depth = depth;
		wbt_update_limits(rwb);
	}
}

/* Record whether the device uses a write back cache (see __wbt_done()). */
void wbt_set_write_cache(struct rq_wb *rwb, bool write_cache_on)
{
	if (rwb)
		rwb->wc = write_cache_on;
}

/*
 * Disable wbt, if enabled by default. Only called from CFQ.
 */
void wbt_disable_default(struct request_queue *q)
{
	struct rq_wb *rwb = q->rq_wb;

	/* only tear down if wbt was enabled by default, not by the user */
	if (rwb && rwb->enable_state == WBT_STATE_ON_DEFAULT)
		wbt_exit(q);
}
EXPORT_SYMBOL_GPL(wbt_disable_default);

/*
 * Enable wbt if defaults are configured that way
 */
void wbt_enable_default(struct request_queue *q)
{
	/* Throttling already enabled? */
	if (q->rq_wb)
		return;

	/* Queue not registered? Maybe shutting down... */
	if (!test_bit(QUEUE_FLAG_REGISTERED, &q->queue_flags))
		return;

	if ((q->mq_ops && IS_ENABLED(CONFIG_BLK_WBT_MQ)) ||
	    (q->request_fn && IS_ENABLED(CONFIG_BLK_WBT_SQ)))
		wbt_init(q);
}
EXPORT_SYMBOL_GPL(wbt_enable_default);

u64 wbt_default_latency_nsec(struct request_queue *q)
{
	/*
	 * We default to 2msec for non-rotational storage, and 75msec
	 * for rotational storage.
	 */
	if (blk_queue_nonrot(q))
		return 2000000ULL;
	else
		return 75000000ULL;
}

/* Bucket requests by data direction for the two-bucket stats callback. */
static int wbt_data_dir(const struct request *rq)
{
	return rq_data_dir(rq);
}

/*
 * Allocate and attach a rq_wb to the queue. Returns 0 on success,
 * -ENOMEM on allocation failure. On success, q->rq_wb owns the rq_wb;
 * it is freed by wbt_exit().
 */
int wbt_init(struct request_queue *q)
{
	struct rq_wb *rwb;
	int i;

	BUILD_BUG_ON(WBT_NR_BITS > BLK_STAT_RES_BITS);

	rwb = kzalloc(sizeof(*rwb), GFP_KERNEL);
	if (!rwb)
		return -ENOMEM;

	rwb->cb = blk_stat_alloc_callback(wb_timer_fn, wbt_data_dir, 2, rwb);
	if (!rwb->cb) {
		kfree(rwb);
		return -ENOMEM;
	}

	for (i = 0; i < WBT_NUM_RWQ; i++) {
		atomic_set(&rwb->rq_wait[i].inflight, 0);
		init_waitqueue_head(&rwb->rq_wait[i].wait);
	}

	rwb->wc = 1;
	rwb->queue_depth = RWB_DEF_DEPTH;
	rwb->last_comp = rwb->last_issue = jiffies;
	rwb->queue = q;
	rwb->win_nsec = RWB_WINDOW_NSEC;
	rwb->enable_state = WBT_STATE_ON_DEFAULT;
	wbt_update_limits(rwb);

	/*
	 * Assign rwb and add the stats callback.
	 */
	q->rq_wb = rwb;
	blk_stat_add_callback(q, rwb->cb);

	rwb->min_lat_nsec = wbt_default_latency_nsec(q);

	wbt_set_queue_depth(rwb, blk_queue_depth(q));
	wbt_set_write_cache(rwb, test_bit(QUEUE_FLAG_WC, &q->queue_flags));

	return 0;
}

/* Detach and free the queue's rq_wb, if any. Inverse of wbt_init(). */
void wbt_exit(struct request_queue *q)
{
	struct rq_wb *rwb = q->rq_wb;

	if (rwb) {
		blk_stat_remove_callback(q, rwb->cb);
		blk_stat_free_callback(rwb->cb);
		q->rq_wb = NULL;
		kfree(rwb);
	}
}