/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <trace/events/bcache.h>

static struct workqueue_struct *dirty_wq;

static void read_dirty(struct closure *);

struct dirty_io {
	struct closure		cl;
	struct cached_dev	*dc;
	struct bio		bio;
};

/* Rate limiting */

static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size;
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

	int change = 0;
	int64_t error;
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;

	dc->disk.sectors_dirty_last = dirty;

	derivative *= dc->writeback_rate_d_term;
	derivative = clamp(derivative, -dirty, dirty);

	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      dc->writeback_rate_d_smooth, 0);

	/* Avoid divide by zero */
	if (!target)
		goto out;

	error = div64_s64((dirty + derivative - target) << 8, target);

	change = div_s64((dc->writeback_rate.rate * error) >> 8,
			 dc->writeback_rate_p_term_inverse);

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + 10 * NSEC_PER_MSEC))
		change = 0;

	dc->writeback_rate.rate =
		clamp_t(int64_t, dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);
out:
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);
}

static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	uint64_t ret;

	if (atomic_read(&dc->disk.detaching) ||
	    !dc->writeback_percent)
		return 0;

	ret = bch_next_delay(&dc->writeback_rate, sectors * 10000000ULL);

	return min_t(uint64_t, ret, HZ);
}

/* Background writeback */

static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	return KEY_DIRTY(k);
}

static bool dirty_full_stripe_pred(struct keybuf *buf, struct bkey *k)
{
	uint64_t stripe;
	unsigned nr_sectors = KEY_SIZE(k);
	struct cached_dev *dc = container_of(buf, struct cached_dev,
					     writeback_keys);
	unsigned stripe_size = 1 << dc->disk.stripe_size_bits;

	if (!KEY_DIRTY(k))
		return false;

	stripe = KEY_START(k) >> dc->disk.stripe_size_bits;
	while (1) {
		if (atomic_read(dc->disk.stripe_sectors_dirty + stripe) !=
		    stripe_size)
			return false;

		if (nr_sectors <= stripe_size)
			return true;

		nr_sectors -= stripe_size;
		stripe++;
	}
}

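/*
 * Set up the bio embedded in a dirty_io for the given key: the size and bvec
 * count come from KEY_SIZE(), bi_private points back at the keybuf_key, and
 * the I/O priority is dropped to idle when writeback_percent is zero (i.e.
 * when writeback isn't being rate limited).
 */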
static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio);
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_size		= KEY_SIZE(&w->key) << 9;
	bio->bi_max_vecs	= DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS);
	bio->bi_private		= w;
	bio->bi_io_vec		= bio->bi_inline_vecs;
	bch_bio_map(bio, NULL);
}

static void refill_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	struct keybuf *buf = &dc->writeback_keys;
	bool searched_from_start = false;
	struct bkey end = MAX_KEY;
	SET_KEY_INODE(&end, dc->disk.id);

	if (!atomic_read(&dc->disk.detaching) &&
	    !dc->writeback_running)
		closure_return(cl);

	down_write(&dc->writeback_lock);

	if (!atomic_read(&dc->has_dirty)) {
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
		bch_write_bdev_super(dc, NULL);

		up_write(&dc->writeback_lock);
		closure_return(cl);
	}

	if (bkey_cmp(&buf->last_scanned, &end) >= 0) {
		buf->last_scanned = KEY(dc->disk.id, 0, 0);
		searched_from_start = true;
	}

	if (dc->partial_stripes_expensive) {
		uint64_t i;

		for (i = 0; i < dc->disk.nr_stripes; i++)
			if (atomic_read(dc->disk.stripe_sectors_dirty + i) ==
			    1 << dc->disk.stripe_size_bits)
				goto full_stripes;

		goto normal_refill;
full_stripes:
		bch_refill_keybuf(dc->disk.c, buf, &end,
				  dirty_full_stripe_pred);
	} else {
normal_refill:
		bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);
	}

	if (bkey_cmp(&buf->last_scanned, &end) >= 0 && searched_from_start) {
		/* Searched the entire btree - delay awhile */

		if (RB_EMPTY_ROOT(&buf->keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
		}

		if (!atomic_read(&dc->disk.detaching))
			closure_delay(&dc->writeback, dc->writeback_delay * HZ);
	}

	up_write(&dc->writeback_lock);

	bch_ratelimit_reset(&dc->writeback_rate);

	/* Punt to workqueue only so we don't recurse and blow the stack */
	continue_at(cl, read_dirty, dirty_wq);
}

void bch_writeback_queue(struct cached_dev *dc)
{
	if (closure_trylock(&dc->writeback.cl, &dc->disk.cl)) {
		if (!atomic_read(&dc->disk.detaching))
			closure_delay(&dc->writeback, dc->writeback_delay * HZ);

		continue_at(&dc->writeback.cl, refill_dirty, dirty_wq);
	}
}

void bch_writeback_add(struct cached_dev *dc)
{
	if (!atomic_read(&dc->has_dirty) &&
	    !atomic_xchg(&dc->has_dirty, 1)) {
		atomic_inc(&dc->count);

		if (BDEV_STATE(&dc->sb) != BDEV_STATE_DIRTY) {
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_DIRTY);
			/* XXX: should do this synchronously */
			bch_write_bdev_super(dc, NULL);
		}

		bch_writeback_queue(dc);

		if (dc->writeback_percent)
			schedule_delayed_work(&dc->writeback_rate_update,
				      dc->writeback_rate_update_seconds * HZ);
	}
}

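/*
 * Adjust the per-stripe dirty sector counters for device @inode by
 * @nr_sectors, starting at @offset.  nr_sectors may be negative, in which
 * case the affected stripes are decremented instead.
 */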
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned stripe_size, stripe_offset;
	uint64_t stripe;

	if (!d)
		return;

	stripe_size = 1 << d->stripe_size_bits;
	stripe = offset >> d->stripe_size_bits;
	stripe_offset = offset & (stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned, abs(nr_sectors),
			      stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		atomic_add(s, d->stripe_sectors_dirty + stripe);
		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}

/* Background writeback - IO loop */

static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;
	struct bio_vec *bv;
	int i;

	bio_for_each_segment_all(bv, &io->bio, i)
		__free_page(bv->bv_page);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		unsigned i;
		struct btree_op op;
		bch_btree_op_init_stack(&op);

		op.type = BTREE_REPLACE;
		bkey_copy(&op.replace, &w->key);

		SET_KEY_DIRTY(&w->key, false);
		bch_keylist_add(&op.keys, &w->key);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		bch_btree_insert(&op, dc->disk.c);
		closure_sync(&op.cl);

		if (op.insert_collision)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(op.insert_collision
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (error)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	dirty_init(w);
	io->bio.bi_rw		= WRITE;
	io->bio.bi_sector	= KEY_START(&w->key);
	io->bio.bi_bdev		= io->dc->bdev;
	io->bio.bi_end_io	= dirty_endio;

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty_finish, system_wq);
}

static void read_dirty_endio(struct bio *bio, int error)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    error, "reading dirty data from cache");

	dirty_endio(bio, error);
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl, &io->dc->disk);

	continue_at(cl, write_dirty, system_wq);
}

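/*
 * read_dirty() is the heart of the writeback loop: it pops dirty keys off the
 * keybuf, allocates a dirty_io for each and reads the data from the cache
 * device.  Each IO then chains through read_dirty_submit -> write_dirty
 * (write to the backing device) -> write_dirty_finish (clear the dirty bit in
 * the btree).  Pacing between keys comes from writeback_delay(), and
 * dc->in_flight bounds the number of writebacks in progress at once.
 */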
static void read_dirty(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev,
					     writeback.cl);
	unsigned delay = writeback_delay(dc, 0);
	struct keybuf_key *w;
	struct dirty_io *io;

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (1) {
		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		if (delay > 0 &&
		    (KEY_START(&w->key) != dc->last_read ||
		     jiffies_to_msecs(delay) > 50))
			delay = schedule_timeout_uninterruptible(delay);

		dc->last_read	= KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private	= io;
		io->dc		= dc;

		dirty_init(w);
		io->bio.bi_sector	= PTR_OFFSET(&w->key, 0);
		io->bio.bi_bdev		= PTR_CACHE(dc->disk.c,
						    &w->key, 0)->bdev;
		io->bio.bi_rw		= READ;
		io->bio.bi_end_io	= read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		down(&dc->in_flight);
		closure_call(&io->cl, read_dirty_submit, NULL, cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	continue_at(cl, refill_dirty, dirty_wq);
}

/* Init */

static int bch_btree_sectors_dirty_init(struct btree *b, struct btree_op *op,
					struct cached_dev *dc)
{
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(b, &iter, &KEY(dc->disk.id, 0, 0));
	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad)))
		if (!b->level) {
			if (KEY_INODE(k) > dc->disk.id)
				break;

			if (KEY_DIRTY(k))
				bcache_dev_sectors_dirty_add(b->c, dc->disk.id,
							     KEY_START(k),
							     KEY_SIZE(k));
		} else {
			btree(sectors_dirty_init, k, b, op, dc);
			if (KEY_INODE(k) > dc->disk.id)
				break;

			cond_resched();
		}

	return 0;
}

void bch_sectors_dirty_init(struct cached_dev *dc)
{
	struct btree_op op;

	bch_btree_op_init_stack(&op);
	btree_root(sectors_dirty_init, dc->disk.c, &op, dc);
}

void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	sema_init(&dc->in_flight, 64);
	closure_init_unlocked(&dc->writeback);
	init_rwsem(&dc->writeback_lock);

	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata		= true;
	dc->writeback_running		= true;
	dc->writeback_percent		= 10;
	dc->writeback_delay		= 30;
	dc->writeback_rate.rate		= 1024;

	dc->writeback_rate_update_seconds = 30;
	dc->writeback_rate_d_term	= 16;
	dc->writeback_rate_p_term_inverse = 64;
	dc->writeback_rate_d_smooth	= 8;

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

void bch_writeback_exit(void)
{
	if (dirty_wq)
		destroy_workqueue(dirty_wq);
}

int __init bch_writeback_init(void)
{
	dirty_wq = create_workqueue("bcache_writeback");
	if (!dirty_wq)
		return -ENOMEM;

	return 0;
}