// SPDX-License-Identifier: GPL-2.0
/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>

/* Rate limiting */

static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
				bcache_flash_devs_sectors_dirty(c);
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
	int64_t proportional = dirty - target;
	int64_t change;

	dc->disk.sectors_dirty_last = dirty;

	/* Scale to sectors per second */

	proportional *= dc->writeback_rate_update_seconds;
	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);

	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);

	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      (dc->writeback_rate_d_term /
			       dc->writeback_rate_update_seconds) ?: 1, 0);

	derivative *= dc->writeback_rate_d_term;
	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);

	change = proportional + derivative;

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + NSEC_PER_MSEC))
		change = 0;

	dc->writeback_rate.rate =
		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);

	dc->writeback_rate_proportional = proportional;
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;
}

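/*
 * Illustrative walk-through of __update_writeback_rate() above, using the
 * defaults from bch_cached_dev_writeback_init() (update period 5s,
 * p_term_inverse 6000, d_term 30) and made-up dirty counts:
 *
 *   dirty - target  = 120000 sectors over target
 *   proportional    = 120000 * 5 / 6000 = 100
 *   raw derivative  = 30000 newly dirtied sectors / 5s = 6000
 *   derivative      = 6000 * 30 / 6000 = 30 (assuming the EWMA has settled)
 *   change          = 100 + 30 = 130
 *
 * so the rate would be bumped by 130 sectors/sec, subject to the
 * [1, NSEC_PER_MSEC] clamp and the "isn't keeping up" check.
 */
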
static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors);
}

struct dirty_io {
	struct closure cl;
	struct cached_dev *dc;
	struct bio bio;
};

static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio, bio->bi_inline_vecs,
		 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS));
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
	bio->bi_private = w;
	bch_bio_map(bio, NULL);
}

static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;

	bio_free_pages(&io->bio);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		int ret;
		unsigned i;
		struct keylist keys;

		bch_keylist_init(&keys);

		bkey_copy(keys.top, &w->key);
		SET_KEY_DIRTY(keys.top, false);
		bch_keylist_push(&keys);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);

		if (ret)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(ret
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (bio->bi_status)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	dirty_init(w);
	bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
	io->bio.bi_iter.bi_sector = KEY_START(&w->key);
	bio_set_dev(&io->bio, io->dc->bdev);
	io->bio.bi_end_io = dirty_endio;

	closure_bio_submit(&io->bio, cl);

	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}

static void read_dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    bio->bi_status, "reading dirty data from cache");

	dirty_endio(bio);
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl);

	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}

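/*
 * Rough sketch of the writeback pipeline for a single dirty key, as driven
 * by read_dirty() below:
 *
 *   read_dirty()           allocates a dirty_io, takes dc->in_flight and
 *   read_dirty_submit()    reads the dirty data from the cache device, then
 *   write_dirty()          rewrites it to the backing device, and
 *   write_dirty_finish()   clears KEY_DIRTY in the btree (a failed insert is
 *                          counted as a collision), frees the keybuf slot,
 *                          releases dc->in_flight, and finally
 *   dirty_io_destructor()  frees the dirty_io.
 *
 * The in_flight semaphore (initialized to 64) caps how many of these chains
 * run concurrently.
 */
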
static void read_dirty(struct cached_dev *dc)
{
	unsigned delay = 0;
	struct keybuf_key *w;
	struct dirty_io *io;
	struct closure cl;

	closure_init_stack(&cl);

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (!kthread_should_stop()) {

		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		if (KEY_START(&w->key) != dc->last_read ||
		    jiffies_to_msecs(delay) > 50)
			while (!kthread_should_stop() && delay)
				delay = schedule_timeout_interruptible(delay);

		dc->last_read = KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private = io;
		io->dc = dc;

		dirty_init(w);
		bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
		io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
		bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
		io->bio.bi_end_io = read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		down(&dc->in_flight);
		closure_call(&io->cl, read_dirty_submit, NULL, &cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	closure_sync(&cl);
}

/* Scan for dirty data */

void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned stripe_offset, stripe, sectors_dirty;

	if (!d)
		return;

	stripe = offset_to_stripe(d, offset);
	stripe_offset = offset & (d->stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned, abs(nr_sectors),
			      d->stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		if (stripe >= d->nr_stripes)
			return;

		sectors_dirty = atomic_add_return(s,
					d->stripe_sectors_dirty + stripe);
		if (sectors_dirty == d->stripe_size)
			set_bit(stripe, d->full_dirty_stripes);
		else
			clear_bit(stripe, d->full_dirty_stripes);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}

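/*
 * Example of the stripe accounting above, with a hypothetical stripe_size of
 * 2048 sectors (the "& (stripe_size - 1)" mask relies on stripe_size being a
 * power of two):
 *
 *   offset = 5000, nr_sectors = 100
 *   stripe        = offset_to_stripe(d, 5000) = 2
 *   stripe_offset = 5000 & 2047 = 904
 *   first chunk s = min(100, 2048 - 904) = 100
 *
 * so the whole run is accounted to stripe 2; a longer run would spill over
 * into the following stripes with stripe_offset reset to 0, and any stripe
 * whose dirty count reaches stripe_size gets its bit set in
 * full_dirty_stripes.
 */
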
static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	struct cached_dev *dc = container_of(buf, struct cached_dev,
					     writeback_keys);

	BUG_ON(KEY_INODE(k) != dc->disk.id);

	return KEY_DIRTY(k);
}

static void refill_full_stripes(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	unsigned start_stripe, stripe, next_stripe;
	bool wrapped = false;

	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));

	if (stripe >= dc->disk.nr_stripes)
		stripe = 0;

	start_stripe = stripe;

	while (1) {
		stripe = find_next_bit(dc->disk.full_dirty_stripes,
				       dc->disk.nr_stripes, stripe);

		if (stripe == dc->disk.nr_stripes)
			goto next;

		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
						 dc->disk.nr_stripes, stripe);

		buf->last_scanned = KEY(dc->disk.id,
					stripe * dc->disk.stripe_size, 0);

		bch_refill_keybuf(dc->disk.c, buf,
				  &KEY(dc->disk.id,
				       next_stripe * dc->disk.stripe_size, 0),
				  dirty_pred);

		if (array_freelist_empty(&buf->freelist))
			return;

		stripe = next_stripe;
next:
		if (wrapped && stripe > start_stripe)
			return;

		if (stripe == dc->disk.nr_stripes) {
			stripe = 0;
			wrapped = true;
		}
	}
}

/*
 * Returns true if we scanned the entire disk
 */
static bool refill_dirty(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	struct bkey start = KEY(dc->disk.id, 0, 0);
	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
	struct bkey start_pos;

	/*
	 * make sure keybuf pos is inside the range for this disk - at bringup
	 * we might not be attached yet so this disk's inode nr isn't
	 * initialized then
	 */
	if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
	    bkey_cmp(&buf->last_scanned, &end) > 0)
		buf->last_scanned = start;

	if (dc->partial_stripes_expensive) {
		refill_full_stripes(dc);
		if (array_freelist_empty(&buf->freelist))
			return false;
	}

	start_pos = buf->last_scanned;
	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);

	if (bkey_cmp(&buf->last_scanned, &end) < 0)
		return false;

	/*
	 * If we get to the end start scanning again from the beginning, and
	 * only scan up to where we initially started scanning from:
	 */
	buf->last_scanned = start;
	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);

	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
}

static int bch_writeback_thread(void *arg)
{
	struct cached_dev *dc = arg;
	bool searched_full_index;

	while (!kthread_should_stop()) {
		down_write(&dc->writeback_lock);
		if (!atomic_read(&dc->has_dirty) ||
		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
		     !dc->writeback_running)) {
			up_write(&dc->writeback_lock);
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop())
				return 0;

			schedule();
			continue;
		}

		searched_full_index = refill_dirty(dc);

		if (searched_full_index &&
		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
			bch_write_bdev_super(dc, NULL);
		}

		up_write(&dc->writeback_lock);

		bch_ratelimit_reset(&dc->writeback_rate);
		read_dirty(dc);

		if (searched_full_index) {
			unsigned delay = dc->writeback_delay * HZ;

			while (delay &&
			       !kthread_should_stop() &&
			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
				delay = schedule_timeout_interruptible(delay);
		}
	}

	return 0;
}

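/*
 * Note on how the thread decides it is done: refill_dirty() above does a
 * wrap-around scan.  If last_scanned sits at offset X, the first refill pass
 * covers [X, MAX_KEY_OFFSET]; only if that pass runs off the end does it
 * restart at offset 0 and scan back up to X, and it returns true only when
 * that second pass also reaches X.  bch_writeback_thread() treats "scanned
 * the whole index" plus an empty writeback_keys rbtree as the signal that no
 * dirty data remains, clears has_dirty and marks the backing device
 * BDEV_STATE_CLEAN in its superblock.
 */
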
/* Init */

struct sectors_dirty_init {
	struct btree_op op;
	unsigned inode;
};

static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
				 struct bkey *k)
{
	struct sectors_dirty_init *op = container_of(_op,
					struct sectors_dirty_init, op);
	if (KEY_INODE(k) > op->inode)
		return MAP_DONE;

	if (KEY_DIRTY(k))
		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
					     KEY_START(k), KEY_SIZE(k));

	return MAP_CONTINUE;
}

void bch_sectors_dirty_init(struct bcache_device *d)
{
	struct sectors_dirty_init op;

	bch_btree_op_init(&op.op, -1);
	op.inode = d->id;

	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
			   sectors_dirty_init_fn, 0);

	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
}

void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	sema_init(&dc->in_flight, 64);
	init_rwsem(&dc->writeback_lock);
	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata = true;
	dc->writeback_running = true;
	dc->writeback_percent = 10;
	dc->writeback_delay = 30;
	dc->writeback_rate.rate = 1024;

	dc->writeback_rate_update_seconds = 5;
	dc->writeback_rate_d_term = 30;
	dc->writeback_rate_p_term_inverse = 6000;

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}

int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
						 WQ_MEM_RECLAIM, 0);
	if (!dc->writeback_write_wq)
		return -ENOMEM;

	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
					      "bcache_writeback");
	if (IS_ERR(dc->writeback_thread))
		return PTR_ERR(dc->writeback_thread);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);

	bch_writeback_queue(dc);

	return 0;
}

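/*
 * For reference, the defaults set up above: the PD controller re-evaluates
 * the rate every writeback_rate_update_seconds (5s) via the delayed work,
 * starting from a rate of 1024 sectors/sec and clamped to [1, NSEC_PER_MSEC]
 * in __update_writeback_rate(); writeback_percent (10) sets the dirty-data
 * target at 10% of the cache, and dc->in_flight limits the writeback thread
 * to 64 dirty_io chains at a time.
 */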