// SPDX-License-Identifier: GPL-2.0
/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>

/* Rate limiting */

static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
				bcache_flash_devs_sectors_dirty(c);
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);
	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/*
	 * PI controller:
	 * Figures out the amount that should be written per second.
	 *
	 * First, the error (number of sectors that are dirty beyond our
	 * target) is calculated. The error is accumulated (numerically
	 * integrated).
	 *
	 * Then, the proportional value and integral value are scaled
	 * based on configured values. These are stored as inverses to
	 * avoid fixed point math and to make configuration easy-- e.g.
	 * the default value of 40 for writeback_rate_p_term_inverse
	 * attempts to write at a rate that would retire all the dirty
	 * blocks in 40 seconds.
	 *
	 * The writeback_rate_i_inverse value of 10000 means that 1/10000th
	 * of the error is accumulated in the integral term per second.
	 * This acts as a slow, long-term average that is not subject to
	 * variations in usage like the p term.
	 */
	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t error = dirty - target;
	int64_t proportional_scaled =
		div_s64(error, dc->writeback_rate_p_term_inverse);
	int64_t integral_scaled;
	uint32_t new_rate;

	if ((error < 0 && dc->writeback_rate_integral > 0) ||
	    (error > 0 && time_before64(local_clock(),
			 dc->writeback_rate.next + NSEC_PER_MSEC))) {
		/*
		 * Only decrease the integral term if it's more than
		 * zero. Only increase the integral term if the device
		 * is keeping up. (Don't wind up the integral
		 * ineffectively in either case).
		 *
		 * It's necessary to scale this by
		 * writeback_rate_update_seconds to keep the integral
		 * term dimensioned properly.
		 */
		dc->writeback_rate_integral += error *
			dc->writeback_rate_update_seconds;
	}

	integral_scaled = div_s64(dc->writeback_rate_integral,
			dc->writeback_rate_i_term_inverse);

	new_rate = clamp_t(int32_t, (proportional_scaled + integral_scaled),
			dc->writeback_rate_minimum, NSEC_PER_SEC);

	dc->writeback_rate_proportional = proportional_scaled;
	dc->writeback_rate_integral_scaled = integral_scaled;
	dc->writeback_rate_change = new_rate - dc->writeback_rate.rate;
	dc->writeback_rate.rate = new_rate;
	dc->writeback_rate_target = target;
}
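
/*
 * Worked example, with the default tuning set in
 * bch_cached_dev_writeback_init() (p_term_inverse 40, i_term_inverse
 * 10000, update interval 5 seconds) and purely illustrative numbers:
 * a device sitting 4000 sectors above its dirty target gets a
 * proportional term of 4000 / 40 = 100 sectors/s. Every update that
 * is allowed to wind the integral adds 4000 * 5 = 20000 to
 * writeback_rate_integral, so after a minute (12 updates) the
 * integral term is 240000 / 10000 = 24 sectors/s, giving a rate of
 * roughly 124 sectors/s before clamping.
 */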

static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors);
}

struct dirty_io {
	struct closure cl;
	struct cached_dev *dc;
	struct bio bio;
};

static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio, bio->bi_inline_vecs,
		 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS));
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_iter.bi_size = KEY_SIZE(&w->key) << 9;
	bio->bi_private = w;
	bch_bio_map(bio, NULL);
}

static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;

	bio_free_pages(&io->bio);

	/* This is kind of a dumb way of signalling errors. */
	if (KEY_DIRTY(&w->key)) {
		int ret;
		unsigned i;
		struct keylist keys;

		bch_keylist_init(&keys);

		bkey_copy(keys.top, &w->key);
		SET_KEY_DIRTY(keys.top, false);
		bch_keylist_push(&keys);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);

		if (ret)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(ret
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (bio->bi_status)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	/*
	 * IO errors are signalled using the dirty bit on the key.
	 * If we failed to read, we should not attempt to write to the
	 * backing device. Instead, immediately go to write_dirty_finish
	 * to clean up.
	 */
	if (KEY_DIRTY(&w->key)) {
		dirty_init(w);
		bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
		io->bio.bi_iter.bi_sector = KEY_START(&w->key);
		bio_set_dev(&io->bio, io->dc->bdev);
		io->bio.bi_end_io = dirty_endio;

		closure_bio_submit(&io->bio, cl);
	}

	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}

static void read_dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    bio->bi_status, "reading dirty data from cache");

	dirty_endio(bio);
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl);

	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}

static void read_dirty(struct cached_dev *dc)
{
	unsigned delay = 0;
	struct keybuf_key *w;
	struct dirty_io *io;
	struct closure cl;

	closure_init_stack(&cl);

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (!kthread_should_stop()) {

		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		if (KEY_START(&w->key) != dc->last_read ||
		    jiffies_to_msecs(delay) > 50)
			while (!kthread_should_stop() && delay)
				delay = schedule_timeout_interruptible(delay);

		dc->last_read = KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private = io;
		io->dc = dc;

		dirty_init(w);
		bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
		io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
		bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
		io->bio.bi_end_io = read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		down(&dc->in_flight);
		closure_call(&io->cl, read_dirty_submit, NULL, &cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	closure_sync(&cl);
}

/* Scan for dirty data */

void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned stripe_offset, stripe, sectors_dirty;

	if (!d)
		return;

	stripe = offset_to_stripe(d, offset);
	stripe_offset = offset & (d->stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned, abs(nr_sectors),
			      d->stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		if (stripe >= d->nr_stripes)
			return;

		sectors_dirty = atomic_add_return(s,
					d->stripe_sectors_dirty + stripe);
		if (sectors_dirty == d->stripe_size)
			set_bit(stripe, d->full_dirty_stripes);
		else
			clear_bit(stripe, d->full_dirty_stripes);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}
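
/*
 * Example with illustrative numbers: for a stripe_size of 2048
 * sectors (the stripe_offset mask above assumes a power of two), an
 * offset of 5000 lands in stripe 2 at stripe_offset 904, so adding
 * 3000 dirty sectors accounts 1144 of them to stripe 2 and the
 * remaining 1856 to stripe 3. A stripe whose dirty count reaches
 * stripe_size is flagged in full_dirty_stripes, so that
 * refill_full_stripes() below can prefer whole stripes when partial
 * stripe writes are expensive.
 */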

static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);

	BUG_ON(KEY_INODE(k) != dc->disk.id);

	return KEY_DIRTY(k);
}

static void refill_full_stripes(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	unsigned start_stripe, stripe, next_stripe;
	bool wrapped = false;

	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));

	if (stripe >= dc->disk.nr_stripes)
		stripe = 0;

	start_stripe = stripe;

	while (1) {
		stripe = find_next_bit(dc->disk.full_dirty_stripes,
				       dc->disk.nr_stripes, stripe);

		if (stripe == dc->disk.nr_stripes)
			goto next;

		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
						 dc->disk.nr_stripes, stripe);

		buf->last_scanned = KEY(dc->disk.id,
					stripe * dc->disk.stripe_size, 0);

		bch_refill_keybuf(dc->disk.c, buf,
				  &KEY(dc->disk.id,
				       next_stripe * dc->disk.stripe_size, 0),
				  dirty_pred);

		if (array_freelist_empty(&buf->freelist))
			return;

		stripe = next_stripe;
next:
		if (wrapped && stripe > start_stripe)
			return;

		if (stripe == dc->disk.nr_stripes) {
			stripe = 0;
			wrapped = true;
		}
	}
}

/*
 * Returns true if we scanned the entire disk
 */
static bool refill_dirty(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	struct bkey start = KEY(dc->disk.id, 0, 0);
	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
	struct bkey start_pos;

	/*
	 * make sure keybuf pos is inside the range for this disk - at bringup
	 * we might not be attached yet so this disk's inode nr isn't
	 * initialized then
	 */
	if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
	    bkey_cmp(&buf->last_scanned, &end) > 0)
		buf->last_scanned = start;

	if (dc->partial_stripes_expensive) {
		refill_full_stripes(dc);
		if (array_freelist_empty(&buf->freelist))
			return false;
	}

	start_pos = buf->last_scanned;
	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);

	if (bkey_cmp(&buf->last_scanned, &end) < 0)
		return false;

	/*
	 * If we get to the end start scanning again from the beginning, and
	 * only scan up to where we initially started scanning from:
	 */
	buf->last_scanned = start;
	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);

	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
}
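
/*
 * For example, if last_scanned starts in the middle of the keyspace,
 * the first bch_refill_keybuf() pass above scans from there towards
 * the end. Only if it reaches the end with keybuf slots to spare
 * does the scan wrap to offset 0 and continue up to the original
 * position, and refill_dirty() reports a full scan only when that
 * second pass also makes it back to start_pos; the writeback thread
 * uses this to decide whether the device might now be clean.
 */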

static int bch_writeback_thread(void *arg)
{
	struct cached_dev *dc = arg;
	bool searched_full_index;

	bch_ratelimit_reset(&dc->writeback_rate);

	while (!kthread_should_stop()) {
		down_write(&dc->writeback_lock);
		if (!atomic_read(&dc->has_dirty) ||
		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
		     !dc->writeback_running)) {
			up_write(&dc->writeback_lock);
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop())
				return 0;

			schedule();
			continue;
		}

		searched_full_index = refill_dirty(dc);

		if (searched_full_index &&
		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
			bch_write_bdev_super(dc, NULL);
		}

		up_write(&dc->writeback_lock);

		read_dirty(dc);

		if (searched_full_index) {
			unsigned delay = dc->writeback_delay * HZ;

			while (delay &&
			       !kthread_should_stop() &&
			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
				delay = schedule_timeout_interruptible(delay);

			bch_ratelimit_reset(&dc->writeback_rate);
		}
	}

	return 0;
}

/* Init */

struct sectors_dirty_init {
	struct btree_op op;
	unsigned inode;
};

static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
				 struct bkey *k)
{
	struct sectors_dirty_init *op = container_of(_op,
					struct sectors_dirty_init, op);
	if (KEY_INODE(k) > op->inode)
		return MAP_DONE;

	if (KEY_DIRTY(k))
		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
					     KEY_START(k), KEY_SIZE(k));

	return MAP_CONTINUE;
}

void bch_sectors_dirty_init(struct bcache_device *d)
{
	struct sectors_dirty_init op;

	bch_btree_op_init(&op.op, -1);
	op.inode = d->id;

	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
			   sectors_dirty_init_fn, 0);
}

void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	sema_init(&dc->in_flight, 64);
	init_rwsem(&dc->writeback_lock);
	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata = true;
	dc->writeback_running = true;
	dc->writeback_percent = 10;
	dc->writeback_delay = 30;
	dc->writeback_rate.rate = 1024;
	dc->writeback_rate_minimum = 8;

	dc->writeback_rate_update_seconds = 5;
	dc->writeback_rate_p_term_inverse = 40;
	dc->writeback_rate_i_term_inverse = 10000;

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}

int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
						 WQ_MEM_RECLAIM, 0);
	if (!dc->writeback_write_wq)
		return -ENOMEM;

	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
					      "bcache_writeback");
	if (IS_ERR(dc->writeback_thread))
		return PTR_ERR(dc->writeback_thread);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);

	bch_writeback_queue(dc);

	return 0;
}