xref: /openbmc/linux/drivers/md/bcache/writeback.c (revision b24413180f5600bcb3bb70fbed5cf186b60864bd)
// SPDX-License-Identifier: GPL-2.0
/*
 * background writeback - scan btree for dirty data and write it to the backing
 * device
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "writeback.h"

#include <linux/delay.h>
#include <linux/kthread.h>
#include <linux/sched/clock.h>
#include <trace/events/bcache.h>

/* Rate limiting */

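/*
 * Rough sketch of the PD controller below: target is this backing device's
 * share (proportional to its size) of writeback_percent of the usable cache
 * space.  Each update interval the proportional term adds roughly
 * (dirty - target) * writeback_rate_update_seconds / writeback_rate_p_term_inverse
 * sectors/s to the rate, and the derivative term, an EWMA of how quickly the
 * dirty count is changing, damps the response.  The rate is kept in
 * sectors/s and clamped to the range [1, NSEC_PER_MSEC].
 *
 * Illustrative numbers, assuming the defaults from
 * bch_cached_dev_writeback_init() (5 s updates, p_term_inverse = 6000): if
 * dirty data exceeds the target by 1 GiB (2097152 sectors), each update adds
 * about 2097152 * 5 / 6000 ~= 1747 sectors/s (~0.85 MiB/s) to the rate.
 */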
static void __update_writeback_rate(struct cached_dev *dc)
{
	struct cache_set *c = dc->disk.c;
	uint64_t cache_sectors = c->nbuckets * c->sb.bucket_size -
				bcache_flash_devs_sectors_dirty(c);
	uint64_t cache_dirty_target =
		div_u64(cache_sectors * dc->writeback_percent, 100);

	int64_t target = div64_u64(cache_dirty_target * bdev_sectors(dc->bdev),
				   c->cached_dev_sectors);

	/* PD controller */

	int64_t dirty = bcache_dev_sectors_dirty(&dc->disk);
	int64_t derivative = dirty - dc->disk.sectors_dirty_last;
	int64_t proportional = dirty - target;
	int64_t change;

	dc->disk.sectors_dirty_last = dirty;

	/* Scale to sectors per second */

	proportional *= dc->writeback_rate_update_seconds;
	proportional = div_s64(proportional, dc->writeback_rate_p_term_inverse);

	derivative = div_s64(derivative, dc->writeback_rate_update_seconds);

	derivative = ewma_add(dc->disk.sectors_dirty_derivative, derivative,
			      (dc->writeback_rate_d_term /
			       dc->writeback_rate_update_seconds) ?: 1, 0);

	derivative *= dc->writeback_rate_d_term;
	derivative = div_s64(derivative, dc->writeback_rate_p_term_inverse);

	change = proportional + derivative;

	/* Don't increase writeback rate if the device isn't keeping up */
	if (change > 0 &&
	    time_after64(local_clock(),
			 dc->writeback_rate.next + NSEC_PER_MSEC))
		change = 0;

	dc->writeback_rate.rate =
		clamp_t(int64_t, (int64_t) dc->writeback_rate.rate + change,
			1, NSEC_PER_MSEC);

	dc->writeback_rate_proportional = proportional;
	dc->writeback_rate_derivative = derivative;
	dc->writeback_rate_change = change;
	dc->writeback_rate_target = target;
}

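/*
 * Delayed-work handler: recompute the writeback rate (under writeback_lock
 * held for reading) and re-arm itself to run again in
 * writeback_rate_update_seconds.  The recomputation is skipped while there
 * is no dirty data or writeback_percent is 0.
 */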
static void update_writeback_rate(struct work_struct *work)
{
	struct cached_dev *dc = container_of(to_delayed_work(work),
					     struct cached_dev,
					     writeback_rate_update);

	down_read(&dc->writeback_lock);

	if (atomic_read(&dc->has_dirty) &&
	    dc->writeback_percent)
		__update_writeback_rate(dc);

	up_read(&dc->writeback_lock);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);
}

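/*
 * How long (in jiffies) to sleep before issuing the next writeback I/O, as
 * computed by bch_next_delay() from the current writeback rate.  The delay
 * is forced to 0 (write back as fast as possible) while detaching or when
 * writeback_percent is 0.
 */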
static unsigned writeback_delay(struct cached_dev *dc, unsigned sectors)
{
	if (test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) ||
	    !dc->writeback_percent)
		return 0;

	return bch_next_delay(&dc->writeback_rate, sectors);
}

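/*
 * State for one in-flight writeback I/O: the bio (with inline bvecs sized
 * for the key being written back) and the closure that sequences the
 * read-from-cache, write-to-backing-device and btree-update steps.
 */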
struct dirty_io {
	struct closure		cl;
	struct cached_dev	*dc;
	struct bio		bio;
};

static void dirty_init(struct keybuf_key *w)
{
	struct dirty_io *io = w->private;
	struct bio *bio = &io->bio;

	bio_init(bio, bio->bi_inline_vecs,
		 DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS));
	if (!io->dc->writeback_percent)
		bio_set_prio(bio, IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0));

	bio->bi_iter.bi_size	= KEY_SIZE(&w->key) << 9;
	bio->bi_private		= w;
	bch_bio_map(bio, NULL);
}

static void dirty_io_destructor(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	kfree(io);
}

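/*
 * Last stage of the writeback pipeline: the data has been written to the
 * backing device, so reinsert the key into the btree with the dirty bit
 * cleared.  The buckets the key points into are pinned so the cached data
 * can't be invalidated while the btree update is in flight; if the key was
 * changed in the meantime (e.g. by a foreground write) the insert fails and
 * is counted as a writeback collision.
 */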
static void write_dirty_finish(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;
	struct cached_dev *dc = io->dc;

	bio_free_pages(&io->bio);

	/*
	 * KEY_DIRTY() doubles as the error flag here: dirty_endio() clears it
	 * if the write to the backing device failed, so only keys whose data
	 * actually reached the backing device are reinserted as clean.
	 */
	if (KEY_DIRTY(&w->key)) {
		int ret;
		unsigned i;
		struct keylist keys;

		bch_keylist_init(&keys);

		bkey_copy(keys.top, &w->key);
		SET_KEY_DIRTY(keys.top, false);
		bch_keylist_push(&keys);

		for (i = 0; i < KEY_PTRS(&w->key); i++)
			atomic_inc(&PTR_BUCKET(dc->disk.c, &w->key, i)->pin);

		ret = bch_btree_insert(dc->disk.c, &keys, NULL, &w->key);

		if (ret)
			trace_bcache_writeback_collision(&w->key);

		atomic_long_inc(ret
				? &dc->disk.c->writeback_keys_failed
				: &dc->disk.c->writeback_keys_done);
	}

	bch_keybuf_del(&dc->writeback_keys, w);
	up(&dc->in_flight);

	closure_return_with_destructor(cl, dirty_io_destructor);
}

static void dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	if (bio->bi_status)
		SET_KEY_DIRTY(&w->key, false);

	closure_put(&io->cl);
}

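/*
 * Middle stage: the dirty data has been read from the cache into io->bio, so
 * reuse the same bio to write it to the backing device at the offset the key
 * describes, then continue to write_dirty_finish() on the writeback
 * workqueue.
 */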
static void write_dirty(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);
	struct keybuf_key *w = io->bio.bi_private;

	dirty_init(w);
	bio_set_op_attrs(&io->bio, REQ_OP_WRITE, 0);
	io->bio.bi_iter.bi_sector = KEY_START(&w->key);
	bio_set_dev(&io->bio, io->dc->bdev);
	io->bio.bi_end_io	= dirty_endio;

	closure_bio_submit(&io->bio, cl);

	continue_at(cl, write_dirty_finish, io->dc->writeback_write_wq);
}

static void read_dirty_endio(struct bio *bio)
{
	struct keybuf_key *w = bio->bi_private;
	struct dirty_io *io = w->private;

	bch_count_io_errors(PTR_CACHE(io->dc->disk.c, &w->key, 0),
			    bio->bi_status, "reading dirty data from cache");

	dirty_endio(bio);
}

static void read_dirty_submit(struct closure *cl)
{
	struct dirty_io *io = container_of(cl, struct dirty_io, cl);

	closure_bio_submit(&io->bio, cl);

	continue_at(cl, write_dirty, io->dc->writeback_write_wq);
}

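/*
 * First stage: walk the refilled keybuf and, for each dirty key, allocate a
 * dirty_io, read the data from the cache device and kick off the
 * write_dirty() -> write_dirty_finish() chain.  dc->in_flight (a semaphore
 * initialized to 64) bounds the number of writeback I/Os in flight, and
 * writeback_delay() paces them; the delay is skipped for keys contiguous
 * with the previous one (unless it has built up past 50 ms), so sequential
 * dirty data is written back in one burst.
 */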
static void read_dirty(struct cached_dev *dc)
{
	unsigned delay = 0;
	struct keybuf_key *w;
	struct dirty_io *io;
	struct closure cl;

	closure_init_stack(&cl);

	/*
	 * XXX: if we error, background writeback just spins. Should use some
	 * mempools.
	 */

	while (!kthread_should_stop()) {

		w = bch_keybuf_next(&dc->writeback_keys);
		if (!w)
			break;

		BUG_ON(ptr_stale(dc->disk.c, &w->key, 0));

		if (KEY_START(&w->key) != dc->last_read ||
		    jiffies_to_msecs(delay) > 50)
			while (!kthread_should_stop() && delay)
				delay = schedule_timeout_interruptible(delay);

		dc->last_read	= KEY_OFFSET(&w->key);

		io = kzalloc(sizeof(struct dirty_io) + sizeof(struct bio_vec)
			     * DIV_ROUND_UP(KEY_SIZE(&w->key), PAGE_SECTORS),
			     GFP_KERNEL);
		if (!io)
			goto err;

		w->private	= io;
		io->dc		= dc;

		dirty_init(w);
		bio_set_op_attrs(&io->bio, REQ_OP_READ, 0);
		io->bio.bi_iter.bi_sector = PTR_OFFSET(&w->key, 0);
		bio_set_dev(&io->bio, PTR_CACHE(dc->disk.c, &w->key, 0)->bdev);
		io->bio.bi_end_io	= read_dirty_endio;

		if (bio_alloc_pages(&io->bio, GFP_KERNEL))
			goto err_free;

		trace_bcache_writeback(&w->key);

		down(&dc->in_flight);
		closure_call(&io->cl, read_dirty_submit, NULL, &cl);

		delay = writeback_delay(dc, KEY_SIZE(&w->key));
	}

	if (0) {
err_free:
		kfree(w->private);
err:
		bch_keybuf_del(&dc->writeback_keys, w);
	}

	/*
	 * Wait for outstanding writeback IOs to finish (and keybuf slots to be
	 * freed) before refilling again
	 */
	closure_sync(&cl);
}

/* Scan for dirty data */

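/*
 * Dirty sector accounting: each bcache device is divided into fixed-size
 * stripes, with a per-stripe atomic count of dirty sectors and a bitmap of
 * completely dirty stripes (used by refill_full_stripes() when partial
 * stripe writes are expensive).  nr_sectors is signed: positive when sectors
 * become dirty, negative when they are written back and become clean.
 */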
void bcache_dev_sectors_dirty_add(struct cache_set *c, unsigned inode,
				  uint64_t offset, int nr_sectors)
{
	struct bcache_device *d = c->devices[inode];
	unsigned stripe_offset, stripe, sectors_dirty;

	if (!d)
		return;

	stripe = offset_to_stripe(d, offset);
	stripe_offset = offset & (d->stripe_size - 1);

	while (nr_sectors) {
		int s = min_t(unsigned, abs(nr_sectors),
			      d->stripe_size - stripe_offset);

		if (nr_sectors < 0)
			s = -s;

		if (stripe >= d->nr_stripes)
			return;

		sectors_dirty = atomic_add_return(s,
					d->stripe_sectors_dirty + stripe);
		if (sectors_dirty == d->stripe_size)
			set_bit(stripe, d->full_dirty_stripes);
		else
			clear_bit(stripe, d->full_dirty_stripes);

		nr_sectors -= s;
		stripe_offset = 0;
		stripe++;
	}
}

static bool dirty_pred(struct keybuf *buf, struct bkey *k)
{
	struct cached_dev *dc = container_of(buf, struct cached_dev, writeback_keys);

	BUG_ON(KEY_INODE(k) != dc->disk.id);

	return KEY_DIRTY(k);
}

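/*
 * Refill the keybuf only from runs of completely dirty stripes: scan the
 * full_dirty_stripes bitmap starting at the stripe containing last_scanned,
 * wrapping around at most once, and stop early once the keybuf's freelist is
 * exhausted.
 */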
static void refill_full_stripes(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	unsigned start_stripe, stripe, next_stripe;
	bool wrapped = false;

	stripe = offset_to_stripe(&dc->disk, KEY_OFFSET(&buf->last_scanned));

	if (stripe >= dc->disk.nr_stripes)
		stripe = 0;

	start_stripe = stripe;

	while (1) {
		stripe = find_next_bit(dc->disk.full_dirty_stripes,
				       dc->disk.nr_stripes, stripe);

		if (stripe == dc->disk.nr_stripes)
			goto next;

		next_stripe = find_next_zero_bit(dc->disk.full_dirty_stripes,
						 dc->disk.nr_stripes, stripe);

		buf->last_scanned = KEY(dc->disk.id,
					stripe * dc->disk.stripe_size, 0);

		bch_refill_keybuf(dc->disk.c, buf,
				  &KEY(dc->disk.id,
				       next_stripe * dc->disk.stripe_size, 0),
				  dirty_pred);

		if (array_freelist_empty(&buf->freelist))
			return;

		stripe = next_stripe;
next:
		if (wrapped && stripe > start_stripe)
			return;

		if (stripe == dc->disk.nr_stripes) {
			stripe = 0;
			wrapped = true;
		}
	}
}

/*
 * Returns true if we scanned the entire disk
 */
static bool refill_dirty(struct cached_dev *dc)
{
	struct keybuf *buf = &dc->writeback_keys;
	struct bkey start = KEY(dc->disk.id, 0, 0);
	struct bkey end = KEY(dc->disk.id, MAX_KEY_OFFSET, 0);
	struct bkey start_pos;

	/*
	 * Make sure the keybuf position is inside the range for this disk:
	 * at bringup we might not be attached yet, so this disk's inode
	 * number may not have been initialized when last_scanned was set.
	 */
	if (bkey_cmp(&buf->last_scanned, &start) < 0 ||
	    bkey_cmp(&buf->last_scanned, &end) > 0)
		buf->last_scanned = start;

	if (dc->partial_stripes_expensive) {
		refill_full_stripes(dc);
		if (array_freelist_empty(&buf->freelist))
			return false;
	}

	start_pos = buf->last_scanned;
	bch_refill_keybuf(dc->disk.c, buf, &end, dirty_pred);

	if (bkey_cmp(&buf->last_scanned, &end) < 0)
		return false;

	/*
	 * If we got to the end, start scanning again from the beginning, but
	 * only up to where we initially started scanning from:
	 */
	buf->last_scanned = start;
	bch_refill_keybuf(dc->disk.c, buf, &start_pos, dirty_pred);

	return bkey_cmp(&buf->last_scanned, &start_pos) >= 0;
}

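/*
 * Main writeback loop: sleep while there is nothing to do (no dirty data, or
 * writeback disabled and we aren't detaching), otherwise refill the keybuf
 * and write the dirty keys back via read_dirty().  When a full scan of the
 * index leaves no dirty keys behind, the backing device's superblock is
 * marked clean; after a full scan the thread also waits writeback_delay
 * seconds before scanning again.
 */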
static int bch_writeback_thread(void *arg)
{
	struct cached_dev *dc = arg;
	bool searched_full_index;

	while (!kthread_should_stop()) {
		down_write(&dc->writeback_lock);
		if (!atomic_read(&dc->has_dirty) ||
		    (!test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags) &&
		     !dc->writeback_running)) {
			up_write(&dc->writeback_lock);
			set_current_state(TASK_INTERRUPTIBLE);

			if (kthread_should_stop())
				return 0;

			schedule();
			continue;
		}

		searched_full_index = refill_dirty(dc);

		if (searched_full_index &&
		    RB_EMPTY_ROOT(&dc->writeback_keys.keys)) {
			atomic_set(&dc->has_dirty, 0);
			cached_dev_put(dc);
			SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);
			bch_write_bdev_super(dc, NULL);
		}

		up_write(&dc->writeback_lock);

		bch_ratelimit_reset(&dc->writeback_rate);
		read_dirty(dc);

		if (searched_full_index) {
			unsigned delay = dc->writeback_delay * HZ;

			while (delay &&
			       !kthread_should_stop() &&
			       !test_bit(BCACHE_DEV_DETACHING, &dc->disk.flags))
				delay = schedule_timeout_interruptible(delay);
		}
	}

	return 0;
}

/* Init */

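/*
 * The per-stripe dirty counters start out at zero when a device is brought
 * up, so walk the btree once and re-add every dirty key belonging to this
 * device to rebuild them before writeback starts.
 */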
struct sectors_dirty_init {
	struct btree_op	op;
	unsigned	inode;
};

static int sectors_dirty_init_fn(struct btree_op *_op, struct btree *b,
				 struct bkey *k)
{
	struct sectors_dirty_init *op = container_of(_op,
						struct sectors_dirty_init, op);
	if (KEY_INODE(k) > op->inode)
		return MAP_DONE;

	if (KEY_DIRTY(k))
		bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
					     KEY_START(k), KEY_SIZE(k));

	return MAP_CONTINUE;
}

void bch_sectors_dirty_init(struct bcache_device *d)
{
	struct sectors_dirty_init op;

	bch_btree_op_init(&op.op, -1);
	op.inode = d->id;

	bch_btree_map_keys(&op.op, d->c, &KEY(op.inode, 0, 0),
			   sectors_dirty_init_fn, 0);

	d->sectors_dirty_last = bcache_dev_sectors_dirty(d);
}

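/*
 * Defaults: target dirty data at 10% of the cache (apportioned by backing
 * device size), recompute the rate every 5 seconds, wait 30 seconds between
 * full index scans, and start at 1024 sectors/s.  These knobs (and
 * writeback_running) are normally adjusted at runtime through bcache's
 * sysfs interface.
 */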
void bch_cached_dev_writeback_init(struct cached_dev *dc)
{
	sema_init(&dc->in_flight, 64);
	init_rwsem(&dc->writeback_lock);
	bch_keybuf_init(&dc->writeback_keys);

	dc->writeback_metadata		= true;
	dc->writeback_running		= true;
	dc->writeback_percent		= 10;
	dc->writeback_delay		= 30;
	dc->writeback_rate.rate		= 1024;

	dc->writeback_rate_update_seconds = 5;
	dc->writeback_rate_d_term	= 30;
	dc->writeback_rate_p_term_inverse = 6000;

	INIT_DELAYED_WORK(&dc->writeback_rate_update, update_writeback_rate);
}

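/*
 * Create the writeback workqueue and thread.  The thread is created with
 * kthread_create(), i.e. not yet running; bch_writeback_queue() wakes it up
 * after the rate-update worker has been armed for its first run.
 */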
int bch_cached_dev_writeback_start(struct cached_dev *dc)
{
	dc->writeback_write_wq = alloc_workqueue("bcache_writeback_wq",
						WQ_MEM_RECLAIM, 0);
	if (!dc->writeback_write_wq)
		return -ENOMEM;

	dc->writeback_thread = kthread_create(bch_writeback_thread, dc,
					      "bcache_writeback");
	if (IS_ERR(dc->writeback_thread))
		return PTR_ERR(dc->writeback_thread);

	schedule_delayed_work(&dc->writeback_rate_update,
			      dc->writeback_rate_update_seconds * HZ);

	bch_writeback_queue(dc);

	return 0;
}
537