xref: /openbmc/linux/drivers/md/bcache/btree.c (revision 77d84ff8)
1 /*
2  * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
3  *
4  * Uses a block device as cache for other block devices; optimized for SSDs.
5  * All allocation is done in buckets, which should match the erase block size
6  * of the device.
7  *
8  * Buckets containing cached data are kept on a heap sorted by priority;
9  * bucket priority is increased on cache hit, and periodically all the buckets
10  * on the heap have their priority scaled down. This is currently just used
11  * as an LRU, but in the future it should allow for more intelligent heuristics.
12  *
13  * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
14  * counter. Garbage collection is used to remove stale pointers.
15  *
16  * Indexing is done via a btree; nodes are not necessarily fully sorted; rather,
17  * as keys are inserted we only sort the pages that have not yet been written.
18  * When garbage collection runs, we re-sort the entire node.
19  *
20  * All configuration is done via sysfs; see Documentation/bcache.txt.
21  */
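
/*
 * A note to make the 8 bit counter above concrete: it is the bucket's
 * generation number. Every pointer embeds the generation it was created with
 * (PTR_GEN()), so "freeing" a bucket by bumping its gen leaves all
 * outstanding pointers stale - ptr_stale() is, roughly, a comparison of the
 * bucket's current gen against the gen stored in the pointer - and garbage
 * collection eventually drops those stale pointers.
 */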
22 
23 #include "bcache.h"
24 #include "btree.h"
25 #include "debug.h"
26 #include "writeback.h"
27 
28 #include <linux/slab.h>
29 #include <linux/bitops.h>
30 #include <linux/freezer.h>
31 #include <linux/hash.h>
32 #include <linux/kthread.h>
33 #include <linux/prefetch.h>
34 #include <linux/random.h>
35 #include <linux/rcupdate.h>
36 #include <trace/events/bcache.h>
37 
38 /*
39  * Todo:
40  * register_bcache: Return errors out to userspace correctly
41  *
42  * Writeback: don't undirty key until after a cache flush
43  *
44  * Create an iterator for key pointers
45  *
46  * On btree write error, mark bucket such that it won't be freed from the cache
47  *
48  * Journalling:
49  *   Check for bad keys in replay
50  *   Propagate barriers
51  *   Refcount journal entries in journal_replay
52  *
53  * Garbage collection:
54  *   Finish incremental gc
55  *   Gc should free old UUIDs, data for invalid UUIDs
56  *
57  * Provide a way to list backing device UUIDs we have data cached for, and
58  * probably how long it's been since we've seen them, and a way to invalidate
59  * dirty data for devices that will never be attached again
60  *
61  * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
62  * that based on that and how much dirty data we have we can keep writeback
63  * from being starved
64  *
65  * Add a tracepoint or somesuch to watch for writeback starvation
66  *
67  * When btree depth > 1 and splitting an interior node, we have to make sure
68  * alloc_bucket() cannot fail. This should be true but is not completely
69  * obvious.
70  *
71  * Make sure all allocations get charged to the root cgroup
72  *
73  * Plugging?
74  *
75  * If data write is less than hard sector size of ssd, round up offset in open
76  * bucket to the next whole sector
77  *
78  * Also lookup by cgroup in get_open_bucket()
79  *
80  * Superblock needs to be fleshed out for multiple cache devices
81  *
82  * Add a sysfs tunable for the number of writeback IOs in flight
83  *
84  * Add a sysfs tunable for the number of open data buckets
85  *
86  * IO tracking: Can we track when one process is doing IO on behalf of another?
87  * IO tracking: Don't use just an average, weigh more recent stuff higher
88  *
89  * Test module load/unload
90  */
91 
92 enum {
93 	BTREE_INSERT_STATUS_INSERT,
94 	BTREE_INSERT_STATUS_BACK_MERGE,
95 	BTREE_INSERT_STATUS_OVERWROTE,
96 	BTREE_INSERT_STATUS_FRONT_MERGE,
97 };
98 
99 #define MAX_NEED_GC		64
100 #define MAX_SAVE_PRIO		72
101 
102 #define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))
103 
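/*
 * Identity used for the in-memory btree node hash (see mca_hash() and
 * mca_find() below), derived from the node's first pointer - its offset and
 * its generation.
 */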
104 #define PTR_HASH(c, k)							\
105 	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))
106 
107 static struct workqueue_struct *btree_io_wq;
108 
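/*
 * A node should be split when it is already fully written, or when the
 * currently unwritten bset (plus a little slack for keys about to be
 * inserted) would push it past the node's total block count.
 */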
109 static inline bool should_split(struct btree *b)
110 {
111 	struct bset *i = write_block(b);
112 	return b->written >= btree_blocks(b) ||
113 		(b->written + __set_blocks(i, i->keys + 15, b->c)
114 		 > btree_blocks(b));
115 }
116 
117 #define insert_lock(s, b)	((b)->level <= (s)->lock)
118 
119 /*
120  * These macros are for recursing down the btree - they handle the details of
121  * locking and looking up nodes in the cache for you. They're best treated as
122  * mere syntax when reading code that uses them.
123  *
124  * op->lock determines whether we take a read or a write lock at a given depth.
125  * If you've got a read lock and find that you need a write lock (i.e. you're
126  * going to have to split), set op->lock and return -EINTR; btree_root() will
127  * call you again and you'll have the correct lock.
128  */
129 
130 /**
131  * btree - recurse down the btree on a specified key
132  * @fn:		function to call, which will be passed the child node
133  * @key:	key to recurse on
134  * @b:		parent btree node
135  * @op:		pointer to struct btree_op
136  */
137 #define btree(fn, key, b, op, ...)					\
138 ({									\
139 	int _r, l = (b)->level - 1;					\
140 	bool _w = l <= (op)->lock;					\
141 	struct btree *_child = bch_btree_node_get((b)->c, key, l, _w);	\
142 	if (!IS_ERR(_child)) {						\
143 		_child->parent = (b);					\
144 		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
145 		rw_unlock(_w, _child);					\
146 	} else								\
147 		_r = PTR_ERR(_child);					\
148 	_r;								\
149 })
150 
151 /**
152  * btree_root - call a function on the root of the btree
153  * @fn:		function to call, which will be passed the root node
154  * @c:		cache set
155  * @op:		pointer to struct btree_op
156  */
157 #define btree_root(fn, c, op, ...)					\
158 ({									\
159 	int _r = -EINTR;						\
160 	do {								\
161 		struct btree *_b = (c)->root;				\
162 		bool _w = insert_lock(op, _b);				\
163 		rw_lock(_w, _b, _b->level);				\
164 		if (_b == (c)->root &&					\
165 		    _w == insert_lock(op, _b)) {			\
166 			_b->parent = NULL;				\
167 			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
168 		}							\
169 		rw_unlock(_w, _b);					\
170 		bch_cannibalize_unlock(c);				\
171 		if (_r == -ENOSPC) {					\
172 			wait_event((c)->try_wait,			\
173 				   !(c)->try_harder);			\
174 			_r = -EINTR;					\
175 		}							\
176 	} while (_r == -EINTR);						\
177 									\
178 	_r;								\
179 })
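
/*
 * A rough usage sketch of the two macros above (hypothetical names; compare
 * the real callers bch_btree_check_recurse()/bch_btree_check() later in this
 * file): the recursive worker is named bch_btree_<fn> and is invoked through
 * btree() for each child key, while the top level starts it via btree_root(),
 * which keeps retrying for as long as the worker returns -EINTR.
 */
#if 0
static int bch_btree_example_recurse(struct btree *b, struct btree_op *op)
{
	int ret = 0;
	struct bkey *k;
	struct btree_iter iter;

	if (!b->level)
		return 0;	/* leaf node: nothing to recurse into */

	bch_btree_iter_init(b, &iter, NULL);

	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
		/* locks the child as dictated by op->lock, unlocks on return */
		ret = btree(example_recurse, k, b, op);
		if (ret)
			break;
	}

	return ret;
}

static int bch_btree_example(struct cache_set *c, struct btree_op *op)
{
	return btree_root(example_recurse, c, op);
}
#endif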
180 
181 /* Btree key manipulation */
182 
183 void bkey_put(struct cache_set *c, struct bkey *k)
184 {
185 	unsigned i;
186 
187 	for (i = 0; i < KEY_PTRS(k); i++)
188 		if (ptr_available(c, k, i))
189 			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
190 }
191 
192 /* Btree IO */
193 
194 static uint64_t btree_csum_set(struct btree *b, struct bset *i)
195 {
196 	uint64_t crc = b->key.ptr[0];
197 	void *data = (void *) i + 8, *end = end(i);
198 
199 	crc = bch_crc64_update(crc, data, end - data);
200 	return crc ^ 0xffffffffffffffffULL;
201 }
202 
203 static void bch_btree_node_read_done(struct btree *b)
204 {
205 	const char *err = "bad btree header";
206 	struct bset *i = b->sets[0].data;
207 	struct btree_iter *iter;
208 
209 	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
210 	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
211 	iter->used = 0;
212 
213 #ifdef CONFIG_BCACHE_DEBUG
214 	iter->b = b;
215 #endif
216 
217 	if (!i->seq)
218 		goto err;
219 
220 	for (;
221 	     b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
222 	     i = write_block(b)) {
223 		err = "unsupported bset version";
224 		if (i->version > BCACHE_BSET_VERSION)
225 			goto err;
226 
227 		err = "bad btree header";
228 		if (b->written + set_blocks(i, b->c) > btree_blocks(b))
229 			goto err;
230 
231 		err = "bad magic";
232 		if (i->magic != bset_magic(&b->c->sb))
233 			goto err;
234 
235 		err = "bad checksum";
236 		switch (i->version) {
237 		case 0:
238 			if (i->csum != csum_set(i))
239 				goto err;
240 			break;
241 		case BCACHE_BSET_VERSION:
242 			if (i->csum != btree_csum_set(b, i))
243 				goto err;
244 			break;
245 		}
246 
247 		err = "empty set";
248 		if (i != b->sets[0].data && !i->keys)
249 			goto err;
250 
251 		bch_btree_iter_push(iter, i->start, end(i));
252 
253 		b->written += set_blocks(i, b->c);
254 	}
255 
256 	err = "corrupted btree";
257 	for (i = write_block(b);
258 	     index(i, b) < btree_blocks(b);
259 	     i = ((void *) i) + block_bytes(b->c))
260 		if (i->seq == b->sets[0].data->seq)
261 			goto err;
262 
263 	bch_btree_sort_and_fix_extents(b, iter);
264 
265 	i = b->sets[0].data;
266 	err = "short btree key";
267 	if (b->sets[0].size &&
268 	    bkey_cmp(&b->key, &b->sets[0].end) < 0)
269 		goto err;
270 
271 	if (b->written < btree_blocks(b))
272 		bch_bset_init_next(b);
273 out:
274 	mempool_free(iter, b->c->fill_iter);
275 	return;
276 err:
277 	set_btree_node_io_error(b);
278 	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
279 			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
280 			    index(i, b), i->keys);
281 	goto out;
282 }
283 
284 static void btree_node_read_endio(struct bio *bio, int error)
285 {
286 	struct closure *cl = bio->bi_private;
287 	closure_put(cl);
288 }
289 
290 void bch_btree_node_read(struct btree *b)
291 {
292 	uint64_t start_time = local_clock();
293 	struct closure cl;
294 	struct bio *bio;
295 
296 	trace_bcache_btree_read(b);
297 
298 	closure_init_stack(&cl);
299 
300 	bio = bch_bbio_alloc(b->c);
301 	bio->bi_rw	= REQ_META|READ_SYNC;
302 	bio->bi_size	= KEY_SIZE(&b->key) << 9;
303 	bio->bi_end_io	= btree_node_read_endio;
304 	bio->bi_private	= &cl;
305 
306 	bch_bio_map(bio, b->sets[0].data);
307 
308 	bch_submit_bbio(bio, b->c, &b->key, 0);
309 	closure_sync(&cl);
310 
311 	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
312 		set_btree_node_io_error(b);
313 
314 	bch_bbio_free(bio, b->c);
315 
316 	if (btree_node_io_error(b))
317 		goto err;
318 
319 	bch_btree_node_read_done(b);
320 	bch_time_stats_update(&b->c->btree_read_time, start_time);
321 
322 	return;
323 err:
324 	bch_cache_set_error(b->c, "io error reading bucket %zu",
325 			    PTR_BUCKET_NR(b->c, &b->key, 0));
326 }
327 
328 static void btree_complete_write(struct btree *b, struct btree_write *w)
329 {
330 	if (w->prio_blocked &&
331 	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
332 		wake_up_allocators(b->c);
333 
334 	if (w->journal) {
335 		atomic_dec_bug(w->journal);
336 		__closure_wake_up(&b->c->journal.wait);
337 	}
338 
339 	w->prio_blocked	= 0;
340 	w->journal	= NULL;
341 }
342 
343 static void __btree_node_write_done(struct closure *cl)
344 {
345 	struct btree *b = container_of(cl, struct btree, io.cl);
346 	struct btree_write *w = btree_prev_write(b);
347 
348 	bch_bbio_free(b->bio, b->c);
349 	b->bio = NULL;
350 	btree_complete_write(b, w);
351 
352 	if (btree_node_dirty(b))
353 		queue_delayed_work(btree_io_wq, &b->work,
354 				   msecs_to_jiffies(30000));
355 
356 	closure_return(cl);
357 }
358 
359 static void btree_node_write_done(struct closure *cl)
360 {
361 	struct btree *b = container_of(cl, struct btree, io.cl);
362 	struct bio_vec *bv;
363 	int n;
364 
365 	__bio_for_each_segment(bv, b->bio, n, 0)
366 		__free_page(bv->bv_page);
367 
368 	__btree_node_write_done(cl);
369 }
370 
371 static void btree_node_write_endio(struct bio *bio, int error)
372 {
373 	struct closure *cl = bio->bi_private;
374 	struct btree *b = container_of(cl, struct btree, io.cl);
375 
376 	if (error)
377 		set_btree_node_io_error(b);
378 
379 	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
380 	closure_put(cl);
381 }
382 
383 static void do_btree_node_write(struct btree *b)
384 {
385 	struct closure *cl = &b->io.cl;
386 	struct bset *i = b->sets[b->nsets].data;
387 	BKEY_PADDED(key) k;
388 
389 	i->version	= BCACHE_BSET_VERSION;
390 	i->csum		= btree_csum_set(b, i);
391 
392 	BUG_ON(b->bio);
393 	b->bio = bch_bbio_alloc(b->c);
394 
395 	b->bio->bi_end_io	= btree_node_write_endio;
396 	b->bio->bi_private	= cl;
397 	b->bio->bi_rw		= REQ_META|WRITE_SYNC|REQ_FUA;
398 	b->bio->bi_size		= set_blocks(i, b->c) * block_bytes(b->c);
399 	bch_bio_map(b->bio, i);
400 
401 	/*
402 	 * If we're appending to a leaf node, we don't technically need FUA -
403 	 * this write just needs to be persisted before the next journal write,
404 	 * which will be marked FLUSH|FUA.
405 	 *
406 	 * Similarly if we're writing a new btree root - the pointer is going to
407 	 * be in the next journal entry.
408 	 *
409 	 * But if we're writing a new btree node (that isn't a root) or
410 	 * appending to a non-leaf btree node, we need either FUA or a flush
411 	 * when we write the parent with the new pointer. FUA is cheaper than a
412 	 * flush, and writes appending to leaf nodes aren't blocking anything so
413 	 * just make all btree node writes FUA to keep things sane.
414 	 */
415 
416 	bkey_copy(&k.key, &b->key);
417 	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));
418 
419 	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
420 		int j;
421 		struct bio_vec *bv;
422 		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));
423 
424 		bio_for_each_segment(bv, b->bio, j)
425 			memcpy(page_address(bv->bv_page),
426 			       base + j * PAGE_SIZE, PAGE_SIZE);
427 
428 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
429 
430 		continue_at(cl, btree_node_write_done, NULL);
431 	} else {
432 		b->bio->bi_vcnt = 0;
433 		bch_bio_map(b->bio, i);
434 
435 		bch_submit_bbio(b->bio, b->c, &k.key, 0);
436 
437 		closure_sync(cl);
438 		__btree_node_write_done(cl);
439 	}
440 }
441 
442 void bch_btree_node_write(struct btree *b, struct closure *parent)
443 {
444 	struct bset *i = b->sets[b->nsets].data;
445 
446 	trace_bcache_btree_write(b);
447 
448 	BUG_ON(current->bio_list);
449 	BUG_ON(b->written >= btree_blocks(b));
450 	BUG_ON(b->written && !i->keys);
451 	BUG_ON(b->sets->data->seq != i->seq);
452 	bch_check_keys(b, "writing");
453 
454 	cancel_delayed_work(&b->work);
455 
456 	/* If caller isn't waiting for write, parent refcount is cache set */
457 	closure_lock(&b->io, parent ?: &b->c->cl);
458 
459 	clear_bit(BTREE_NODE_dirty,	 &b->flags);
460 	change_bit(BTREE_NODE_write_idx, &b->flags);
461 
462 	do_btree_node_write(b);
463 
464 	b->written += set_blocks(i, b->c);
465 	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
466 			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);
467 
468 	bch_btree_sort_lazy(b);
469 
470 	if (b->written < btree_blocks(b))
471 		bch_bset_init_next(b);
472 }
473 
474 static void bch_btree_node_write_sync(struct btree *b)
475 {
476 	struct closure cl;
477 
478 	closure_init_stack(&cl);
479 	bch_btree_node_write(b, &cl);
480 	closure_sync(&cl);
481 }
482 
483 static void btree_node_write_work(struct work_struct *w)
484 {
485 	struct btree *b = container_of(to_delayed_work(w), struct btree, work);
486 
487 	rw_lock(true, b, b->level);
488 
489 	if (btree_node_dirty(b))
490 		bch_btree_node_write(b, NULL);
491 	rw_unlock(true, b);
492 }
493 
494 static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
495 {
496 	struct bset *i = b->sets[b->nsets].data;
497 	struct btree_write *w = btree_current_write(b);
498 
499 	BUG_ON(!b->written);
500 	BUG_ON(!i->keys);
501 
502 	if (!btree_node_dirty(b))
503 		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);
504 
505 	set_btree_node_dirty(b);
506 
507 	if (journal_ref) {
508 		if (w->journal &&
509 		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
510 			atomic_dec_bug(w->journal);
511 			w->journal = NULL;
512 		}
513 
514 		if (!w->journal) {
515 			w->journal = journal_ref;
516 			atomic_inc(w->journal);
517 		}
518 	}
519 
520 	/* Force write if set is too big */
521 	if (set_bytes(i) > PAGE_SIZE - 48 &&
522 	    !current->bio_list)
523 		bch_btree_node_write(b, NULL);
524 }
525 
526 /*
527  * Btree in memory cache - allocation/freeing
528  * mca -> memory cache
529  */
530 
531 static void mca_reinit(struct btree *b)
532 {
533 	unsigned i;
534 
535 	b->flags	= 0;
536 	b->written	= 0;
537 	b->nsets	= 0;
538 
539 	for (i = 0; i < MAX_BSETS; i++)
540 		b->sets[i].size = 0;
541 	/*
542 	 * Second loop starts at 1 because b->sets[0].data is the memory we
543 	 * allocated
544 	 */
545 	for (i = 1; i < MAX_BSETS; i++)
546 		b->sets[i].data = NULL;
547 }
548 
549 #define mca_reserve(c)	(((c->root && c->root->level)		\
550 			  ? c->root->level : 1) * 8 + 16)
551 #define mca_can_free(c)						\
552 	max_t(int, 0, c->bucket_cache_used - mca_reserve(c))
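
/*
 * Worked example: with a btree of depth 2 (c->root->level == 2) the reserve
 * is 2 * 8 + 16 = 32 cached nodes; if bucket_cache_used is 100, the shrinker
 * may reclaim at most 100 - 32 = 68 of them.
 */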
553 
554 static void mca_data_free(struct btree *b)
555 {
556 	struct bset_tree *t = b->sets;
557 	BUG_ON(!closure_is_unlocked(&b->io.cl));
558 
559 	if (bset_prev_bytes(b) < PAGE_SIZE)
560 		kfree(t->prev);
561 	else
562 		free_pages((unsigned long) t->prev,
563 			   get_order(bset_prev_bytes(b)));
564 
565 	if (bset_tree_bytes(b) < PAGE_SIZE)
566 		kfree(t->tree);
567 	else
568 		free_pages((unsigned long) t->tree,
569 			   get_order(bset_tree_bytes(b)));
570 
571 	free_pages((unsigned long) t->data, b->page_order);
572 
573 	t->prev = NULL;
574 	t->tree = NULL;
575 	t->data = NULL;
576 	list_move(&b->list, &b->c->btree_cache_freed);
577 	b->c->bucket_cache_used--;
578 }
579 
580 static void mca_bucket_free(struct btree *b)
581 {
582 	BUG_ON(btree_node_dirty(b));
583 
584 	b->key.ptr[0] = 0;
585 	hlist_del_init_rcu(&b->hash);
586 	list_move(&b->list, &b->c->btree_cache_freeable);
587 }
588 
589 static unsigned btree_order(struct bkey *k)
590 {
591 	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
592 }
593 
594 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
595 {
596 	struct bset_tree *t = b->sets;
597 	BUG_ON(t->data);
598 
599 	b->page_order = max_t(unsigned,
600 			      ilog2(b->c->btree_pages),
601 			      btree_order(k));
602 
603 	t->data = (void *) __get_free_pages(gfp, b->page_order);
604 	if (!t->data)
605 		goto err;
606 
607 	t->tree = bset_tree_bytes(b) < PAGE_SIZE
608 		? kmalloc(bset_tree_bytes(b), gfp)
609 		: (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b)));
610 	if (!t->tree)
611 		goto err;
612 
613 	t->prev = bset_prev_bytes(b) < PAGE_SIZE
614 		? kmalloc(bset_prev_bytes(b), gfp)
615 		: (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b)));
616 	if (!t->prev)
617 		goto err;
618 
619 	list_move(&b->list, &b->c->btree_cache);
620 	b->c->bucket_cache_used++;
621 	return;
622 err:
623 	mca_data_free(b);
624 }
625 
626 static struct btree *mca_bucket_alloc(struct cache_set *c,
627 				      struct bkey *k, gfp_t gfp)
628 {
629 	struct btree *b = kzalloc(sizeof(struct btree), gfp);
630 	if (!b)
631 		return NULL;
632 
633 	init_rwsem(&b->lock);
634 	lockdep_set_novalidate_class(&b->lock);
635 	INIT_LIST_HEAD(&b->list);
636 	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
637 	b->c = c;
638 	closure_init_unlocked(&b->io);
639 
640 	mca_data_alloc(b, k, gfp);
641 	return b;
642 }
643 
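/*
 * Try to reclaim a cached btree node: take its write lock, and fail with
 * -ENOMEM if the node is busy, smaller than min_order, or dirty/mid-write
 * when the caller didn't ask to flush. On success the node is clean, idle,
 * and returned with the write lock still held.
 */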
644 static int mca_reap(struct btree *b, unsigned min_order, bool flush)
645 {
646 	struct closure cl;
647 
648 	closure_init_stack(&cl);
649 	lockdep_assert_held(&b->c->bucket_lock);
650 
651 	if (!down_write_trylock(&b->lock))
652 		return -ENOMEM;
653 
654 	BUG_ON(btree_node_dirty(b) && !b->sets[0].data);
655 
656 	if (b->page_order < min_order ||
657 	    (!flush &&
658 	     (btree_node_dirty(b) ||
659 	      atomic_read(&b->io.cl.remaining) != -1))) {
660 		rw_unlock(true, b);
661 		return -ENOMEM;
662 	}
663 
664 	if (btree_node_dirty(b))
665 		bch_btree_node_write_sync(b);
666 
667 	/* wait for any in flight btree write */
668 	closure_wait_event(&b->io.wait, &cl,
669 			   atomic_read(&b->io.cl.remaining) == -1);
670 
671 	return 0;
672 }
673 
674 static unsigned long bch_mca_scan(struct shrinker *shrink,
675 				  struct shrink_control *sc)
676 {
677 	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
678 	struct btree *b, *t;
679 	unsigned long i, nr = sc->nr_to_scan;
680 	unsigned long freed = 0;
681 
682 	if (c->shrinker_disabled)
683 		return SHRINK_STOP;
684 
685 	if (c->try_harder)
686 		return SHRINK_STOP;
687 
688 	/* Return -1 if we can't do anything right now */
689 	if (sc->gfp_mask & __GFP_IO)
690 		mutex_lock(&c->bucket_lock);
691 	else if (!mutex_trylock(&c->bucket_lock))
692 		return -1;
693 
694 	/*
695 	 * It's _really_ critical that we don't free too many btree nodes - we
696 	 * have to always leave ourselves a reserve. The reserve is how we
697 	 * guarantee that allocating memory for a new btree node can always
698 	 * succeed, so that inserting keys into the btree can always succeed and
699 	 * IO can always make forward progress:
700 	 */
701 	nr /= c->btree_pages;
702 	nr = min_t(unsigned long, nr, mca_can_free(c));
703 
704 	i = 0;
705 	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
706 		if (freed >= nr)
707 			break;
708 
709 		if (++i > 3 &&
710 		    !mca_reap(b, 0, false)) {
711 			mca_data_free(b);
712 			rw_unlock(true, b);
713 			freed++;
714 		}
715 	}
716 
717 	/*
718 	 * Can happen right when we first start up, before we've read in any
719 	 * btree nodes
720 	 */
721 	if (list_empty(&c->btree_cache))
722 		goto out;
723 
724 	for (i = 0; (nr--) && i < c->bucket_cache_used; i++) {
725 		b = list_first_entry(&c->btree_cache, struct btree, list);
726 		list_rotate_left(&c->btree_cache);
727 
728 		if (!b->accessed &&
729 		    !mca_reap(b, 0, false)) {
730 			mca_bucket_free(b);
731 			mca_data_free(b);
732 			rw_unlock(true, b);
733 			freed++;
734 		} else
735 			b->accessed = 0;
736 	}
737 out:
738 	mutex_unlock(&c->bucket_lock);
739 	return freed;
740 }
741 
742 static unsigned long bch_mca_count(struct shrinker *shrink,
743 				   struct shrink_control *sc)
744 {
745 	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
746 
747 	if (c->shrinker_disabled)
748 		return 0;
749 
750 	if (c->try_harder)
751 		return 0;
752 
753 	return mca_can_free(c) * c->btree_pages;
754 }
755 
756 void bch_btree_cache_free(struct cache_set *c)
757 {
758 	struct btree *b;
759 	struct closure cl;
760 	closure_init_stack(&cl);
761 
762 	if (c->shrink.list.next)
763 		unregister_shrinker(&c->shrink);
764 
765 	mutex_lock(&c->bucket_lock);
766 
767 #ifdef CONFIG_BCACHE_DEBUG
768 	if (c->verify_data)
769 		list_move(&c->verify_data->list, &c->btree_cache);
770 #endif
771 
772 	list_splice(&c->btree_cache_freeable,
773 		    &c->btree_cache);
774 
775 	while (!list_empty(&c->btree_cache)) {
776 		b = list_first_entry(&c->btree_cache, struct btree, list);
777 
778 		if (btree_node_dirty(b))
779 			btree_complete_write(b, btree_current_write(b));
780 		clear_bit(BTREE_NODE_dirty, &b->flags);
781 
782 		mca_data_free(b);
783 	}
784 
785 	while (!list_empty(&c->btree_cache_freed)) {
786 		b = list_first_entry(&c->btree_cache_freed,
787 				     struct btree, list);
788 		list_del(&b->list);
789 		cancel_delayed_work_sync(&b->work);
790 		kfree(b);
791 	}
792 
793 	mutex_unlock(&c->bucket_lock);
794 }
795 
796 int bch_btree_cache_alloc(struct cache_set *c)
797 {
798 	unsigned i;
799 
800 	for (i = 0; i < mca_reserve(c); i++)
801 		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
802 			return -ENOMEM;
803 
804 	list_splice_init(&c->btree_cache,
805 			 &c->btree_cache_freeable);
806 
807 #ifdef CONFIG_BCACHE_DEBUG
808 	mutex_init(&c->verify_lock);
809 
810 	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);
811 
812 	if (c->verify_data &&
813 	    c->verify_data->sets[0].data)
814 		list_del_init(&c->verify_data->list);
815 	else
816 		c->verify_data = NULL;
817 #endif
818 
819 	c->shrink.count_objects = bch_mca_count;
820 	c->shrink.scan_objects = bch_mca_scan;
821 	c->shrink.seeks = 4;
822 	c->shrink.batch = c->btree_pages * 2;
823 	register_shrinker(&c->shrink);
824 
825 	return 0;
826 }
827 
828 /* Btree in memory cache - hash table */
829 
830 static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
831 {
832 	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
833 }
834 
835 static struct btree *mca_find(struct cache_set *c, struct bkey *k)
836 {
837 	struct btree *b;
838 
839 	rcu_read_lock();
840 	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
841 		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
842 			goto out;
843 	b = NULL;
844 out:
845 	rcu_read_unlock();
846 	return b;
847 }
848 
849 static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k)
850 {
851 	struct btree *b;
852 
853 	trace_bcache_btree_cache_cannibalize(c);
854 
855 	if (!c->try_harder) {
856 		c->try_harder = current;
857 		c->try_harder_start = local_clock();
858 	} else if (c->try_harder != current)
859 		return ERR_PTR(-ENOSPC);
860 
861 	list_for_each_entry_reverse(b, &c->btree_cache, list)
862 		if (!mca_reap(b, btree_order(k), false))
863 			return b;
864 
865 	list_for_each_entry_reverse(b, &c->btree_cache, list)
866 		if (!mca_reap(b, btree_order(k), true))
867 			return b;
868 
869 	return ERR_PTR(-ENOMEM);
870 }
871 
872 /*
873  * We can only have one thread cannibalizing other cached btree nodes at a time,
874  * or we'll deadlock. We use an open coded mutex (c->try_harder) to ensure
875  * that; mca_cannibalize() takes it. This means every time we unlock the root of
876  * the btree, we need to release this lock if we have it held.
877  */
878 static void bch_cannibalize_unlock(struct cache_set *c)
879 {
880 	if (c->try_harder == current) {
881 		bch_time_stats_update(&c->try_harder_time, c->try_harder_start);
882 		c->try_harder = NULL;
883 		wake_up(&c->try_wait);
884 	}
885 }
886 
887 static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, int level)
888 {
889 	struct btree *b;
890 
891 	BUG_ON(current->bio_list);
892 
893 	lockdep_assert_held(&c->bucket_lock);
894 
895 	if (mca_find(c, k))
896 		return NULL;
897 
898 	/* btree_node_free() doesn't free memory; it sticks the node on the end of
899 	 * the list. Check if there are any freed nodes there:
900 	 */
901 	list_for_each_entry(b, &c->btree_cache_freeable, list)
902 		if (!mca_reap(b, btree_order(k), false))
903 			goto out;
904 
905 	/* We never free struct btree itself, just the memory that holds the on
906 	 * disk node. Check the freed list before allocating a new one:
907 	 */
908 	list_for_each_entry(b, &c->btree_cache_freed, list)
909 		if (!mca_reap(b, 0, false)) {
910 			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
911 			if (!b->sets[0].data)
912 				goto err;
913 			else
914 				goto out;
915 		}
916 
917 	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
918 	if (!b)
919 		goto err;
920 
921 	BUG_ON(!down_write_trylock(&b->lock));
922 	if (!b->sets->data)
923 		goto err;
924 out:
925 	BUG_ON(!closure_is_unlocked(&b->io.cl));
926 
927 	bkey_copy(&b->key, k);
928 	list_move(&b->list, &c->btree_cache);
929 	hlist_del_init_rcu(&b->hash);
930 	hlist_add_head_rcu(&b->hash, mca_hash(c, k));
931 
932 	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
933 	b->level	= level;
934 	b->parent	= (void *) ~0UL;
935 
936 	mca_reinit(b);
937 
938 	return b;
939 err:
940 	if (b)
941 		rw_unlock(true, b);
942 
943 	b = mca_cannibalize(c, k);
944 	if (!IS_ERR(b))
945 		goto out;
946 
947 	return b;
948 }
949 
950 /**
951  * bch_btree_node_get - find a btree node in the cache and lock it, reading it
952  * in from disk if necessary.
953  *
954  * If IO is necessary and running under generic_make_request, returns -EAGAIN.
955  *
956  * The btree node will have either a read or a write lock held, depending on
957  * level and op->lock.
958  */
959 struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k,
960 				 int level, bool write)
961 {
962 	int i = 0;
963 	struct btree *b;
964 
965 	BUG_ON(level < 0);
966 retry:
967 	b = mca_find(c, k);
968 
969 	if (!b) {
970 		if (current->bio_list)
971 			return ERR_PTR(-EAGAIN);
972 
973 		mutex_lock(&c->bucket_lock);
974 		b = mca_alloc(c, k, level);
975 		mutex_unlock(&c->bucket_lock);
976 
977 		if (!b)
978 			goto retry;
979 		if (IS_ERR(b))
980 			return b;
981 
982 		bch_btree_node_read(b);
983 
984 		if (!write)
985 			downgrade_write(&b->lock);
986 	} else {
987 		rw_lock(write, b, level);
988 		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
989 			rw_unlock(write, b);
990 			goto retry;
991 		}
992 		BUG_ON(b->level != level);
993 	}
994 
995 	b->accessed = 1;
996 
997 	for (; i <= b->nsets && b->sets[i].size; i++) {
998 		prefetch(b->sets[i].tree);
999 		prefetch(b->sets[i].data);
1000 	}
1001 
1002 	for (; i <= b->nsets; i++)
1003 		prefetch(b->sets[i].data);
1004 
1005 	if (btree_node_io_error(b)) {
1006 		rw_unlock(write, b);
1007 		return ERR_PTR(-EIO);
1008 	}
1009 
1010 	BUG_ON(!b->written);
1011 
1012 	return b;
1013 }
1014 
1015 static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
1016 {
1017 	struct btree *b;
1018 
1019 	mutex_lock(&c->bucket_lock);
1020 	b = mca_alloc(c, k, level);
1021 	mutex_unlock(&c->bucket_lock);
1022 
1023 	if (!IS_ERR_OR_NULL(b)) {
1024 		bch_btree_node_read(b);
1025 		rw_unlock(true, b);
1026 	}
1027 }
1028 
1029 /* Btree alloc */
1030 
1031 static void btree_node_free(struct btree *b)
1032 {
1033 	unsigned i;
1034 
1035 	trace_bcache_btree_node_free(b);
1036 
1037 	BUG_ON(b == b->c->root);
1038 
1039 	if (btree_node_dirty(b))
1040 		btree_complete_write(b, btree_current_write(b));
1041 	clear_bit(BTREE_NODE_dirty, &b->flags);
1042 
1043 	cancel_delayed_work(&b->work);
1044 
1045 	mutex_lock(&b->c->bucket_lock);
1046 
1047 	for (i = 0; i < KEY_PTRS(&b->key); i++) {
1048 		BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin));
1049 
1050 		bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
1051 			    PTR_BUCKET(b->c, &b->key, i));
1052 	}
1053 
1054 	bch_bucket_free(b->c, &b->key);
1055 	mca_bucket_free(b);
1056 	mutex_unlock(&b->c->bucket_lock);
1057 }
1058 
1059 struct btree *bch_btree_node_alloc(struct cache_set *c, int level, bool wait)
1060 {
1061 	BKEY_PADDED(key) k;
1062 	struct btree *b = ERR_PTR(-EAGAIN);
1063 
1064 	mutex_lock(&c->bucket_lock);
1065 retry:
1066 	if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, wait))
1067 		goto err;
1068 
1069 	bkey_put(c, &k.key);
1070 	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);
1071 
1072 	b = mca_alloc(c, &k.key, level);
1073 	if (IS_ERR(b))
1074 		goto err_free;
1075 
1076 	if (!b) {
1077 		cache_bug(c,
1078 			"Tried to allocate bucket that was in btree cache");
1079 		goto retry;
1080 	}
1081 
1082 	b->accessed = 1;
1083 	bch_bset_init_next(b);
1084 
1085 	mutex_unlock(&c->bucket_lock);
1086 
1087 	trace_bcache_btree_node_alloc(b);
1088 	return b;
1089 err_free:
1090 	bch_bucket_free(c, &k.key);
1091 err:
1092 	mutex_unlock(&c->bucket_lock);
1093 
1094 	trace_bcache_btree_node_alloc_fail(b);
1095 	return b;
1096 }
1097 
1098 static struct btree *btree_node_alloc_replacement(struct btree *b, bool wait)
1099 {
1100 	struct btree *n = bch_btree_node_alloc(b->c, b->level, wait);
1101 	if (!IS_ERR_OR_NULL(n))
1102 		bch_btree_sort_into(b, n);
1103 
1104 	return n;
1105 }
1106 
1107 static void make_btree_freeing_key(struct btree *b, struct bkey *k)
1108 {
1109 	unsigned i;
1110 
1111 	bkey_copy(k, &b->key);
1112 	bkey_copy_key(k, &ZERO_KEY);
1113 
1114 	for (i = 0; i < KEY_PTRS(k); i++) {
1115 		uint8_t g = PTR_BUCKET(b->c, k, i)->gen + 1;
1116 
1117 		SET_PTR_GEN(k, i, g);
1118 	}
1119 
1120 	atomic_inc(&b->c->prio_blocked);
1121 }
1122 
1123 /* Garbage collection */
1124 
1125 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k)
1126 {
1127 	uint8_t stale = 0;
1128 	unsigned i;
1129 	struct bucket *g;
1130 
1131 	/*
1132 	 * ptr_invalid() can't return true for the keys that mark btree nodes as
1133 	 * freed, but since ptr_bad() returns true we'll never actually use them
1134 	 * for anything and thus we don't want to mark their pointers here
1135 	 */
1136 	if (!bkey_cmp(k, &ZERO_KEY))
1137 		return stale;
1138 
1139 	for (i = 0; i < KEY_PTRS(k); i++) {
1140 		if (!ptr_available(c, k, i))
1141 			continue;
1142 
1143 		g = PTR_BUCKET(c, k, i);
1144 
1145 		if (gen_after(g->gc_gen, PTR_GEN(k, i)))
1146 			g->gc_gen = PTR_GEN(k, i);
1147 
1148 		if (ptr_stale(c, k, i)) {
1149 			stale = max(stale, ptr_stale(c, k, i));
1150 			continue;
1151 		}
1152 
1153 		cache_bug_on(GC_MARK(g) &&
1154 			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
1155 			     c, "inconsistent ptrs: mark = %llu, level = %i",
1156 			     GC_MARK(g), level);
1157 
1158 		if (level)
1159 			SET_GC_MARK(g, GC_MARK_METADATA);
1160 		else if (KEY_DIRTY(k))
1161 			SET_GC_MARK(g, GC_MARK_DIRTY);
1162 
1163 		/* guard against overflow */
1164 		SET_GC_SECTORS_USED(g, min_t(unsigned,
1165 					     GC_SECTORS_USED(g) + KEY_SIZE(k),
1166 					     (1 << 14) - 1));
1167 
1168 		BUG_ON(!GC_SECTORS_USED(g));
1169 	}
1170 
1171 	return stale;
1172 }
1173 
1174 #define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)
1175 
1176 static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
1177 {
1178 	uint8_t stale = 0;
1179 	unsigned keys = 0, good_keys = 0;
1180 	struct bkey *k;
1181 	struct btree_iter iter;
1182 	struct bset_tree *t;
1183 
1184 	gc->nodes++;
1185 
1186 	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1187 		stale = max(stale, btree_mark_key(b, k));
1188 		keys++;
1189 
1190 		if (bch_ptr_bad(b, k))
1191 			continue;
1192 
1193 		gc->key_bytes += bkey_u64s(k);
1194 		gc->nkeys++;
1195 		good_keys++;
1196 
1197 		gc->data += KEY_SIZE(k);
1198 	}
1199 
1200 	for (t = b->sets; t <= &b->sets[b->nsets]; t++)
1201 		btree_bug_on(t->size &&
1202 			     bset_written(b, t) &&
1203 			     bkey_cmp(&b->key, &t->end) < 0,
1204 			     b, "found short btree key in gc");
1205 
1206 	if (b->c->gc_always_rewrite)
1207 		return true;
1208 
1209 	if (stale > 10)
1210 		return true;
1211 
1212 	if ((keys - good_keys) * 2 > keys)
1213 		return true;
1214 
1215 	return false;
1216 }
1217 
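/*
 * Gc walks the tree keeping a sliding window of the last GC_MERGE_NODES
 * sibling nodes visited (struct gc_merge_info below); when their keys would
 * fit in fewer nodes, btree_gc_coalesce() rewrites them into fewer, fuller
 * nodes.
 */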
1218 #define GC_MERGE_NODES	4U
1219 
1220 struct gc_merge_info {
1221 	struct btree	*b;
1222 	unsigned	keys;
1223 };
1224 
1225 static int bch_btree_insert_node(struct btree *, struct btree_op *,
1226 				 struct keylist *, atomic_t *, struct bkey *);
1227 
1228 static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
1229 			     struct keylist *keylist, struct gc_stat *gc,
1230 			     struct gc_merge_info *r)
1231 {
1232 	unsigned i, nodes = 0, keys = 0, blocks;
1233 	struct btree *new_nodes[GC_MERGE_NODES];
1234 	struct closure cl;
1235 	struct bkey *k;
1236 
1237 	memset(new_nodes, 0, sizeof(new_nodes));
1238 	closure_init_stack(&cl);
1239 
1240 	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
1241 		keys += r[nodes++].keys;
1242 
1243 	blocks = btree_default_blocks(b->c) * 2 / 3;
1244 
1245 	if (nodes < 2 ||
1246 	    __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1))
1247 		return 0;
1248 
1249 	for (i = 0; i < nodes; i++) {
1250 		new_nodes[i] = btree_node_alloc_replacement(r[i].b, false);
1251 		if (IS_ERR_OR_NULL(new_nodes[i]))
1252 			goto out_nocoalesce;
1253 	}
1254 
1255 	for (i = nodes - 1; i > 0; --i) {
1256 		struct bset *n1 = new_nodes[i]->sets->data;
1257 		struct bset *n2 = new_nodes[i - 1]->sets->data;
1258 		struct bkey *k, *last = NULL;
1259 
1260 		keys = 0;
1261 
1262 		if (i > 1) {
1263 			for (k = n2->start;
1264 			     k < end(n2);
1265 			     k = bkey_next(k)) {
1266 				if (__set_blocks(n1, n1->keys + keys +
1267 						 bkey_u64s(k), b->c) > blocks)
1268 					break;
1269 
1270 				last = k;
1271 				keys += bkey_u64s(k);
1272 			}
1273 		} else {
1274 			/*
1275 			 * This is the last node we're keeping - the node at
1276 			 * r[0] is the one being emptied and freed. We have to
1277 			 * try to fit all of the remaining keys into this node;
1278 			 * we can't guarantee they'll always fit, due to rounding
1279 			 * and variable length keys (though that shouldn't be
1280 			 * possible in practice)
1281 			 */
1282 			if (__set_blocks(n1, n1->keys + n2->keys,
1283 					 b->c) > btree_blocks(new_nodes[i]))
1284 				goto out_nocoalesce;
1285 
1286 			keys = n2->keys;
1287 			/* Take the key of the node we're getting rid of */
1288 			last = &r->b->key;
1289 		}
1290 
1291 		BUG_ON(__set_blocks(n1, n1->keys + keys,
1292 				    b->c) > btree_blocks(new_nodes[i]));
1293 
1294 		if (last)
1295 			bkey_copy_key(&new_nodes[i]->key, last);
1296 
1297 		memcpy(end(n1),
1298 		       n2->start,
1299 		       (void *) node(n2, keys) - (void *) n2->start);
1300 
1301 		n1->keys += keys;
1302 		r[i].keys = n1->keys;
1303 
1304 		memmove(n2->start,
1305 			node(n2, keys),
1306 			(void *) end(n2) - (void *) node(n2, keys));
1307 
1308 		n2->keys -= keys;
1309 
1310 		if (bch_keylist_realloc(keylist,
1311 					KEY_PTRS(&new_nodes[i]->key), b->c))
1312 			goto out_nocoalesce;
1313 
1314 		bch_btree_node_write(new_nodes[i], &cl);
1315 		bch_keylist_add(keylist, &new_nodes[i]->key);
1316 	}
1317 
1318 	for (i = 0; i < nodes; i++) {
1319 		if (bch_keylist_realloc(keylist, KEY_PTRS(&r[i].b->key), b->c))
1320 			goto out_nocoalesce;
1321 
1322 		make_btree_freeing_key(r[i].b, keylist->top);
1323 		bch_keylist_push(keylist);
1324 	}
1325 
1326 	/* We emptied out this node */
1327 	BUG_ON(new_nodes[0]->sets->data->keys);
1328 	btree_node_free(new_nodes[0]);
1329 	rw_unlock(true, new_nodes[0]);
1330 
1331 	closure_sync(&cl);
1332 
1333 	for (i = 0; i < nodes; i++) {
1334 		btree_node_free(r[i].b);
1335 		rw_unlock(true, r[i].b);
1336 
1337 		r[i].b = new_nodes[i];
1338 	}
1339 
1340 	bch_btree_insert_node(b, op, keylist, NULL, NULL);
1341 	BUG_ON(!bch_keylist_empty(keylist));
1342 
1343 	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
1344 	r[nodes - 1].b = ERR_PTR(-EINTR);
1345 
1346 	trace_bcache_btree_gc_coalesce(nodes);
1347 	gc->nodes--;
1348 
1349 	/* Invalidated our iterator */
1350 	return -EINTR;
1351 
1352 out_nocoalesce:
1353 	closure_sync(&cl);
1354 
1355 	while ((k = bch_keylist_pop(keylist)))
1356 		if (!bkey_cmp(k, &ZERO_KEY))
1357 			atomic_dec(&b->c->prio_blocked);
1358 
1359 	for (i = 0; i < nodes; i++)
1360 		if (!IS_ERR_OR_NULL(new_nodes[i])) {
1361 			btree_node_free(new_nodes[i]);
1362 			rw_unlock(true, new_nodes[i]);
1363 		}
1364 	return 0;
1365 }
1366 
1367 static unsigned btree_gc_count_keys(struct btree *b)
1368 {
1369 	struct bkey *k;
1370 	struct btree_iter iter;
1371 	unsigned ret = 0;
1372 
1373 	for_each_key_filter(b, k, &iter, bch_ptr_bad)
1374 		ret += bkey_u64s(k);
1375 
1376 	return ret;
1377 }
1378 
1379 static int btree_gc_recurse(struct btree *b, struct btree_op *op,
1380 			    struct closure *writes, struct gc_stat *gc)
1381 {
1382 	unsigned i;
1383 	int ret = 0;
1384 	bool should_rewrite;
1385 	struct btree *n;
1386 	struct bkey *k;
1387 	struct keylist keys;
1388 	struct btree_iter iter;
1389 	struct gc_merge_info r[GC_MERGE_NODES];
1390 	struct gc_merge_info *last = r + GC_MERGE_NODES - 1;
1391 
1392 	bch_keylist_init(&keys);
1393 	bch_btree_iter_init(b, &iter, &b->c->gc_done);
1394 
1395 	for (i = 0; i < GC_MERGE_NODES; i++)
1396 		r[i].b = ERR_PTR(-EINTR);
1397 
1398 	while (1) {
1399 		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
1400 		if (k) {
1401 			r->b = bch_btree_node_get(b->c, k, b->level - 1, true);
1402 			if (IS_ERR(r->b)) {
1403 				ret = PTR_ERR(r->b);
1404 				break;
1405 			}
1406 
1407 			r->keys = btree_gc_count_keys(r->b);
1408 
1409 			ret = btree_gc_coalesce(b, op, &keys, gc, r);
1410 			if (ret)
1411 				break;
1412 		}
1413 
1414 		if (!last->b)
1415 			break;
1416 
1417 		if (!IS_ERR(last->b)) {
1418 			should_rewrite = btree_gc_mark_node(last->b, gc);
1419 			if (should_rewrite) {
1420 				n = btree_node_alloc_replacement(last->b,
1421 								 false);
1422 
1423 				if (!IS_ERR_OR_NULL(n)) {
1424 					bch_btree_node_write_sync(n);
1425 					bch_keylist_add(&keys, &n->key);
1426 
1427 					make_btree_freeing_key(last->b,
1428 							       keys.top);
1429 					bch_keylist_push(&keys);
1430 
1431 					btree_node_free(last->b);
1432 
1433 					bch_btree_insert_node(b, op, &keys,
1434 							      NULL, NULL);
1435 					BUG_ON(!bch_keylist_empty(&keys));
1436 
1437 					rw_unlock(true, last->b);
1438 					last->b = n;
1439 
1440 					/* Invalidated our iterator */
1441 					ret = -EINTR;
1442 					break;
1443 				}
1444 			}
1445 
1446 			if (last->b->level) {
1447 				ret = btree_gc_recurse(last->b, op, writes, gc);
1448 				if (ret)
1449 					break;
1450 			}
1451 
1452 			bkey_copy_key(&b->c->gc_done, &last->b->key);
1453 
1454 			/*
1455 			 * Must flush leaf nodes before gc ends, since replace
1456 			 * operations aren't journalled
1457 			 */
1458 			if (btree_node_dirty(last->b))
1459 				bch_btree_node_write(last->b, writes);
1460 			rw_unlock(true, last->b);
1461 		}
1462 
1463 		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
1464 		r->b = NULL;
1465 
1466 		if (need_resched()) {
1467 			ret = -EAGAIN;
1468 			break;
1469 		}
1470 	}
1471 
1472 	for (i = 0; i < GC_MERGE_NODES; i++)
1473 		if (!IS_ERR_OR_NULL(r[i].b)) {
1474 			if (btree_node_dirty(r[i].b))
1475 				bch_btree_node_write(r[i].b, writes);
1476 			rw_unlock(true, r[i].b);
1477 		}
1478 
1479 	bch_keylist_free(&keys);
1480 
1481 	return ret;
1482 }
1483 
1484 static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
1485 			     struct closure *writes, struct gc_stat *gc)
1486 {
1487 	struct btree *n = NULL;
1488 	int ret = 0;
1489 	bool should_rewrite;
1490 
1491 	should_rewrite = btree_gc_mark_node(b, gc);
1492 	if (should_rewrite) {
1493 		n = btree_node_alloc_replacement(b, false);
1494 
1495 		if (!IS_ERR_OR_NULL(n)) {
1496 			bch_btree_node_write_sync(n);
1497 			bch_btree_set_root(n);
1498 			btree_node_free(b);
1499 			rw_unlock(true, n);
1500 
1501 			return -EINTR;
1502 		}
1503 	}
1504 
1505 	if (b->level) {
1506 		ret = btree_gc_recurse(b, op, writes, gc);
1507 		if (ret)
1508 			return ret;
1509 	}
1510 
1511 	bkey_copy_key(&b->c->gc_done, &b->key);
1512 
1513 	return ret;
1514 }
1515 
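/*
 * Start of the gc mark phase: seed each bucket's gc_gen from its current gen
 * and mark every unpinned bucket reclaimable with zero sectors used. Walking
 * the btree (__bch_btree_mark_key()) then re-marks whatever is still
 * referenced, and bch_btree_gc_finish() folds the results back to find the
 * buckets that can actually be reused.
 */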
1516 static void btree_gc_start(struct cache_set *c)
1517 {
1518 	struct cache *ca;
1519 	struct bucket *b;
1520 	unsigned i;
1521 
1522 	if (!c->gc_mark_valid)
1523 		return;
1524 
1525 	mutex_lock(&c->bucket_lock);
1526 
1527 	c->gc_mark_valid = 0;
1528 	c->gc_done = ZERO_KEY;
1529 
1530 	for_each_cache(ca, c, i)
1531 		for_each_bucket(b, ca) {
1532 			b->gc_gen = b->gen;
1533 			if (!atomic_read(&b->pin)) {
1534 				SET_GC_MARK(b, GC_MARK_RECLAIMABLE);
1535 				SET_GC_SECTORS_USED(b, 0);
1536 			}
1537 		}
1538 
1539 	mutex_unlock(&c->bucket_lock);
1540 }
1541 
1542 size_t bch_btree_gc_finish(struct cache_set *c)
1543 {
1544 	size_t available = 0;
1545 	struct bucket *b;
1546 	struct cache *ca;
1547 	unsigned i;
1548 
1549 	mutex_lock(&c->bucket_lock);
1550 
1551 	set_gc_sectors(c);
1552 	c->gc_mark_valid = 1;
1553 	c->need_gc	= 0;
1554 
1555 	if (c->root)
1556 		for (i = 0; i < KEY_PTRS(&c->root->key); i++)
1557 			SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i),
1558 				    GC_MARK_METADATA);
1559 
1560 	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
1561 		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
1562 			    GC_MARK_METADATA);
1563 
1564 	for_each_cache(ca, c, i) {
1565 		uint64_t *i;
1566 
1567 		ca->invalidate_needs_gc = 0;
1568 
1569 		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
1570 			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1571 
1572 		for (i = ca->prio_buckets;
1573 		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
1574 			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);
1575 
1576 		for_each_bucket(b, ca) {
1577 			b->last_gc	= b->gc_gen;
1578 			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));
1579 
1580 			if (!atomic_read(&b->pin) &&
1581 			    GC_MARK(b) == GC_MARK_RECLAIMABLE) {
1582 				available++;
1583 				if (!GC_SECTORS_USED(b))
1584 					bch_bucket_add_unused(ca, b);
1585 			}
1586 		}
1587 	}
1588 
1589 	mutex_unlock(&c->bucket_lock);
1590 	return available;
1591 }
1592 
1593 static void bch_btree_gc(struct cache_set *c)
1594 {
1595 	int ret;
1596 	unsigned long available;
1597 	struct gc_stat stats;
1598 	struct closure writes;
1599 	struct btree_op op;
1600 	uint64_t start_time = local_clock();
1601 
1602 	trace_bcache_gc_start(c);
1603 
1604 	memset(&stats, 0, sizeof(struct gc_stat));
1605 	closure_init_stack(&writes);
1606 	bch_btree_op_init(&op, SHRT_MAX);
1607 
1608 	btree_gc_start(c);
1609 
1610 	do {
1611 		ret = btree_root(gc_root, c, &op, &writes, &stats);
1612 		closure_sync(&writes);
1613 
1614 		if (ret && ret != -EAGAIN)
1615 			pr_warn("gc failed!");
1616 	} while (ret);
1617 
1618 	available = bch_btree_gc_finish(c);
1619 	wake_up_allocators(c);
1620 
1621 	bch_time_stats_update(&c->btree_gc_time, start_time);
1622 
1623 	stats.key_bytes *= sizeof(uint64_t);
1624 	stats.data	<<= 9;
1625 	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
1626 	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));
1627 
1628 	trace_bcache_gc_end(c);
1629 
1630 	bch_moving_gc(c);
1631 }
1632 
1633 static int bch_gc_thread(void *arg)
1634 {
1635 	struct cache_set *c = arg;
1636 	struct cache *ca;
1637 	unsigned i;
1638 
1639 	while (1) {
1640 again:
1641 		bch_btree_gc(c);
1642 
1643 		set_current_state(TASK_INTERRUPTIBLE);
1644 		if (kthread_should_stop())
1645 			break;
1646 
1647 		mutex_lock(&c->bucket_lock);
1648 
1649 		for_each_cache(ca, c, i)
1650 			if (ca->invalidate_needs_gc) {
1651 				mutex_unlock(&c->bucket_lock);
1652 				set_current_state(TASK_RUNNING);
1653 				goto again;
1654 			}
1655 
1656 		mutex_unlock(&c->bucket_lock);
1657 
1658 		try_to_freeze();
1659 		schedule();
1660 	}
1661 
1662 	return 0;
1663 }
1664 
1665 int bch_gc_thread_start(struct cache_set *c)
1666 {
1667 	c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
1668 	if (IS_ERR(c->gc_thread))
1669 		return PTR_ERR(c->gc_thread);
1670 
1671 	set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
1672 	return 0;
1673 }
1674 
1675 /* Initial partial gc */
1676 
1677 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op,
1678 				   unsigned long **seen)
1679 {
1680 	int ret = 0;
1681 	unsigned i;
1682 	struct bkey *k, *p = NULL;
1683 	struct bucket *g;
1684 	struct btree_iter iter;
1685 
1686 	for_each_key_filter(b, k, &iter, bch_ptr_invalid) {
1687 		for (i = 0; i < KEY_PTRS(k); i++) {
1688 			if (!ptr_available(b->c, k, i))
1689 				continue;
1690 
1691 			g = PTR_BUCKET(b->c, k, i);
1692 
1693 			if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i),
1694 						seen[PTR_DEV(k, i)]) ||
1695 			    !ptr_stale(b->c, k, i)) {
1696 				g->gen = PTR_GEN(k, i);
1697 
1698 				if (b->level)
1699 					g->prio = BTREE_PRIO;
1700 				else if (g->prio == BTREE_PRIO)
1701 					g->prio = INITIAL_PRIO;
1702 			}
1703 		}
1704 
1705 		btree_mark_key(b, k);
1706 	}
1707 
1708 	if (b->level) {
1709 		bch_btree_iter_init(b, &iter, NULL);
1710 
1711 		do {
1712 			k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
1713 			if (k)
1714 				btree_node_prefetch(b->c, k, b->level - 1);
1715 
1716 			if (p)
1717 				ret = btree(check_recurse, p, b, op, seen);
1718 
1719 			p = k;
1720 		} while (p && !ret);
1721 	}
1722 
1723 	return 0;
1724 }
1725 
1726 int bch_btree_check(struct cache_set *c)
1727 {
1728 	int ret = -ENOMEM;
1729 	unsigned i;
1730 	unsigned long *seen[MAX_CACHES_PER_SET];
1731 	struct btree_op op;
1732 
1733 	memset(seen, 0, sizeof(seen));
1734 	bch_btree_op_init(&op, SHRT_MAX);
1735 
1736 	for (i = 0; c->cache[i]; i++) {
1737 		size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8);
1738 		seen[i] = kmalloc(n, GFP_KERNEL);
1739 		if (!seen[i])
1740 			goto err;
1741 
1742 		/* All bits set: disables the seen check until prio_read() uses it too */
1743 		memset(seen[i], 0xFF, n);
1744 	}
1745 
1746 	ret = btree_root(check_recurse, c, &op, seen);
1747 err:
1748 	for (i = 0; i < MAX_CACHES_PER_SET; i++)
1749 		kfree(seen[i]);
1750 	return ret;
1751 }
1752 
1753 /* Btree insertion */
1754 
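/*
 * Make room for 'insert' at position 'where' in the unwritten bset by moving
 * the keys after it up by bkey_u64s(insert) u64s, then copy the key in and
 * fix up the auxiliary lookup table.
 */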
1755 static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert)
1756 {
1757 	struct bset *i = b->sets[b->nsets].data;
1758 
1759 	memmove((uint64_t *) where + bkey_u64s(insert),
1760 		where,
1761 		(void *) end(i) - (void *) where);
1762 
1763 	i->keys += bkey_u64s(insert);
1764 	bkey_copy(where, insert);
1765 	bch_bset_fix_lookup_table(b, where);
1766 }
1767 
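/*
 * A worked example of the overlap handling below (sector ranges written as
 * [start, end)): with an existing extent k = [16, 48), inserting i = [24, 32)
 * lands in the middle of k, so k has to be split - k is cut back to [16, 24)
 * and a new key is inserted to cover [32, 48). Inserting i = [8, 24) instead
 * only overlaps the front of k, so k is cut to [24, 48); inserting
 * i = [40, 56) overlaps the back, so k is cut back to [16, 40).
 */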
1768 static bool fix_overlapping_extents(struct btree *b, struct bkey *insert,
1769 				    struct btree_iter *iter,
1770 				    struct bkey *replace_key)
1771 {
1772 	void subtract_dirty(struct bkey *k, uint64_t offset, int sectors)
1773 	{
1774 		if (KEY_DIRTY(k))
1775 			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1776 						     offset, -sectors);
1777 	}
1778 
1779 	uint64_t old_offset;
1780 	unsigned old_size, sectors_found = 0;
1781 
1782 	while (1) {
1783 		struct bkey *k = bch_btree_iter_next(iter);
1784 		if (!k ||
1785 		    bkey_cmp(&START_KEY(k), insert) >= 0)
1786 			break;
1787 
1788 		if (bkey_cmp(k, &START_KEY(insert)) <= 0)
1789 			continue;
1790 
1791 		old_offset = KEY_START(k);
1792 		old_size = KEY_SIZE(k);
1793 
1794 		/*
1795 		 * We might overlap with 0 size extents; we can't skip these
1796 		 * because if they're in the set we're inserting to we have to
1797 		 * adjust them so they don't overlap with the key we're
1798 		 * inserting. But we don't want to check them for replace
1799 		 * operations.
1800 		 */
1801 
1802 		if (replace_key && KEY_SIZE(k)) {
1803 			/*
1804 			 * k might have been split since we inserted/found the
1805 			 * key we're replacing
1806 			 */
1807 			unsigned i;
1808 			uint64_t offset = KEY_START(k) -
1809 				KEY_START(replace_key);
1810 
1811 			/* But it must be a subset of the replace key */
1812 			if (KEY_START(k) < KEY_START(replace_key) ||
1813 			    KEY_OFFSET(k) > KEY_OFFSET(replace_key))
1814 				goto check_failed;
1815 
1816 			/* We didn't find a key that we were supposed to */
1817 			if (KEY_START(k) > KEY_START(insert) + sectors_found)
1818 				goto check_failed;
1819 
1820 			if (KEY_PTRS(replace_key) != KEY_PTRS(k))
1821 				goto check_failed;
1822 
1823 			/* skip past gen */
1824 			offset <<= 8;
1825 
1826 			BUG_ON(!KEY_PTRS(replace_key));
1827 
1828 			for (i = 0; i < KEY_PTRS(replace_key); i++)
1829 				if (k->ptr[i] != replace_key->ptr[i] + offset)
1830 					goto check_failed;
1831 
1832 			sectors_found = KEY_OFFSET(k) - KEY_START(insert);
1833 		}
1834 
1835 		if (bkey_cmp(insert, k) < 0 &&
1836 		    bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) {
1837 			/*
1838 			 * We overlapped in the middle of an existing key: that
1839 			 * means we have to split the old key. But we have to do
1840 			 * slightly different things depending on whether the
1841 			 * old key has been written out yet.
1842 			 */
1843 
1844 			struct bkey *top;
1845 
1846 			subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert));
1847 
1848 			if (bkey_written(b, k)) {
1849 				/*
1850 				 * We insert a new key to cover the top of the
1851 				 * old key, and the old key is modified in place
1852 				 * to represent the bottom split.
1853 				 *
1854 				 * It's completely arbitrary whether the new key
1855 				 * is the top or the bottom, but it has to match
1856 				 * up with what btree_sort_fixup() does - it
1857 				 * doesn't check for this kind of overlap, it
1858 				 * depends on us inserting a new key for the top
1859 				 * here.
1860 				 */
1861 				top = bch_bset_search(b, &b->sets[b->nsets],
1862 						      insert);
1863 				shift_keys(b, top, k);
1864 			} else {
1865 				BKEY_PADDED(key) temp;
1866 				bkey_copy(&temp.key, k);
1867 				shift_keys(b, k, &temp.key);
1868 				top = bkey_next(k);
1869 			}
1870 
1871 			bch_cut_front(insert, top);
1872 			bch_cut_back(&START_KEY(insert), k);
1873 			bch_bset_fix_invalidated_key(b, k);
1874 			return false;
1875 		}
1876 
1877 		if (bkey_cmp(insert, k) < 0) {
1878 			bch_cut_front(insert, k);
1879 		} else {
1880 			if (bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0)
1881 				old_offset = KEY_START(insert);
1882 
1883 			if (bkey_written(b, k) &&
1884 			    bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) {
1885 				/*
1886 				 * Completely overwrote, so we don't have to
1887 				 * invalidate the binary search tree
1888 				 */
1889 				bch_cut_front(k, k);
1890 			} else {
1891 				__bch_cut_back(&START_KEY(insert), k);
1892 				bch_bset_fix_invalidated_key(b, k);
1893 			}
1894 		}
1895 
1896 		subtract_dirty(k, old_offset, old_size - KEY_SIZE(k));
1897 	}
1898 
1899 check_failed:
1900 	if (replace_key) {
1901 		if (!sectors_found) {
1902 			return true;
1903 		} else if (sectors_found < KEY_SIZE(insert)) {
1904 			SET_KEY_OFFSET(insert, KEY_OFFSET(insert) -
1905 				       (KEY_SIZE(insert) - sectors_found));
1906 			SET_KEY_SIZE(insert, sectors_found);
1907 		}
1908 	}
1909 
1910 	return false;
1911 }
1912 
1913 static bool btree_insert_key(struct btree *b, struct btree_op *op,
1914 			     struct bkey *k, struct bkey *replace_key)
1915 {
1916 	struct bset *i = b->sets[b->nsets].data;
1917 	struct bkey *m, *prev;
1918 	unsigned status = BTREE_INSERT_STATUS_INSERT;
1919 
1920 	BUG_ON(bkey_cmp(k, &b->key) > 0);
1921 	BUG_ON(b->level && !KEY_PTRS(k));
1922 	BUG_ON(!b->level && !KEY_OFFSET(k));
1923 
1924 	if (!b->level) {
1925 		struct btree_iter iter;
1926 
1927 		/*
1928 		 * bset_search() returns the first key that is strictly greater
1929 		 * than the search key - but for back merging, we want to find
1930 		 * the previous key.
1931 		 */
1932 		prev = NULL;
1933 		m = bch_btree_iter_init(b, &iter, PRECEDING_KEY(&START_KEY(k)));
1934 
1935 		if (fix_overlapping_extents(b, k, &iter, replace_key)) {
1936 			op->insert_collision = true;
1937 			return false;
1938 		}
1939 
1940 		if (KEY_DIRTY(k))
1941 			bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k),
1942 						     KEY_START(k), KEY_SIZE(k));
1943 
1944 		while (m != end(i) &&
1945 		       bkey_cmp(k, &START_KEY(m)) > 0)
1946 			prev = m, m = bkey_next(m);
1947 
1948 		if (key_merging_disabled(b->c))
1949 			goto insert;
1950 
1951 		/* prev is in the tree, if we merge we're done */
1952 		status = BTREE_INSERT_STATUS_BACK_MERGE;
1953 		if (prev &&
1954 		    bch_bkey_try_merge(b, prev, k))
1955 			goto merged;
1956 
1957 		status = BTREE_INSERT_STATUS_OVERWROTE;
1958 		if (m != end(i) &&
1959 		    KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m))
1960 			goto copy;
1961 
1962 		status = BTREE_INSERT_STATUS_FRONT_MERGE;
1963 		if (m != end(i) &&
1964 		    bch_bkey_try_merge(b, k, m))
1965 			goto copy;
1966 	} else {
1967 		BUG_ON(replace_key);
1968 		m = bch_bset_search(b, &b->sets[b->nsets], k);
1969 	}
1970 
1971 insert:	shift_keys(b, m, k);
1972 copy:	bkey_copy(m, k);
1973 merged:
1974 	bch_check_keys(b, "%u for %s", status,
1975 		       replace_key ? "replace" : "insert");
1976 
1977 	if (b->level && !KEY_OFFSET(k))
1978 		btree_current_write(b)->prio_blocked++;
1979 
1980 	trace_bcache_btree_insert_key(b, k, replace_key != NULL, status);
1981 
1982 	return true;
1983 }
1984 
1985 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
1986 				  struct keylist *insert_keys,
1987 				  struct bkey *replace_key)
1988 {
1989 	bool ret = false;
1990 	int oldsize = bch_count_data(b);
1991 
1992 	while (!bch_keylist_empty(insert_keys)) {
1993 		struct bset *i = write_block(b);
1994 		struct bkey *k = insert_keys->keys;
1995 
1996 		if (b->written + __set_blocks(i, i->keys + bkey_u64s(k), b->c)
1997 		    > btree_blocks(b))
1998 			break;
1999 
2000 		if (bkey_cmp(k, &b->key) <= 0) {
2001 			if (!b->level)
2002 				bkey_put(b->c, k);
2003 
2004 			ret |= btree_insert_key(b, op, k, replace_key);
2005 			bch_keylist_pop_front(insert_keys);
2006 		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
2007 			BKEY_PADDED(key) temp;
2008 			bkey_copy(&temp.key, insert_keys->keys);
2009 
2010 			bch_cut_back(&b->key, &temp.key);
2011 			bch_cut_front(&b->key, insert_keys->keys);
2012 
2013 			ret |= btree_insert_key(b, op, &temp.key, replace_key);
2014 			break;
2015 		} else {
2016 			break;
2017 		}
2018 	}
2019 
2020 	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);
2021 
2022 	BUG_ON(bch_count_data(b) < oldsize);
2023 	return ret;
2024 }
2025 
2026 static int btree_split(struct btree *b, struct btree_op *op,
2027 		       struct keylist *insert_keys,
2028 		       struct bkey *replace_key)
2029 {
2030 	bool split;
2031 	struct btree *n1, *n2 = NULL, *n3 = NULL;
2032 	uint64_t start_time = local_clock();
2033 	struct closure cl;
2034 	struct keylist parent_keys;
2035 
2036 	closure_init_stack(&cl);
2037 	bch_keylist_init(&parent_keys);
2038 
2039 	n1 = btree_node_alloc_replacement(b, true);
2040 	if (IS_ERR(n1))
2041 		goto err;
2042 
2043 	split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5;
2044 
2045 	if (split) {
2046 		unsigned keys = 0;
2047 
2048 		trace_bcache_btree_node_split(b, n1->sets[0].data->keys);
2049 
2050 		n2 = bch_btree_node_alloc(b->c, b->level, true);
2051 		if (IS_ERR(n2))
2052 			goto err_free1;
2053 
2054 		if (!b->parent) {
2055 			n3 = bch_btree_node_alloc(b->c, b->level + 1, true);
2056 			if (IS_ERR(n3))
2057 				goto err_free2;
2058 		}
2059 
2060 		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2061 
2062 		/*
2063 		 * Has to be a linear search because we don't have an auxiliary
2064 		 * search tree yet
2065 		 */
2066 
2067 		while (keys < (n1->sets[0].data->keys * 3) / 5)
2068 			keys += bkey_u64s(node(n1->sets[0].data, keys));
2069 
2070 		bkey_copy_key(&n1->key, node(n1->sets[0].data, keys));
2071 		keys += bkey_u64s(node(n1->sets[0].data, keys));
2072 
2073 		n2->sets[0].data->keys = n1->sets[0].data->keys - keys;
2074 		n1->sets[0].data->keys = keys;
2075 
2076 		memcpy(n2->sets[0].data->start,
2077 		       end(n1->sets[0].data),
2078 		       n2->sets[0].data->keys * sizeof(uint64_t));
2079 
2080 		bkey_copy_key(&n2->key, &b->key);
2081 
2082 		bch_keylist_add(&parent_keys, &n2->key);
2083 		bch_btree_node_write(n2, &cl);
2084 		rw_unlock(true, n2);
2085 	} else {
2086 		trace_bcache_btree_node_compact(b, n1->sets[0].data->keys);
2087 
2088 		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
2089 	}
2090 
2091 	bch_keylist_add(&parent_keys, &n1->key);
2092 	bch_btree_node_write(n1, &cl);
2093 
2094 	if (n3) {
2095 		/* Depth increases, make a new root */
2096 		bkey_copy_key(&n3->key, &MAX_KEY);
2097 		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
2098 		bch_btree_node_write(n3, &cl);
2099 
2100 		closure_sync(&cl);
2101 		bch_btree_set_root(n3);
2102 		rw_unlock(true, n3);
2103 
2104 		btree_node_free(b);
2105 	} else if (!b->parent) {
2106 		/* Root filled up but didn't need to be split */
2107 		closure_sync(&cl);
2108 		bch_btree_set_root(n1);
2109 
2110 		btree_node_free(b);
2111 	} else {
2112 		/* Split a non-root node */
2113 		closure_sync(&cl);
2114 		make_btree_freeing_key(b, parent_keys.top);
2115 		bch_keylist_push(&parent_keys);
2116 
2117 		btree_node_free(b);
2118 
2119 		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
2120 		BUG_ON(!bch_keylist_empty(&parent_keys));
2121 	}
2122 
2123 	rw_unlock(true, n1);
2124 
2125 	bch_time_stats_update(&b->c->btree_split_time, start_time);
2126 
2127 	return 0;
2128 err_free2:
2129 	btree_node_free(n2);
2130 	rw_unlock(true, n2);
2131 err_free1:
2132 	btree_node_free(n1);
2133 	rw_unlock(true, n1);
2134 err:
2135 	if (n3 == ERR_PTR(-EAGAIN) ||
2136 	    n2 == ERR_PTR(-EAGAIN) ||
2137 	    n1 == ERR_PTR(-EAGAIN))
2138 		return -EAGAIN;
2139 
2140 	pr_warn("couldn't split");
2141 	return -ENOMEM;
2142 }
2143 
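/*
 * Insert keys into a single btree node, splitting it first if it is too full.
 * Returns 0 on success; -EAGAIN if a split is needed but we are running under
 * generic_make_request() (current->bio_list is set) and must not block; or
 * -EINTR when the caller has to retake locks and retry, either because a
 * bigger lock is needed for the split or because the split invalidated its
 * traversal.
 */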
2144 static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
2145 				 struct keylist *insert_keys,
2146 				 atomic_t *journal_ref,
2147 				 struct bkey *replace_key)
2148 {
2149 	BUG_ON(b->level && replace_key);
2150 
2151 	if (should_split(b)) {
2152 		if (current->bio_list) {
2153 			op->lock = b->c->root->level + 1;
2154 			return -EAGAIN;
2155 		} else if (op->lock <= b->c->root->level) {
2156 			op->lock = b->c->root->level + 1;
2157 			return -EINTR;
2158 		} else {
2159 			/* Invalidated all iterators */
2160 			return btree_split(b, op, insert_keys, replace_key) ?:
2161 				-EINTR;
2162 		}
2163 	} else {
2164 		BUG_ON(write_block(b) != b->sets[b->nsets].data);
2165 
2166 		if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
2167 			if (!b->level)
2168 				bch_btree_leaf_dirty(b, journal_ref);
2169 			else
2170 				bch_btree_node_write_sync(b);
2171 		}
2172 
2173 		return 0;
2174 	}
2175 }
2176 
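/*
 * Insert a placeholder "check" key into b: a single pointer filled with
 * random bytes and pointing at the sentinel device PTR_CHECK_DEV. If the
 * caller only held a read lock (op->lock == -1) it is upgraded to a write
 * lock first, and we bail out with -EINTR if the node was modified or
 * replaced in the meantime; the lock is downgraded again before returning.
 */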
2177 int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
2178 			       struct bkey *check_key)
2179 {
2180 	int ret = -EINTR;
2181 	uint64_t btree_ptr = b->key.ptr[0];
2182 	unsigned long seq = b->seq;
2183 	struct keylist insert;
2184 	bool upgrade = op->lock == -1;
2185 
2186 	bch_keylist_init(&insert);
2187 
2188 	if (upgrade) {
2189 		rw_unlock(false, b);
2190 		rw_lock(true, b, b->level);
2191 
2192 		if (b->key.ptr[0] != btree_ptr ||
2193 		    b->seq != seq + 1)
2194 			goto out;
2195 	}
2196 
2197 	SET_KEY_PTRS(check_key, 1);
2198 	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));
2199 
2200 	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);
2201 
2202 	bch_keylist_add(&insert, check_key);
2203 
2204 	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);
2205 
2206 	BUG_ON(!ret && !bch_keylist_empty(&insert));
2207 out:
2208 	if (upgrade)
2209 		downgrade_write(&b->lock);
2210 	return ret;
2211 }
2212 
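/*
 * Glue for driving insertion through bch_btree_map_leaf_nodes(): the op wraps
 * the keylist, journal ref and optional replace key, and btree_insert_fn()
 * inserts into each leaf node it is mapped over, returning MAP_DONE once it
 * has inserted what it can (bch_btree_insert() below restarts the traversal
 * while keys remain).
 */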
2213 struct btree_insert_op {
2214 	struct btree_op	op;
2215 	struct keylist	*keys;
2216 	atomic_t	*journal_ref;
2217 	struct bkey	*replace_key;
2218 };
2219 
2220 static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
2221 {
2222 	struct btree_insert_op *op = container_of(b_op,
2223 					struct btree_insert_op, op);
2224 
2225 	int ret = bch_btree_insert_node(b, &op->op, op->keys,
2226 					op->journal_ref, op->replace_key);
2227 	if (ret && !bch_keylist_empty(op->keys))
2228 		return ret;
2229 	else
2230 		return MAP_DONE;
2231 }
2232 
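/*
 * Top level insert: map over the leaf nodes covering the keys on the keylist
 * and insert them, restarting the traversal until the list is drained. On
 * error the remaining keys are dropped with bkey_put(); a collision while
 * replacing is reported as -ESRCH.
 *
 * Illustrative sketch of a caller (not taken from this file, error handling
 * omitted):
 *
 *	struct keylist keys;
 *
 *	bch_keylist_init(&keys);
 *	... append fully built up keys with bch_keylist_add() ...
 *	ret = bch_btree_insert(c, &keys, NULL, NULL);
 *
 * A NULL journal_ref means the insert does not pin a journal entry, and a
 * NULL replace_key means a plain insert rather than a compare-and-replace.
 */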
2233 int bch_btree_insert(struct cache_set *c, struct keylist *keys,
2234 		     atomic_t *journal_ref, struct bkey *replace_key)
2235 {
2236 	struct btree_insert_op op;
2237 	int ret = 0;
2238 
2239 	BUG_ON(current->bio_list);
2240 	BUG_ON(bch_keylist_empty(keys));
2241 
2242 	bch_btree_op_init(&op.op, 0);
2243 	op.keys		= keys;
2244 	op.journal_ref	= journal_ref;
2245 	op.replace_key	= replace_key;
2246 
2247 	while (!ret && !bch_keylist_empty(keys)) {
2248 		op.op.lock = 0;
2249 		ret = bch_btree_map_leaf_nodes(&op.op, c,
2250 					       &START_KEY(keys->keys),
2251 					       btree_insert_fn);
2252 	}
2253 
2254 	if (ret) {
2255 		struct bkey *k;
2256 
2257 		pr_err("error %i", ret);
2258 
2259 		while ((k = bch_keylist_pop(keys)))
2260 			bkey_put(c, k);
2261 	} else if (op.op.insert_collision)
2262 		ret = -ESRCH;
2263 
2264 	return ret;
2265 }
2266 
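/*
 * Make b the new root of the cache set's btree. The node must already have
 * been written and its buckets must have btree priority; it is removed from
 * the btree cache's list, and the change is made durable by synchronously
 * writing a journal meta entry.
 */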
2267 void bch_btree_set_root(struct btree *b)
2268 {
2269 	unsigned i;
2270 	struct closure cl;
2271 
2272 	closure_init_stack(&cl);
2273 
2274 	trace_bcache_btree_set_root(b);
2275 
2276 	BUG_ON(!b->written);
2277 
2278 	for (i = 0; i < KEY_PTRS(&b->key); i++)
2279 		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);
2280 
2281 	mutex_lock(&b->c->bucket_lock);
2282 	list_del_init(&b->list);
2283 	mutex_unlock(&b->c->bucket_lock);
2284 
2285 	b->c->root = b;
2286 
2287 	bch_journal_meta(b->c, &cl);
2288 	closure_sync(&cl);
2289 }
2290 
2291 /* Map across nodes or keys */
2292 
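/*
 * Depth-first walk of the btree over keys >= from, calling fn on each node
 * visited. Leaves are always visited; interior nodes are passed to fn only
 * when flags is MAP_ALL_NODES, and then only after their children
 * (post-order).
 */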
2293 static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
2294 				       struct bkey *from,
2295 				       btree_map_nodes_fn *fn, int flags)
2296 {
2297 	int ret = MAP_CONTINUE;
2298 
2299 	if (b->level) {
2300 		struct bkey *k;
2301 		struct btree_iter iter;
2302 
2303 		bch_btree_iter_init(b, &iter, from);
2304 
2305 		while ((k = bch_btree_iter_next_filter(&iter, b,
2306 						       bch_ptr_bad))) {
2307 			ret = btree(map_nodes_recurse, k, b,
2308 				    op, from, fn, flags);
2309 			from = NULL;
2310 
2311 			if (ret != MAP_CONTINUE)
2312 				return ret;
2313 		}
2314 	}
2315 
2316 	if (!b->level || flags == MAP_ALL_NODES)
2317 		ret = fn(op, b);
2318 
2319 	return ret;
2320 }
2321 
2322 int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
2323 			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
2324 {
2325 	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
2326 }
2327 
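/*
 * Walk every key >= from, passing leaf keys to fn and recursing through
 * interior nodes. With MAP_END_KEY set, fn is additionally called with a
 * zero-size key at the end of each leaf (see the KEY_SIZE() check in
 * refill_keybuf_fn() below).
 */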
2328 static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
2329 				      struct bkey *from, btree_map_keys_fn *fn,
2330 				      int flags)
2331 {
2332 	int ret = MAP_CONTINUE;
2333 	struct bkey *k;
2334 	struct btree_iter iter;
2335 
2336 	bch_btree_iter_init(b, &iter, from);
2337 
2338 	while ((k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad))) {
2339 		ret = !b->level
2340 			? fn(op, b, k)
2341 			: btree(map_keys_recurse, k, b, op, from, fn, flags);
2342 		from = NULL;
2343 
2344 		if (ret != MAP_CONTINUE)
2345 			return ret;
2346 	}
2347 
2348 	if (!b->level && (flags & MAP_END_KEY))
2349 		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
2350 				     KEY_OFFSET(&b->key), 0));
2351 
2352 	return ret;
2353 }
2354 
2355 int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
2356 		       struct bkey *from, btree_map_keys_fn *fn, int flags)
2357 {
2358 	return btree_root(map_keys_recurse, c, op, from, fn, flags);
2359 }
2360 
2361 /* Keybuf code */
2362 
2363 static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
2364 {
2365 	/* Overlapping keys compare equal */
2366 	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
2367 		return -1;
2368 	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
2369 		return 1;
2370 	return 0;
2371 }
2372 
2373 static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
2374 					    struct keybuf_key *r)
2375 {
2376 	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
2377 }
2378 
2379 struct refill {
2380 	struct btree_op	op;
2381 	unsigned	nr_found;
2382 	struct keybuf	*buf;
2383 	struct bkey	*end;
2384 	keybuf_pred_fn	*pred;
2385 };
2386 
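/*
 * bch_btree_map_keys() callback for bch_refill_keybuf(): keys matching the
 * predicate are copied into the keybuf's rbtree until either the end of the
 * range is reached or the buffer's freelist runs out, with progress tracked
 * in buf->last_scanned.
 */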
2387 static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
2388 			    struct bkey *k)
2389 {
2390 	struct refill *refill = container_of(op, struct refill, op);
2391 	struct keybuf *buf = refill->buf;
2392 	int ret = MAP_CONTINUE;
2393 
2394 	if (bkey_cmp(k, refill->end) >= 0) {
2395 		ret = MAP_DONE;
2396 		goto out;
2397 	}
2398 
2399 	if (!KEY_SIZE(k)) /* end key */
2400 		goto out;
2401 
2402 	if (refill->pred(buf, k)) {
2403 		struct keybuf_key *w;
2404 
2405 		spin_lock(&buf->lock);
2406 
2407 		w = array_alloc(&buf->freelist);
2408 		if (!w) {
2409 			spin_unlock(&buf->lock);
2410 			return MAP_DONE;
2411 		}
2412 
2413 		w->private = NULL;
2414 		bkey_copy(&w->key, k);
2415 
2416 		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
2417 			array_free(&buf->freelist, w);
2418 		else
2419 			refill->nr_found++;
2420 
2421 		if (array_freelist_empty(&buf->freelist))
2422 			ret = MAP_DONE;
2423 
2424 		spin_unlock(&buf->lock);
2425 	}
2426 out:
2427 	buf->last_scanned = *k;
2428 	return ret;
2429 }
2430 
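/*
 * Scan the btree from buf->last_scanned towards end, filling buf with keys
 * matching pred, then record the range currently held in buf->start and
 * buf->end (MAX_KEY for both when the buffer is empty) so that
 * bch_keybuf_check_overlapping() can cheaply reject non-overlapping ranges.
 */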
2431 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
2432 		       struct bkey *end, keybuf_pred_fn *pred)
2433 {
2434 	struct bkey start = buf->last_scanned;
2435 	struct refill refill;
2436 
2437 	cond_resched();
2438 
2439 	bch_btree_op_init(&refill.op, -1);
2440 	refill.nr_found	= 0;
2441 	refill.buf	= buf;
2442 	refill.end	= end;
2443 	refill.pred	= pred;
2444 
2445 	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
2446 			   refill_keybuf_fn, MAP_END_KEY);
2447 
2448 	trace_bcache_keyscan(refill.nr_found,
2449 			     KEY_INODE(&start), KEY_OFFSET(&start),
2450 			     KEY_INODE(&buf->last_scanned),
2451 			     KEY_OFFSET(&buf->last_scanned));
2452 
2453 	spin_lock(&buf->lock);
2454 
2455 	if (!RB_EMPTY_ROOT(&buf->keys)) {
2456 		struct keybuf_key *w;
2457 		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2458 		buf->start	= START_KEY(&w->key);
2459 
2460 		w = RB_LAST(&buf->keys, struct keybuf_key, node);
2461 		buf->end	= w->key;
2462 	} else {
2463 		buf->start	= MAX_KEY;
2464 		buf->end	= MAX_KEY;
2465 	}
2466 
2467 	spin_unlock(&buf->lock);
2468 }
2469 
2470 static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2471 {
2472 	rb_erase(&w->node, &buf->keys);
2473 	array_free(&buf->freelist, w);
2474 }
2475 
2476 void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
2477 {
2478 	spin_lock(&buf->lock);
2479 	__bch_keybuf_del(buf, w);
2480 	spin_unlock(&buf->lock);
2481 }
2482 
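/*
 * Check buf for keys overlapping [start, end): overlapping keys that are not
 * in use are dropped from the buffer; returns true if any overlapping key had
 * ->private set, i.e. someone is still working on it.
 */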
2483 bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
2484 				  struct bkey *end)
2485 {
2486 	bool ret = false;
2487 	struct keybuf_key *p, *w, s;
2488 	s.key = *start;
2489 
2490 	if (bkey_cmp(end, &buf->start) <= 0 ||
2491 	    bkey_cmp(start, &buf->end) >= 0)
2492 		return false;
2493 
2494 	spin_lock(&buf->lock);
2495 	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);
2496 
2497 	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
2498 		p = w;
2499 		w = RB_NEXT(w, node);
2500 
2501 		if (p->private)
2502 			ret = true;
2503 		else
2504 			__bch_keybuf_del(buf, p);
2505 	}
2506 
2507 	spin_unlock(&buf->lock);
2508 	return ret;
2509 }
2510 
2511 struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
2512 {
2513 	struct keybuf_key *w;
2514 	spin_lock(&buf->lock);
2515 
2516 	w = RB_FIRST(&buf->keys, struct keybuf_key, node);
2517 
2518 	while (w && w->private)
2519 		w = RB_NEXT(w, node);
2520 
2521 	if (w)
2522 		w->private = ERR_PTR(-EINTR);
2523 
2524 	spin_unlock(&buf->lock);
2525 	return w;
2526 }
2527 
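/*
 * Like bch_keybuf_next(), but refills buf from the btree whenever it runs
 * dry, until either a key is returned or buf->last_scanned has passed end.
 */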
2528 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
2529 					  struct keybuf *buf,
2530 					  struct bkey *end,
2531 					  keybuf_pred_fn *pred)
2532 {
2533 	struct keybuf_key *ret;
2534 
2535 	while (1) {
2536 		ret = bch_keybuf_next(buf);
2537 		if (ret)
2538 			break;
2539 
2540 		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
2541 			pr_debug("scan finished");
2542 			break;
2543 		}
2544 
2545 		bch_refill_keybuf(c, buf, end, pred);
2546 	}
2547 
2548 	return ret;
2549 }
2550 
2551 void bch_keybuf_init(struct keybuf *buf)
2552 {
2553 	buf->last_scanned	= MAX_KEY;
2554 	buf->keys		= RB_ROOT;
2555 
2556 	spin_lock_init(&buf->lock);
2557 	array_allocator_init(&buf->freelist);
2558 }
2559 
2560 void bch_btree_exit(void)
2561 {
2562 	if (btree_io_wq)
2563 		destroy_workqueue(btree_io_wq);
2564 }
2565 
2566 int __init bch_btree_init(void)
2567 {
2568 	btree_io_wq = create_singlethread_workqueue("bch_btree_io");
2569 	if (!btree_io_wq)
2570 		return -ENOMEM;
2571 
2572 	return 0;
2573 }
2574