xref: /openbmc/linux/drivers/md/bcache/btree.h (revision cbb751c060fe61140e3f23dc7cd95190bba4c89e)
1b2441318SGreg Kroah-Hartman /* SPDX-License-Identifier: GPL-2.0 */
2cafe5635SKent Overstreet #ifndef _BCACHE_BTREE_H
3cafe5635SKent Overstreet #define _BCACHE_BTREE_H
4cafe5635SKent Overstreet 
5cafe5635SKent Overstreet /*
6cafe5635SKent Overstreet  * THE BTREE:
7cafe5635SKent Overstreet  *
8cafe5635SKent Overstreet  * At a high level, bcache's btree is relatively standard b+ tree. All keys and
9cafe5635SKent Overstreet  * pointers are in the leaves; interior nodes only have pointers to the child
10cafe5635SKent Overstreet  * nodes.
11cafe5635SKent Overstreet  *
12cafe5635SKent Overstreet  * In the interior nodes, a struct bkey always points to a child btree node, and
13cafe5635SKent Overstreet  * the key is the highest key in the child node - except that the highest key in
14cafe5635SKent Overstreet  * an interior node is always MAX_KEY. The size field refers to the size on disk
15cafe5635SKent Overstreet  * of the child node - this would allow us to have variable sized btree nodes
16cafe5635SKent Overstreet  * (handy for keeping the depth of the btree 1 by expanding just the root).
17cafe5635SKent Overstreet  *
18cafe5635SKent Overstreet  * Btree nodes are themselves log structured, but this is hidden fairly
19cafe5635SKent Overstreet  * thoroughly. Btree nodes on disk will in practice have extents that overlap
20cafe5635SKent Overstreet  * (because they were written at different times), but in memory we never have
21cafe5635SKent Overstreet  * overlapping extents - when we read in a btree node from disk, the first thing
22cafe5635SKent Overstreet  * we do is resort all the sets of keys with a mergesort, and in the same pass
23cafe5635SKent Overstreet  * we check for overlapping extents and adjust them appropriately.
24cafe5635SKent Overstreet  *
25cafe5635SKent Overstreet  * struct btree_op is a central interface to the btree code. It's used for
26cafe5635SKent Overstreet  * specifying read vs. write locking, and the embedded closure is used for
27cafe5635SKent Overstreet  * waiting on IO or reserve memory.
28cafe5635SKent Overstreet  *
29cafe5635SKent Overstreet  * BTREE CACHE:
30cafe5635SKent Overstreet  *
31cafe5635SKent Overstreet  * Btree nodes are cached in memory; traversing the btree might require reading
32cafe5635SKent Overstreet  * in btree nodes which is handled mostly transparently.
33cafe5635SKent Overstreet  *
34cafe5635SKent Overstreet  * bch_btree_node_get() looks up a btree node in the cache and reads it in from
35cafe5635SKent Overstreet  * disk if necessary. This function is almost never called directly though - the
36cafe5635SKent Overstreet  * btree() macro is used to get a btree node, call some function on it, and
37cafe5635SKent Overstreet  * unlock the node after the function returns.
38cafe5635SKent Overstreet  *
39cafe5635SKent Overstreet  * The root is special cased - it's taken out of the cache's lru (thus pinning
40cafe5635SKent Overstreet  * it in memory), so we can find the root of the btree by just dereferencing a
41cafe5635SKent Overstreet  * pointer instead of looking it up in the cache. This makes locking a bit
42cafe5635SKent Overstreet  * tricky, since the root pointer is protected by the lock in the btree node it
43cafe5635SKent Overstreet  * points to - the btree_root() macro handles this.
44cafe5635SKent Overstreet  *
45cafe5635SKent Overstreet  * In various places we must be able to allocate memory for multiple btree nodes
46cafe5635SKent Overstreet  * in order to make forward progress. To do this we use the btree cache itself
47cafe5635SKent Overstreet  * as a reserve; if __get_free_pages() fails, we'll find a node in the btree
48cafe5635SKent Overstreet  * cache we can reuse. We can't allow more than one thread to be doing this at a
49cafe5635SKent Overstreet  * time, so there's a lock, implemented by a pointer to the btree_op closure -
50cafe5635SKent Overstreet  * this allows the btree_root() macro to implicitly release this lock.
51cafe5635SKent Overstreet  *
52cafe5635SKent Overstreet  * BTREE IO:
53cafe5635SKent Overstreet  *
54cafe5635SKent Overstreet  * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles
55cafe5635SKent Overstreet  * this.
56cafe5635SKent Overstreet  *
57cafe5635SKent Overstreet  * For writing, we have two btree_write structs embeddded in struct btree - one
58cafe5635SKent Overstreet  * write in flight, and one being set up, and we toggle between them.
59cafe5635SKent Overstreet  *
60cafe5635SKent Overstreet  * Writing is done with a single function -  bch_btree_write() really serves two
61cafe5635SKent Overstreet  * different purposes and should be broken up into two different functions. When
62cafe5635SKent Overstreet  * passing now = false, it merely indicates that the node is now dirty - calling
63cafe5635SKent Overstreet  * it ensures that the dirty keys will be written at some point in the future.
64cafe5635SKent Overstreet  *
65cafe5635SKent Overstreet  * When passing now = true, bch_btree_write() causes a write to happen
66cafe5635SKent Overstreet  * "immediately" (if there was already a write in flight, it'll cause the write
67cafe5635SKent Overstreet  * to happen as soon as the previous write completes). It returns immediately
68cafe5635SKent Overstreet  * though - but it takes a refcount on the closure in struct btree_op you passed
69cafe5635SKent Overstreet  * to it, so a closure_sync() later can be used to wait for the write to
70cafe5635SKent Overstreet  * complete.
71cafe5635SKent Overstreet  *
72cafe5635SKent Overstreet  * This is handy because btree_split() and garbage collection can issue writes
73cafe5635SKent Overstreet  * in parallel, reducing the amount of time they have to hold write locks.
74cafe5635SKent Overstreet  *
75cafe5635SKent Overstreet  * LOCKING:
76cafe5635SKent Overstreet  *
77cafe5635SKent Overstreet  * When traversing the btree, we may need write locks starting at some level -
78cafe5635SKent Overstreet  * inserting a key into the btree will typically only require a write lock on
79cafe5635SKent Overstreet  * the leaf node.
80cafe5635SKent Overstreet  *
81cafe5635SKent Overstreet  * This is specified with the lock field in struct btree_op; lock = 0 means we
82cafe5635SKent Overstreet  * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get()
83cafe5635SKent Overstreet  * checks this field and returns the node with the appropriate lock held.
84cafe5635SKent Overstreet  *
85cafe5635SKent Overstreet  * If, after traversing the btree, the insertion code discovers it has to split
86cafe5635SKent Overstreet  * then it must restart from the root and take new locks - to do this it changes
87cafe5635SKent Overstreet  * the lock field and returns -EINTR, which causes the btree_root() macro to
88cafe5635SKent Overstreet  * loop.
89cafe5635SKent Overstreet  *
90cafe5635SKent Overstreet  * Handling cache misses require a different mechanism for upgrading to a write
91cafe5635SKent Overstreet  * lock. We do cache lookups with only a read lock held, but if we get a cache
92cafe5635SKent Overstreet  * miss and we wish to insert this data into the cache, we have to insert a
93cafe5635SKent Overstreet  * placeholder key to detect races - otherwise, we could race with a write and
94cafe5635SKent Overstreet  * overwrite the data that was just written to the cache with stale data from
95cafe5635SKent Overstreet  * the backing device.
96cafe5635SKent Overstreet  *
97cafe5635SKent Overstreet  * For this we use a sequence number that write locks and unlocks increment - to
98cafe5635SKent Overstreet  * insert the check key it unlocks the btree node and then takes a write lock,
99cafe5635SKent Overstreet  * and fails if the sequence number doesn't match.
100cafe5635SKent Overstreet  */
101cafe5635SKent Overstreet 
102cafe5635SKent Overstreet #include "bset.h"
103cafe5635SKent Overstreet #include "debug.h"
104cafe5635SKent Overstreet 
105cafe5635SKent Overstreet struct btree_write {
106cafe5635SKent Overstreet 	atomic_t		*journal;
107cafe5635SKent Overstreet 
108cafe5635SKent Overstreet 	/* If btree_split() frees a btree node, it writes a new pointer to that
109cafe5635SKent Overstreet 	 * btree node indicating it was freed; it takes a refcount on
110cafe5635SKent Overstreet 	 * c->prio_blocked because we can't write the gens until the new
111cafe5635SKent Overstreet 	 * pointer is on disk. This allows btree_write_endio() to release the
112cafe5635SKent Overstreet 	 * refcount that btree_split() took.
113cafe5635SKent Overstreet 	 */
114cafe5635SKent Overstreet 	int			prio_blocked;
115cafe5635SKent Overstreet };
116cafe5635SKent Overstreet 
117cafe5635SKent Overstreet struct btree {
118cafe5635SKent Overstreet 	/* Hottest entries first */
119cafe5635SKent Overstreet 	struct hlist_node	hash;
120cafe5635SKent Overstreet 
121cafe5635SKent Overstreet 	/* Key/pointer for this btree node */
122cafe5635SKent Overstreet 	BKEY_PADDED(key);
123cafe5635SKent Overstreet 
124cafe5635SKent Overstreet 	/* Single bit - set when accessed, cleared by shrinker */
125cafe5635SKent Overstreet 	unsigned long		accessed;
126cafe5635SKent Overstreet 	unsigned long		seq;
127cafe5635SKent Overstreet 	struct rw_semaphore	lock;
128cafe5635SKent Overstreet 	struct cache_set	*c;
129d6fd3b11SKent Overstreet 	struct btree		*parent;
130cafe5635SKent Overstreet 
1312a285686SKent Overstreet 	struct mutex		write_lock;
1322a285686SKent Overstreet 
133cafe5635SKent Overstreet 	unsigned long		flags;
134cafe5635SKent Overstreet 	uint16_t		written;	/* would be nice to kill */
135cafe5635SKent Overstreet 	uint8_t			level;
136cafe5635SKent Overstreet 
137a85e968eSKent Overstreet 	struct btree_keys	keys;
138cafe5635SKent Overstreet 
13957943511SKent Overstreet 	/* For outstanding btree writes, used as a lock - protects write_idx */
140cb7a583eSKent Overstreet 	struct closure		io;
141cb7a583eSKent Overstreet 	struct semaphore	io_mutex;
142cafe5635SKent Overstreet 
143cafe5635SKent Overstreet 	struct list_head	list;
144cafe5635SKent Overstreet 	struct delayed_work	work;
145cafe5635SKent Overstreet 
146cafe5635SKent Overstreet 	struct btree_write	writes[2];
147cafe5635SKent Overstreet 	struct bio		*bio;
148cafe5635SKent Overstreet };
149cafe5635SKent Overstreet 
150cafe5635SKent Overstreet #define BTREE_FLAG(flag)						\
151cafe5635SKent Overstreet static inline bool btree_node_ ## flag(struct btree *b)			\
152cafe5635SKent Overstreet {	return test_bit(BTREE_NODE_ ## flag, &b->flags); }		\
153cafe5635SKent Overstreet 									\
154cafe5635SKent Overstreet static inline void set_btree_node_ ## flag(struct btree *b)		\
155*cbb751c0SShenghui Wang {	set_bit(BTREE_NODE_ ## flag, &b->flags); }
156cafe5635SKent Overstreet 
157cafe5635SKent Overstreet enum btree_flags {
158cafe5635SKent Overstreet 	BTREE_NODE_io_error,
159cafe5635SKent Overstreet 	BTREE_NODE_dirty,
160cafe5635SKent Overstreet 	BTREE_NODE_write_idx,
161cafe5635SKent Overstreet };
162cafe5635SKent Overstreet 
163cafe5635SKent Overstreet BTREE_FLAG(io_error);
164cafe5635SKent Overstreet BTREE_FLAG(dirty);
165cafe5635SKent Overstreet BTREE_FLAG(write_idx);
166cafe5635SKent Overstreet 
167cafe5635SKent Overstreet static inline struct btree_write *btree_current_write(struct btree *b)
168cafe5635SKent Overstreet {
169cafe5635SKent Overstreet 	return b->writes + btree_node_write_idx(b);
170cafe5635SKent Overstreet }
171cafe5635SKent Overstreet 
172cafe5635SKent Overstreet static inline struct btree_write *btree_prev_write(struct btree *b)
173cafe5635SKent Overstreet {
174cafe5635SKent Overstreet 	return b->writes + (btree_node_write_idx(b) ^ 1);
175cafe5635SKent Overstreet }
176cafe5635SKent Overstreet 
17788b9f8c4SKent Overstreet static inline struct bset *btree_bset_first(struct btree *b)
17888b9f8c4SKent Overstreet {
179a85e968eSKent Overstreet 	return b->keys.set->data;
18088b9f8c4SKent Overstreet }
18188b9f8c4SKent Overstreet 
182ee811287SKent Overstreet static inline struct bset *btree_bset_last(struct btree *b)
183ee811287SKent Overstreet {
184a85e968eSKent Overstreet 	return bset_tree_last(&b->keys)->data;
18588b9f8c4SKent Overstreet }
18688b9f8c4SKent Overstreet 
18788b9f8c4SKent Overstreet static inline unsigned bset_block_offset(struct btree *b, struct bset *i)
18888b9f8c4SKent Overstreet {
189a85e968eSKent Overstreet 	return bset_sector_offset(&b->keys, i) >> b->c->block_bits;
190cafe5635SKent Overstreet }
191cafe5635SKent Overstreet 
192cafe5635SKent Overstreet static inline void set_gc_sectors(struct cache_set *c)
193cafe5635SKent Overstreet {
194a1f0358bSKent Overstreet 	atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16);
195cafe5635SKent Overstreet }
196cafe5635SKent Overstreet 
1973a3b6a4eSKent Overstreet void bkey_put(struct cache_set *c, struct bkey *k);
198e7c590ebSKent Overstreet 
199cafe5635SKent Overstreet /* Looping macros */
200cafe5635SKent Overstreet 
201cafe5635SKent Overstreet #define for_each_cached_btree(b, c, iter)				\
202cafe5635SKent Overstreet 	for (iter = 0;							\
203cafe5635SKent Overstreet 	     iter < ARRAY_SIZE((c)->bucket_hash);			\
204cafe5635SKent Overstreet 	     iter++)							\
205cafe5635SKent Overstreet 		hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash)
206cafe5635SKent Overstreet 
207cafe5635SKent Overstreet /* Recursing down the btree */
208cafe5635SKent Overstreet 
209cafe5635SKent Overstreet struct btree_op {
21078365411SKent Overstreet 	/* for waiting on btree reserve in btree_split() */
211ac6424b9SIngo Molnar 	wait_queue_entry_t		wait;
21278365411SKent Overstreet 
213cafe5635SKent Overstreet 	/* Btree level at which we start taking write locks */
214cafe5635SKent Overstreet 	short			lock;
215cafe5635SKent Overstreet 
216cafe5635SKent Overstreet 	unsigned		insert_collision:1;
217cafe5635SKent Overstreet };
218cafe5635SKent Overstreet 
219b54d6934SKent Overstreet static inline void bch_btree_op_init(struct btree_op *op, int write_lock_level)
220b54d6934SKent Overstreet {
221b54d6934SKent Overstreet 	memset(op, 0, sizeof(struct btree_op));
22278365411SKent Overstreet 	init_wait(&op->wait);
223b54d6934SKent Overstreet 	op->lock = write_lock_level;
224b54d6934SKent Overstreet }
225cafe5635SKent Overstreet 
226cafe5635SKent Overstreet static inline void rw_lock(bool w, struct btree *b, int level)
227cafe5635SKent Overstreet {
228cafe5635SKent Overstreet 	w ? down_write_nested(&b->lock, level + 1)
229cafe5635SKent Overstreet 	  : down_read_nested(&b->lock, level + 1);
230cafe5635SKent Overstreet 	if (w)
231cafe5635SKent Overstreet 		b->seq++;
232cafe5635SKent Overstreet }
233cafe5635SKent Overstreet 
234cafe5635SKent Overstreet static inline void rw_unlock(bool w, struct btree *b)
235cafe5635SKent Overstreet {
236cafe5635SKent Overstreet 	if (w)
237cafe5635SKent Overstreet 		b->seq++;
238cafe5635SKent Overstreet 	(w ? up_write : up_read)(&b->lock);
239cafe5635SKent Overstreet }
240cafe5635SKent Overstreet 
24178b77bf8SKent Overstreet void bch_btree_node_read_done(struct btree *);
2422a285686SKent Overstreet void __bch_btree_node_write(struct btree *, struct closure *);
24357943511SKent Overstreet void bch_btree_node_write(struct btree *, struct closure *);
244cafe5635SKent Overstreet 
245cafe5635SKent Overstreet void bch_btree_set_root(struct btree *);
246c5aa4a31SSlava Pestov struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *,
2472452cc89SSlava Pestov 				     int, bool, struct btree *);
2480a63b66dSKent Overstreet struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *,
2492452cc89SSlava Pestov 				 struct bkey *, int, bool, struct btree *);
250cafe5635SKent Overstreet 
251e7c590ebSKent Overstreet int bch_btree_insert_check_key(struct btree *, struct btree_op *,
252e7c590ebSKent Overstreet 			       struct bkey *);
253cc7b8819SKent Overstreet int bch_btree_insert(struct cache_set *, struct keylist *,
254cc7b8819SKent Overstreet 		     atomic_t *, struct bkey *);
255cafe5635SKent Overstreet 
25672a44517SKent Overstreet int bch_gc_thread_start(struct cache_set *);
2572531d9eeSKent Overstreet void bch_initial_gc_finish(struct cache_set *);
25872a44517SKent Overstreet void bch_moving_gc(struct cache_set *);
259c18536a7SKent Overstreet int bch_btree_check(struct cache_set *);
260487dded8SKent Overstreet void bch_initial_mark_key(struct cache_set *, int, struct bkey *);
261cafe5635SKent Overstreet 
26272a44517SKent Overstreet static inline void wake_up_gc(struct cache_set *c)
26372a44517SKent Overstreet {
264be628be0SKent Overstreet 	wake_up(&c->gc_wait);
26572a44517SKent Overstreet }
26672a44517SKent Overstreet 
26748dad8baSKent Overstreet #define MAP_DONE	0
26848dad8baSKent Overstreet #define MAP_CONTINUE	1
26948dad8baSKent Overstreet 
27048dad8baSKent Overstreet #define MAP_ALL_NODES	0
27148dad8baSKent Overstreet #define MAP_LEAF_NODES	1
27248dad8baSKent Overstreet 
27348dad8baSKent Overstreet #define MAP_END_KEY	1
27448dad8baSKent Overstreet 
27548dad8baSKent Overstreet typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *);
27648dad8baSKent Overstreet int __bch_btree_map_nodes(struct btree_op *, struct cache_set *,
27748dad8baSKent Overstreet 			  struct bkey *, btree_map_nodes_fn *, int);
27848dad8baSKent Overstreet 
27948dad8baSKent Overstreet static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
28048dad8baSKent Overstreet 				      struct bkey *from, btree_map_nodes_fn *fn)
28148dad8baSKent Overstreet {
28248dad8baSKent Overstreet 	return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES);
28348dad8baSKent Overstreet }
28448dad8baSKent Overstreet 
28548dad8baSKent Overstreet static inline int bch_btree_map_leaf_nodes(struct btree_op *op,
28648dad8baSKent Overstreet 					   struct cache_set *c,
28748dad8baSKent Overstreet 					   struct bkey *from,
28848dad8baSKent Overstreet 					   btree_map_nodes_fn *fn)
28948dad8baSKent Overstreet {
29048dad8baSKent Overstreet 	return __bch_btree_map_nodes(op, c, from, fn, MAP_LEAF_NODES);
29148dad8baSKent Overstreet }
29248dad8baSKent Overstreet 
29348dad8baSKent Overstreet typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *,
29448dad8baSKent Overstreet 				struct bkey *);
29548dad8baSKent Overstreet int bch_btree_map_keys(struct btree_op *, struct cache_set *,
29648dad8baSKent Overstreet 		       struct bkey *, btree_map_keys_fn *, int);
29748dad8baSKent Overstreet 
29848dad8baSKent Overstreet typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *);
29948dad8baSKent Overstreet 
30072c27061SKent Overstreet void bch_keybuf_init(struct keybuf *);
30148dad8baSKent Overstreet void bch_refill_keybuf(struct cache_set *, struct keybuf *,
30248dad8baSKent Overstreet 		       struct bkey *, keybuf_pred_fn *);
303cafe5635SKent Overstreet bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *,
304cafe5635SKent Overstreet 				  struct bkey *);
305cafe5635SKent Overstreet void bch_keybuf_del(struct keybuf *, struct keybuf_key *);
306cafe5635SKent Overstreet struct keybuf_key *bch_keybuf_next(struct keybuf *);
30772c27061SKent Overstreet struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *,
30872c27061SKent Overstreet 					  struct bkey *, keybuf_pred_fn *);
309d44c2f9eSTang Junhui void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats);
310cafe5635SKent Overstreet #endif
311