1b2441318SGreg Kroah-Hartman /* SPDX-License-Identifier: GPL-2.0 */ 2cafe5635SKent Overstreet #ifndef _BCACHE_BTREE_H 3cafe5635SKent Overstreet #define _BCACHE_BTREE_H 4cafe5635SKent Overstreet 5cafe5635SKent Overstreet /* 6cafe5635SKent Overstreet * THE BTREE: 7cafe5635SKent Overstreet * 8cafe5635SKent Overstreet * At a high level, bcache's btree is relatively standard b+ tree. All keys and 9cafe5635SKent Overstreet * pointers are in the leaves; interior nodes only have pointers to the child 10cafe5635SKent Overstreet * nodes. 11cafe5635SKent Overstreet * 12cafe5635SKent Overstreet * In the interior nodes, a struct bkey always points to a child btree node, and 13cafe5635SKent Overstreet * the key is the highest key in the child node - except that the highest key in 14cafe5635SKent Overstreet * an interior node is always MAX_KEY. The size field refers to the size on disk 15cafe5635SKent Overstreet * of the child node - this would allow us to have variable sized btree nodes 16cafe5635SKent Overstreet * (handy for keeping the depth of the btree 1 by expanding just the root). 17cafe5635SKent Overstreet * 18cafe5635SKent Overstreet * Btree nodes are themselves log structured, but this is hidden fairly 19cafe5635SKent Overstreet * thoroughly. Btree nodes on disk will in practice have extents that overlap 20cafe5635SKent Overstreet * (because they were written at different times), but in memory we never have 21cafe5635SKent Overstreet * overlapping extents - when we read in a btree node from disk, the first thing 22cafe5635SKent Overstreet * we do is resort all the sets of keys with a mergesort, and in the same pass 23cafe5635SKent Overstreet * we check for overlapping extents and adjust them appropriately. 24cafe5635SKent Overstreet * 25cafe5635SKent Overstreet * struct btree_op is a central interface to the btree code. It's used for 26cafe5635SKent Overstreet * specifying read vs. 
write locking, and the embedded closure is used for 27cafe5635SKent Overstreet * waiting on IO or reserve memory. 28cafe5635SKent Overstreet * 29cafe5635SKent Overstreet * BTREE CACHE: 30cafe5635SKent Overstreet * 31cafe5635SKent Overstreet * Btree nodes are cached in memory; traversing the btree might require reading 32cafe5635SKent Overstreet * in btree nodes which is handled mostly transparently. 33cafe5635SKent Overstreet * 34cafe5635SKent Overstreet * bch_btree_node_get() looks up a btree node in the cache and reads it in from 35cafe5635SKent Overstreet * disk if necessary. This function is almost never called directly though - the 36cafe5635SKent Overstreet * btree() macro is used to get a btree node, call some function on it, and 37cafe5635SKent Overstreet * unlock the node after the function returns. 38cafe5635SKent Overstreet * 39cafe5635SKent Overstreet * The root is special cased - it's taken out of the cache's lru (thus pinning 40cafe5635SKent Overstreet * it in memory), so we can find the root of the btree by just dereferencing a 41cafe5635SKent Overstreet * pointer instead of looking it up in the cache. This makes locking a bit 42cafe5635SKent Overstreet * tricky, since the root pointer is protected by the lock in the btree node it 43cafe5635SKent Overstreet * points to - the btree_root() macro handles this. 44cafe5635SKent Overstreet * 45cafe5635SKent Overstreet * In various places we must be able to allocate memory for multiple btree nodes 46cafe5635SKent Overstreet * in order to make forward progress. To do this we use the btree cache itself 47cafe5635SKent Overstreet * as a reserve; if __get_free_pages() fails, we'll find a node in the btree 48cafe5635SKent Overstreet * cache we can reuse. We can't allow more than one thread to be doing this at a 49cafe5635SKent Overstreet * time, so there's a lock, implemented by a pointer to the btree_op closure - 50cafe5635SKent Overstreet * this allows the btree_root() macro to implicitly release this lock. 
51cafe5635SKent Overstreet * 52cafe5635SKent Overstreet * BTREE IO: 53cafe5635SKent Overstreet * 54cafe5635SKent Overstreet * Btree nodes never have to be explicitly read in; bch_btree_node_get() handles 55cafe5635SKent Overstreet * this. 56cafe5635SKent Overstreet * 57cafe5635SKent Overstreet * For writing, we have two btree_write structs embedded in struct btree - one 58cafe5635SKent Overstreet * write in flight, and one being set up, and we toggle between them. 59cafe5635SKent Overstreet * 60cafe5635SKent Overstreet * Writing is done with a single function - bch_btree_write() really serves two 61cafe5635SKent Overstreet * different purposes and should be broken up into two different functions. When 62cafe5635SKent Overstreet * passing now = false, it merely indicates that the node is now dirty - calling 63cafe5635SKent Overstreet * it ensures that the dirty keys will be written at some point in the future. 64cafe5635SKent Overstreet * 65cafe5635SKent Overstreet * When passing now = true, bch_btree_write() causes a write to happen 66cafe5635SKent Overstreet * "immediately" (if there was already a write in flight, it'll cause the write 67cafe5635SKent Overstreet * to happen as soon as the previous write completes). It returns immediately 68cafe5635SKent Overstreet * though - but it takes a refcount on the closure in struct btree_op you passed 69cafe5635SKent Overstreet * to it, so a closure_sync() later can be used to wait for the write to 70cafe5635SKent Overstreet * complete. 71cafe5635SKent Overstreet * 72cafe5635SKent Overstreet * This is handy because btree_split() and garbage collection can issue writes 73cafe5635SKent Overstreet * in parallel, reducing the amount of time they have to hold write locks. 
74cafe5635SKent Overstreet * 75cafe5635SKent Overstreet * LOCKING: 76cafe5635SKent Overstreet * 77cafe5635SKent Overstreet * When traversing the btree, we may need write locks starting at some level - 78cafe5635SKent Overstreet * inserting a key into the btree will typically only require a write lock on 79cafe5635SKent Overstreet * the leaf node. 80cafe5635SKent Overstreet * 81cafe5635SKent Overstreet * This is specified with the lock field in struct btree_op; lock = 0 means we 82cafe5635SKent Overstreet * take write locks at level <= 0, i.e. only leaf nodes. bch_btree_node_get() 83cafe5635SKent Overstreet * checks this field and returns the node with the appropriate lock held. 84cafe5635SKent Overstreet * 85cafe5635SKent Overstreet * If, after traversing the btree, the insertion code discovers it has to split 86cafe5635SKent Overstreet * then it must restart from the root and take new locks - to do this it changes 87cafe5635SKent Overstreet * the lock field and returns -EINTR, which causes the btree_root() macro to 88cafe5635SKent Overstreet * loop. 89cafe5635SKent Overstreet * 90cafe5635SKent Overstreet * Handling cache misses require a different mechanism for upgrading to a write 91cafe5635SKent Overstreet * lock. We do cache lookups with only a read lock held, but if we get a cache 92cafe5635SKent Overstreet * miss and we wish to insert this data into the cache, we have to insert a 93cafe5635SKent Overstreet * placeholder key to detect races - otherwise, we could race with a write and 94cafe5635SKent Overstreet * overwrite the data that was just written to the cache with stale data from 95cafe5635SKent Overstreet * the backing device. 96cafe5635SKent Overstreet * 97cafe5635SKent Overstreet * For this we use a sequence number that write locks and unlocks increment - to 98cafe5635SKent Overstreet * insert the check key it unlocks the btree node and then takes a write lock, 99cafe5635SKent Overstreet * and fails if the sequence number doesn't match. 
100cafe5635SKent Overstreet */ 101cafe5635SKent Overstreet 102cafe5635SKent Overstreet #include "bset.h" 103cafe5635SKent Overstreet #include "debug.h" 104cafe5635SKent Overstreet 105cafe5635SKent Overstreet struct btree_write { 106cafe5635SKent Overstreet atomic_t *journal; 107cafe5635SKent Overstreet 108cafe5635SKent Overstreet /* If btree_split() frees a btree node, it writes a new pointer to that 109cafe5635SKent Overstreet * btree node indicating it was freed; it takes a refcount on 110cafe5635SKent Overstreet * c->prio_blocked because we can't write the gens until the new 111cafe5635SKent Overstreet * pointer is on disk. This allows btree_write_endio() to release the 112cafe5635SKent Overstreet * refcount that btree_split() took. 113cafe5635SKent Overstreet */ 114cafe5635SKent Overstreet int prio_blocked; 115cafe5635SKent Overstreet }; 116cafe5635SKent Overstreet 117cafe5635SKent Overstreet struct btree { 118cafe5635SKent Overstreet /* Hottest entries first */ 119cafe5635SKent Overstreet struct hlist_node hash; 120cafe5635SKent Overstreet 121cafe5635SKent Overstreet /* Key/pointer for this btree node */ 122cafe5635SKent Overstreet BKEY_PADDED(key); 123cafe5635SKent Overstreet 124cafe5635SKent Overstreet /* Single bit - set when accessed, cleared by shrinker */ 125cafe5635SKent Overstreet unsigned long accessed; 126cafe5635SKent Overstreet unsigned long seq; 127cafe5635SKent Overstreet struct rw_semaphore lock; 128cafe5635SKent Overstreet struct cache_set *c; 129d6fd3b11SKent Overstreet struct btree *parent; 130cafe5635SKent Overstreet 1312a285686SKent Overstreet struct mutex write_lock; 1322a285686SKent Overstreet 133cafe5635SKent Overstreet unsigned long flags; 134cafe5635SKent Overstreet uint16_t written; /* would be nice to kill */ 135cafe5635SKent Overstreet uint8_t level; 136cafe5635SKent Overstreet 137a85e968eSKent Overstreet struct btree_keys keys; 138cafe5635SKent Overstreet 13957943511SKent Overstreet /* For outstanding btree writes, used as a lock - 
protects write_idx */ 140cb7a583eSKent Overstreet struct closure io; 141cb7a583eSKent Overstreet struct semaphore io_mutex; 142cafe5635SKent Overstreet 143cafe5635SKent Overstreet struct list_head list; 144cafe5635SKent Overstreet struct delayed_work work; 145cafe5635SKent Overstreet 146cafe5635SKent Overstreet struct btree_write writes[2]; 147cafe5635SKent Overstreet struct bio *bio; 148cafe5635SKent Overstreet }; 149cafe5635SKent Overstreet 150cafe5635SKent Overstreet #define BTREE_FLAG(flag) \ 151cafe5635SKent Overstreet static inline bool btree_node_ ## flag(struct btree *b) \ 152cafe5635SKent Overstreet { return test_bit(BTREE_NODE_ ## flag, &b->flags); } \ 153cafe5635SKent Overstreet \ 154cafe5635SKent Overstreet static inline void set_btree_node_ ## flag(struct btree *b) \ 155*cbb751c0SShenghui Wang { set_bit(BTREE_NODE_ ## flag, &b->flags); } 156cafe5635SKent Overstreet 157cafe5635SKent Overstreet enum btree_flags { 158cafe5635SKent Overstreet BTREE_NODE_io_error, 159cafe5635SKent Overstreet BTREE_NODE_dirty, 160cafe5635SKent Overstreet BTREE_NODE_write_idx, 161cafe5635SKent Overstreet }; 162cafe5635SKent Overstreet 163cafe5635SKent Overstreet BTREE_FLAG(io_error); 164cafe5635SKent Overstreet BTREE_FLAG(dirty); 165cafe5635SKent Overstreet BTREE_FLAG(write_idx); 166cafe5635SKent Overstreet 167cafe5635SKent Overstreet static inline struct btree_write *btree_current_write(struct btree *b) 168cafe5635SKent Overstreet { 169cafe5635SKent Overstreet return b->writes + btree_node_write_idx(b); 170cafe5635SKent Overstreet } 171cafe5635SKent Overstreet 172cafe5635SKent Overstreet static inline struct btree_write *btree_prev_write(struct btree *b) 173cafe5635SKent Overstreet { 174cafe5635SKent Overstreet return b->writes + (btree_node_write_idx(b) ^ 1); 175cafe5635SKent Overstreet } 176cafe5635SKent Overstreet 17788b9f8c4SKent Overstreet static inline struct bset *btree_bset_first(struct btree *b) 17888b9f8c4SKent Overstreet { 179a85e968eSKent Overstreet return 
b->keys.set->data; 18088b9f8c4SKent Overstreet } 18188b9f8c4SKent Overstreet 182ee811287SKent Overstreet static inline struct bset *btree_bset_last(struct btree *b) 183ee811287SKent Overstreet { 184a85e968eSKent Overstreet return bset_tree_last(&b->keys)->data; 18588b9f8c4SKent Overstreet } 18688b9f8c4SKent Overstreet 18788b9f8c4SKent Overstreet static inline unsigned bset_block_offset(struct btree *b, struct bset *i) 18888b9f8c4SKent Overstreet { 189a85e968eSKent Overstreet return bset_sector_offset(&b->keys, i) >> b->c->block_bits; 190cafe5635SKent Overstreet } 191cafe5635SKent Overstreet 192cafe5635SKent Overstreet static inline void set_gc_sectors(struct cache_set *c) 193cafe5635SKent Overstreet { 194a1f0358bSKent Overstreet atomic_set(&c->sectors_to_gc, c->sb.bucket_size * c->nbuckets / 16); 195cafe5635SKent Overstreet } 196cafe5635SKent Overstreet 1973a3b6a4eSKent Overstreet void bkey_put(struct cache_set *c, struct bkey *k); 198e7c590ebSKent Overstreet 199cafe5635SKent Overstreet /* Looping macros */ 200cafe5635SKent Overstreet 201cafe5635SKent Overstreet #define for_each_cached_btree(b, c, iter) \ 202cafe5635SKent Overstreet for (iter = 0; \ 203cafe5635SKent Overstreet iter < ARRAY_SIZE((c)->bucket_hash); \ 204cafe5635SKent Overstreet iter++) \ 205cafe5635SKent Overstreet hlist_for_each_entry_rcu((b), (c)->bucket_hash + iter, hash) 206cafe5635SKent Overstreet 207cafe5635SKent Overstreet /* Recursing down the btree */ 208cafe5635SKent Overstreet 209cafe5635SKent Overstreet struct btree_op { 21078365411SKent Overstreet /* for waiting on btree reserve in btree_split() */ 211ac6424b9SIngo Molnar wait_queue_entry_t wait; 21278365411SKent Overstreet 213cafe5635SKent Overstreet /* Btree level at which we start taking write locks */ 214cafe5635SKent Overstreet short lock; 215cafe5635SKent Overstreet 216cafe5635SKent Overstreet unsigned insert_collision:1; 217cafe5635SKent Overstreet }; 218cafe5635SKent Overstreet 219b54d6934SKent Overstreet static inline void 
bch_btree_op_init(struct btree_op *op, int write_lock_level) 220b54d6934SKent Overstreet { 221b54d6934SKent Overstreet memset(op, 0, sizeof(struct btree_op)); 22278365411SKent Overstreet init_wait(&op->wait); 223b54d6934SKent Overstreet op->lock = write_lock_level; 224b54d6934SKent Overstreet } 225cafe5635SKent Overstreet 226cafe5635SKent Overstreet static inline void rw_lock(bool w, struct btree *b, int level) 227cafe5635SKent Overstreet { 228cafe5635SKent Overstreet w ? down_write_nested(&b->lock, level + 1) 229cafe5635SKent Overstreet : down_read_nested(&b->lock, level + 1); 230cafe5635SKent Overstreet if (w) 231cafe5635SKent Overstreet b->seq++; 232cafe5635SKent Overstreet } 233cafe5635SKent Overstreet 234cafe5635SKent Overstreet static inline void rw_unlock(bool w, struct btree *b) 235cafe5635SKent Overstreet { 236cafe5635SKent Overstreet if (w) 237cafe5635SKent Overstreet b->seq++; 238cafe5635SKent Overstreet (w ? up_write : up_read)(&b->lock); 239cafe5635SKent Overstreet } 240cafe5635SKent Overstreet 24178b77bf8SKent Overstreet void bch_btree_node_read_done(struct btree *); 2422a285686SKent Overstreet void __bch_btree_node_write(struct btree *, struct closure *); 24357943511SKent Overstreet void bch_btree_node_write(struct btree *, struct closure *); 244cafe5635SKent Overstreet 245cafe5635SKent Overstreet void bch_btree_set_root(struct btree *); 246c5aa4a31SSlava Pestov struct btree *__bch_btree_node_alloc(struct cache_set *, struct btree_op *, 2472452cc89SSlava Pestov int, bool, struct btree *); 2480a63b66dSKent Overstreet struct btree *bch_btree_node_get(struct cache_set *, struct btree_op *, 2492452cc89SSlava Pestov struct bkey *, int, bool, struct btree *); 250cafe5635SKent Overstreet 251e7c590ebSKent Overstreet int bch_btree_insert_check_key(struct btree *, struct btree_op *, 252e7c590ebSKent Overstreet struct bkey *); 253cc7b8819SKent Overstreet int bch_btree_insert(struct cache_set *, struct keylist *, 254cc7b8819SKent Overstreet atomic_t *, struct 
bkey *); 255cafe5635SKent Overstreet 25672a44517SKent Overstreet int bch_gc_thread_start(struct cache_set *); 2572531d9eeSKent Overstreet void bch_initial_gc_finish(struct cache_set *); 25872a44517SKent Overstreet void bch_moving_gc(struct cache_set *); 259c18536a7SKent Overstreet int bch_btree_check(struct cache_set *); 260487dded8SKent Overstreet void bch_initial_mark_key(struct cache_set *, int, struct bkey *); 261cafe5635SKent Overstreet 26272a44517SKent Overstreet static inline void wake_up_gc(struct cache_set *c) 26372a44517SKent Overstreet { 264be628be0SKent Overstreet wake_up(&c->gc_wait); 26572a44517SKent Overstreet } 26672a44517SKent Overstreet 26748dad8baSKent Overstreet #define MAP_DONE 0 26848dad8baSKent Overstreet #define MAP_CONTINUE 1 26948dad8baSKent Overstreet 27048dad8baSKent Overstreet #define MAP_ALL_NODES 0 27148dad8baSKent Overstreet #define MAP_LEAF_NODES 1 27248dad8baSKent Overstreet 27348dad8baSKent Overstreet #define MAP_END_KEY 1 27448dad8baSKent Overstreet 27548dad8baSKent Overstreet typedef int (btree_map_nodes_fn)(struct btree_op *, struct btree *); 27648dad8baSKent Overstreet int __bch_btree_map_nodes(struct btree_op *, struct cache_set *, 27748dad8baSKent Overstreet struct bkey *, btree_map_nodes_fn *, int); 27848dad8baSKent Overstreet 27948dad8baSKent Overstreet static inline int bch_btree_map_nodes(struct btree_op *op, struct cache_set *c, 28048dad8baSKent Overstreet struct bkey *from, btree_map_nodes_fn *fn) 28148dad8baSKent Overstreet { 28248dad8baSKent Overstreet return __bch_btree_map_nodes(op, c, from, fn, MAP_ALL_NODES); 28348dad8baSKent Overstreet } 28448dad8baSKent Overstreet 28548dad8baSKent Overstreet static inline int bch_btree_map_leaf_nodes(struct btree_op *op, 28648dad8baSKent Overstreet struct cache_set *c, 28748dad8baSKent Overstreet struct bkey *from, 28848dad8baSKent Overstreet btree_map_nodes_fn *fn) 28948dad8baSKent Overstreet { 29048dad8baSKent Overstreet return __bch_btree_map_nodes(op, c, from, fn, 
MAP_LEAF_NODES); 29148dad8baSKent Overstreet } 29248dad8baSKent Overstreet 29348dad8baSKent Overstreet typedef int (btree_map_keys_fn)(struct btree_op *, struct btree *, 29448dad8baSKent Overstreet struct bkey *); 29548dad8baSKent Overstreet int bch_btree_map_keys(struct btree_op *, struct cache_set *, 29648dad8baSKent Overstreet struct bkey *, btree_map_keys_fn *, int); 29748dad8baSKent Overstreet 29848dad8baSKent Overstreet typedef bool (keybuf_pred_fn)(struct keybuf *, struct bkey *); 29948dad8baSKent Overstreet 30072c27061SKent Overstreet void bch_keybuf_init(struct keybuf *); 30148dad8baSKent Overstreet void bch_refill_keybuf(struct cache_set *, struct keybuf *, 30248dad8baSKent Overstreet struct bkey *, keybuf_pred_fn *); 303cafe5635SKent Overstreet bool bch_keybuf_check_overlapping(struct keybuf *, struct bkey *, 304cafe5635SKent Overstreet struct bkey *); 305cafe5635SKent Overstreet void bch_keybuf_del(struct keybuf *, struct keybuf_key *); 306cafe5635SKent Overstreet struct keybuf_key *bch_keybuf_next(struct keybuf *); 30772c27061SKent Overstreet struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *, struct keybuf *, 30872c27061SKent Overstreet struct bkey *, keybuf_pred_fn *); 309d44c2f9eSTang Junhui void bch_update_bucket_in_use(struct cache_set *c, struct gc_stat *stats); 310cafe5635SKent Overstreet #endif 311