/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/bcache.txt.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "extents.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/freezer.h>
#include <linux/hash.h>
#include <linux/kthread.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>

/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Journalling:
 *   Check for bad keys in replay
 *   Propagate barriers
 *   Refcount journal entries in journal_replay
 *
 * Garbage collection:
 *   Finish incremental gc
 *   Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being starved
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Plugging?
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */

#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

#define PTR_HASH(c, k)							\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))

#define insert_lock(s, b)	((b)->level <= (s)->lock)

/*
 * These macros are for recursing down the btree - they handle the details of
 * locking and looking up nodes in the cache for you. They're best treated as
 * mere syntax when reading code that uses them.
 *
 * op->lock determines whether we take a read or a write lock at a given depth.
 * If you've got a read lock and find that you need a write lock (i.e. you're
 * going to have to split), set op->lock and return -EINTR; btree_root() will
 * call you again and you'll have the correct lock.
 */

/**
 * btree - recurse down the btree on a specified key
 * @fn:		function to call, which will be passed the child node
 * @key:	key to recurse on
 * @b:		parent btree node
 * @op:		pointer to struct btree_op
 */
#define btree(fn, key, b, op, ...)					\
({									\
	int _r, l = (b)->level - 1;					\
	bool _w = l <= (op)->lock;					\
	struct btree *_child = bch_btree_node_get((b)->c, op, key, l, _w);\
	if (!IS_ERR(_child)) {						\
		_child->parent = (b);					\
		_r = bch_btree_ ## fn(_child, op, ##__VA_ARGS__);	\
		rw_unlock(_w, _child);					\
	} else								\
		_r = PTR_ERR(_child);					\
	_r;								\
})

/**
 * btree_root - call a function on the root of the btree
 * @fn:		function to call, which will be passed the child node
 * @c:		cache set
 * @op:		pointer to struct btree_op
 */
#define btree_root(fn, c, op, ...)					\
({									\
	int _r = -EINTR;						\
	do {								\
		struct btree *_b = (c)->root;				\
		bool _w = insert_lock(op, _b);				\
		rw_lock(_w, _b, _b->level);				\
		if (_b == (c)->root &&					\
		    _w == insert_lock(op, _b)) {			\
			_b->parent = NULL;				\
			_r = bch_btree_ ## fn(_b, op, ##__VA_ARGS__);	\
		}							\
		rw_unlock(_w, _b);					\
		bch_cannibalize_unlock(c);				\
		if (_r == -EINTR)					\
			schedule();					\
	} while (_r == -EINTR);						\
									\
	finish_wait(&(c)->btree_cache_wait, &(op)->wait);		\
	_r;								\
})
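/*
 * Example usage (illustrative; my_fn/bch_btree_my_fn are hypothetical -
 * compare bch_btree_check() and bch_btree_map_keys_recurse() later in this
 * file). The macros paste the "my_fn" suffix onto bch_btree_ to form the
 * function they call:
 *
 *	static int bch_btree_my_fn(struct btree *b, struct btree_op *op)
 *	{
 *		...
 *		ret = btree(my_fn, k, b, op);	// lock + visit the child for k
 *		...
 *	}
 *
 *	ret = btree_root(my_fn, c, op);		// kick off at the root
 */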
static inline struct bset *write_block(struct btree *b)
{
	return ((void *) btree_bset_first(b)) + b->written * block_bytes(b->c);
}

static void bch_btree_init_next(struct btree *b)
{
	/* If not a leaf node, always sort */
	if (b->level && b->keys.nsets)
		bch_btree_sort(&b->keys, &b->c->sort);
	else
		bch_btree_sort_lazy(&b->keys, &b->c->sort);

	if (b->written < btree_blocks(b))
		bch_bset_init_next(&b->keys, write_block(b),
				   bset_magic(&b->c->sb));

}

/* Btree key manipulation */

void bkey_put(struct cache_set *c, struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i))
			atomic_dec_bug(&PTR_BUCKET(c, k, i)->pin);
}

/* Btree IO */

static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
	void *data = (void *) i + 8, *end = bset_bkey_last(i);

	crc = bch_crc64_update(crc, data, end - data);
	return crc ^ 0xffffffffffffffffULL;
}

void bch_btree_node_read_done(struct btree *b)
{
	const char *err = "bad btree header";
	struct bset *i = btree_bset_first(b);
	struct btree_iter *iter;

	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
	iter->used = 0;

#ifdef CONFIG_BCACHE_DEBUG
	iter->b = &b->keys;
#endif

	if (!i->seq)
		goto err;

	for (;
	     b->written < btree_blocks(b) && i->seq == b->keys.set[0].data->seq;
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
		if (b->written + set_blocks(i, block_bytes(b->c)) >
		    btree_blocks(b))
			goto err;

		err = "bad magic";
		if (i->magic != bset_magic(&b->c->sb))
			goto err;

		err = "bad checksum";
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}

		err = "empty set";
		if (i != b->keys.set[0].data && !i->keys)
			goto err;

		bch_btree_iter_push(iter, i->start, bset_bkey_last(i));

		b->written += set_blocks(i, block_bytes(b->c));
	}

	err = "corrupted btree";
	for (i = write_block(b);
	     bset_sector_offset(&b->keys, i) < KEY_SIZE(&b->key);
	     i = ((void *) i) + block_bytes(b->c))
		if (i->seq == b->keys.set[0].data->seq)
			goto err;

	bch_btree_sort_and_fix_extents(&b->keys, iter, &b->c->sort);

	i = b->keys.set[0].data;
	err = "short btree key";
	if (b->keys.set[0].size &&
	    bkey_cmp(&b->key, &b->keys.set[0].end) < 0)
		goto err;

	if (b->written < btree_blocks(b))
		bch_bset_init_next(&b->keys, write_block(b),
				   bset_magic(&b->c->sb));
out:
	mempool_free(iter, b->c->fill_iter);
	return;
err:
	set_btree_node_io_error(b);
	bch_cache_set_error(b->c, "%s at bucket %zu, block %u, %u keys",
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
			    bset_block_offset(b, i), i->keys);
	goto out;
}
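/*
 * Summary of the read path above: a btree node bucket holds a sequence of
 * bsets written one after another. bch_btree_node_read_done() walks them in
 * order, validating version, magic and checksum (note that btree_csum_set()
 * skips the first 8 bytes of the bset - the csum field itself), pushes each
 * onto a btree_iter, then merge-sorts everything back into a single
 * in-memory set with bch_btree_sort_and_fix_extents().
 */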
static void btree_node_read_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	closure_put(cl);
}

static void bch_btree_node_read(struct btree *b)
{
	uint64_t start_time = local_clock();
	struct closure cl;
	struct bio *bio;

	trace_bcache_btree_read(b);

	closure_init_stack(&cl);

	bio = bch_bbio_alloc(b->c);
	bio->bi_rw	= REQ_META|READ_SYNC;
	bio->bi_iter.bi_size = KEY_SIZE(&b->key) << 9;
	bio->bi_end_io	= btree_node_read_endio;
	bio->bi_private	= &cl;

	bch_bio_map(bio, b->keys.set[0].data);

	bch_submit_bbio(bio, b->c, &b->key, 0);
	closure_sync(&cl);

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		set_btree_node_io_error(b);

	bch_bbio_free(bio, b->c);

	if (btree_node_io_error(b))
		goto err;

	bch_btree_node_read_done(b);
	bch_time_stats_update(&b->c->btree_read_time, start_time);

	return;
err:
	bch_cache_set_error(b->c, "io error reading bucket %zu",
			    PTR_BUCKET_NR(b->c, &b->key, 0));
}

static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
		wake_up_allocators(b->c);

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	w->prio_blocked	= 0;
	w->journal	= NULL;
}

static void btree_node_write_unlock(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);

	up(&b->io_mutex);
}

static void __btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	if (btree_node_dirty(b))
		schedule_delayed_work(&b->work, 30 * HZ);

	closure_return_with_destructor(cl, btree_node_write_unlock);
}

static void btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io);
	struct bio_vec *bv;
	int n;

	bio_for_each_segment_all(bv, b->bio, n)
		__free_page(bv->bv_page);

	__btree_node_write_done(cl);
}

static void btree_node_write_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct btree *b = container_of(cl, struct btree, io);

	if (error)
		set_btree_node_io_error(b);

	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
	closure_put(cl);
}
static void do_btree_node_write(struct btree *b)
{
	struct closure *cl = &b->io;
	struct bset *i = btree_bset_last(b);
	BKEY_PADDED(key) k;

	i->version	= BCACHE_BSET_VERSION;
	i->csum		= btree_csum_set(b, i);

	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io	= btree_node_write_endio;
	b->bio->bi_private	= cl;
	b->bio->bi_rw		= REQ_META|WRITE_SYNC|REQ_FUA;
	b->bio->bi_iter.bi_size	= roundup(set_bytes(i), block_bytes(b->c));
	bch_bio_map(b->bio, i);

	/*
	 * If we're appending to a leaf node, we don't technically need FUA -
	 * this write just needs to be persisted before the next journal write,
	 * which will be marked FLUSH|FUA.
	 *
	 * Similarly if we're writing a new btree root - the pointer is going to
	 * be in the next journal entry.
	 *
	 * But if we're writing a new btree node (that isn't a root) or
	 * appending to a non leaf btree node, we need either FUA or a flush
	 * when we write the parent with the new pointer. FUA is cheaper than a
	 * flush, and writes appending to leaf nodes aren't blocking anything so
	 * just make all btree node writes FUA to keep things sane.
	 */

	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) +
		       bset_sector_offset(&b->keys, i));

	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
		int j;
		struct bio_vec *bv;
		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));

		bio_for_each_segment_all(bv, b->bio, j)
			memcpy(page_address(bv->bv_page),
			       base + j * PAGE_SIZE, PAGE_SIZE);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		continue_at(cl, btree_node_write_done, NULL);
	} else {
		b->bio->bi_vcnt = 0;
		bch_bio_map(b->bio, i);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		closure_sync(cl);
		continue_at_nobarrier(cl, __btree_node_write_done, NULL);
	}
}

void __bch_btree_node_write(struct btree *b, struct closure *parent)
{
	struct bset *i = btree_bset_last(b);

	lockdep_assert_held(&b->write_lock);

	trace_bcache_btree_write(b);

	BUG_ON(current->bio_list);
	BUG_ON(b->written >= btree_blocks(b));
	BUG_ON(b->written && !i->keys);
	BUG_ON(btree_bset_first(b)->seq != i->seq);
	bch_check_keys(&b->keys, "writing");

	cancel_delayed_work(&b->work);

	/* If caller isn't waiting for write, parent refcount is cache set */
	down(&b->io_mutex);
	closure_init(&b->io, parent ?: &b->c->cl);

	clear_bit(BTREE_NODE_dirty,	 &b->flags);
	change_bit(BTREE_NODE_write_idx, &b->flags);

	do_btree_node_write(b);

	atomic_long_add(set_blocks(i, block_bytes(b->c)) * b->c->sb.block_size,
			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

	b->written += set_blocks(i, block_bytes(b->c));
}

void bch_btree_node_write(struct btree *b, struct closure *parent)
{
	unsigned nsets = b->keys.nsets;

	lockdep_assert_held(&b->lock);

	__bch_btree_node_write(b, parent);

	/*
	 * do verify if there was more than one set initially (i.e. we did a
	 * sort) and we sorted down to a single set:
	 */
	if (nsets && !b->keys.nsets)
		bch_btree_verify(b);

	bch_btree_init_next(b);
}

static void bch_btree_node_write_sync(struct btree *b)
{
	struct closure cl;

	closure_init_stack(&cl);

	mutex_lock(&b->write_lock);
	bch_btree_node_write(b, &cl);
	mutex_unlock(&b->write_lock);

	closure_sync(&cl);
}

static void btree_node_write_work(struct work_struct *w)
{
	struct btree *b = container_of(to_delayed_work(w), struct btree, work);

	mutex_lock(&b->write_lock);
	if (btree_node_dirty(b))
		__bch_btree_node_write(b, NULL);
	mutex_unlock(&b->write_lock);
}

static void bch_btree_leaf_dirty(struct btree *b, atomic_t *journal_ref)
{
	struct bset *i = btree_bset_last(b);
	struct btree_write *w = btree_current_write(b);

	lockdep_assert_held(&b->write_lock);

	BUG_ON(!b->written);
	BUG_ON(!i->keys);

	if (!btree_node_dirty(b))
		schedule_delayed_work(&b->work, 30 * HZ);

	set_btree_node_dirty(b);

	if (journal_ref) {
		if (w->journal &&
		    journal_pin_cmp(b->c, w->journal, journal_ref)) {
			atomic_dec_bug(w->journal);
			w->journal = NULL;
		}

		if (!w->journal) {
			w->journal = journal_ref;
			atomic_inc(w->journal);
		}
	}

	/* Force write if set is too big */
	if (set_bytes(i) > PAGE_SIZE - 48 &&
	    !current->bio_list)
		bch_btree_node_write(b, NULL);
}
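/*
 * Note on journal_ref above: a dirty leaf keeps a pin (w->journal) on the
 * oldest journal entry that still contains keys destined for it, so that
 * entry can't be reclaimed before the node is written out. The pin is
 * dropped in btree_complete_write() once the write finishes.
 */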
/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

#define mca_reserve(c)	(((c->root && c->root->level)		\
			  ? c->root->level : 1) * 8 + 16)
#define mca_can_free(c)						\
	max_t(int, 0, c->btree_cache_used - mca_reserve(c))

static void mca_data_free(struct btree *b)
{
	BUG_ON(b->io_mutex.count != 1);

	bch_btree_keys_free(&b->keys);

	b->c->btree_cache_used--;
	list_move(&b->list, &b->c->btree_cache_freed);
}

static void mca_bucket_free(struct btree *b)
{
	BUG_ON(btree_node_dirty(b));

	b->key.ptr[0] = 0;
	hlist_del_init_rcu(&b->hash);
	list_move(&b->list, &b->c->btree_cache_freeable);
}

static unsigned btree_order(struct bkey *k)
{
	return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1);
}

static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp)
{
	if (!bch_btree_keys_alloc(&b->keys,
				  max_t(unsigned,
					ilog2(b->c->btree_pages),
					btree_order(k)),
				  gfp)) {
		b->c->btree_cache_used++;
		list_move(&b->list, &b->c->btree_cache);
	} else {
		list_move(&b->list, &b->c->btree_cache_freed);
	}
}

static struct btree *mca_bucket_alloc(struct cache_set *c,
				      struct bkey *k, gfp_t gfp)
{
	struct btree *b = kzalloc(sizeof(struct btree), gfp);
	if (!b)
		return NULL;

	init_rwsem(&b->lock);
	lockdep_set_novalidate_class(&b->lock);
	mutex_init(&b->write_lock);
	lockdep_set_novalidate_class(&b->write_lock);
	INIT_LIST_HEAD(&b->list);
	INIT_DELAYED_WORK(&b->work, btree_node_write_work);
	b->c = c;
	sema_init(&b->io_mutex, 1);

	mca_data_alloc(b, k, gfp);
	return b;
}

static int mca_reap(struct btree *b, unsigned min_order, bool flush)
{
	struct closure cl;

	closure_init_stack(&cl);
	lockdep_assert_held(&b->c->bucket_lock);

	if (!down_write_trylock(&b->lock))
		return -ENOMEM;

	BUG_ON(btree_node_dirty(b) && !b->keys.set[0].data);

	if (b->keys.page_order < min_order)
		goto out_unlock;

	if (!flush) {
		if (btree_node_dirty(b))
			goto out_unlock;

		if (down_trylock(&b->io_mutex))
			goto out_unlock;
		up(&b->io_mutex);
	}

	mutex_lock(&b->write_lock);
	if (btree_node_dirty(b))
		__bch_btree_node_write(b, &cl);
	mutex_unlock(&b->write_lock);

	closure_sync(&cl);

	/* wait for any in flight btree write */
	down(&b->io_mutex);
	up(&b->io_mutex);

	return 0;
out_unlock:
	rw_unlock(true, b);
	return -ENOMEM;
}
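/*
 * mca_reap() tries to take a node away from the cache for reuse: it needs
 * the write lock without blocking, the node's buffer must be at least
 * min_order pages (so a caller reusing it for another node has room), and
 * unless flush is set it refuses dirty nodes and nodes with a write in
 * flight. On success the node is returned write-locked; otherwise -ENOMEM.
 */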
static unsigned long bch_mca_scan(struct shrinker *shrink,
				  struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);
	struct btree *b, *t;
	unsigned long i, nr = sc->nr_to_scan;
	unsigned long freed = 0;

	if (c->shrinker_disabled)
		return SHRINK_STOP;

	if (c->btree_cache_alloc_lock)
		return SHRINK_STOP;

	/* Return -1 if we can't do anything right now */
	if (sc->gfp_mask & __GFP_IO)
		mutex_lock(&c->bucket_lock);
	else if (!mutex_trylock(&c->bucket_lock))
		return -1;

	/*
	 * It's _really_ critical that we don't free too many btree nodes - we
	 * have to always leave ourselves a reserve. The reserve is how we
	 * guarantee that allocating memory for a new btree node can always
	 * succeed, so that inserting keys into the btree can always succeed and
	 * IO can always make forward progress:
	 */
	nr /= c->btree_pages;
	nr = min_t(unsigned long, nr, mca_can_free(c));

	i = 0;
	list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) {
		if (freed >= nr)
			break;

		if (++i > 3 &&
		    !mca_reap(b, 0, false)) {
			mca_data_free(b);
			rw_unlock(true, b);
			freed++;
		}
	}

	for (i = 0; (nr--) && i < c->btree_cache_used; i++) {
		if (list_empty(&c->btree_cache))
			goto out;

		b = list_first_entry(&c->btree_cache, struct btree, list);
		list_rotate_left(&c->btree_cache);

		if (!b->accessed &&
		    !mca_reap(b, 0, false)) {
			mca_bucket_free(b);
			mca_data_free(b);
			rw_unlock(true, b);
			freed++;
		} else
			b->accessed = 0;
	}
out:
	mutex_unlock(&c->bucket_lock);
	return freed;
}

static unsigned long bch_mca_count(struct shrinker *shrink,
				   struct shrink_control *sc)
{
	struct cache_set *c = container_of(shrink, struct cache_set, shrink);

	if (c->shrinker_disabled)
		return 0;

	if (c->btree_cache_alloc_lock)
		return 0;

	return mca_can_free(c) * c->btree_pages;
}

void bch_btree_cache_free(struct cache_set *c)
{
	struct btree *b;
	struct closure cl;
	closure_init_stack(&cl);

	if (c->shrink.list.next)
		unregister_shrinker(&c->shrink);

	mutex_lock(&c->bucket_lock);

#ifdef CONFIG_BCACHE_DEBUG
	if (c->verify_data)
		list_move(&c->verify_data->list, &c->btree_cache);

	free_pages((unsigned long) c->verify_ondisk, ilog2(bucket_pages(c)));
#endif

	list_splice(&c->btree_cache_freeable,
		    &c->btree_cache);

	while (!list_empty(&c->btree_cache)) {
		b = list_first_entry(&c->btree_cache, struct btree, list);

		if (btree_node_dirty(b))
			btree_complete_write(b, btree_current_write(b));
		clear_bit(BTREE_NODE_dirty, &b->flags);

		mca_data_free(b);
	}

	while (!list_empty(&c->btree_cache_freed)) {
		b = list_first_entry(&c->btree_cache_freed,
				     struct btree, list);
		list_del(&b->list);
		cancel_delayed_work_sync(&b->work);
		kfree(b);
	}

	mutex_unlock(&c->bucket_lock);
}
int bch_btree_cache_alloc(struct cache_set *c)
{
	unsigned i;

	for (i = 0; i < mca_reserve(c); i++)
		if (!mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL))
			return -ENOMEM;

	list_splice_init(&c->btree_cache,
			 &c->btree_cache_freeable);

#ifdef CONFIG_BCACHE_DEBUG
	mutex_init(&c->verify_lock);

	c->verify_ondisk = (void *)
		__get_free_pages(GFP_KERNEL, ilog2(bucket_pages(c)));

	c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL);

	if (c->verify_data &&
	    c->verify_data->keys.set->data)
		list_del_init(&c->verify_data->list);
	else
		c->verify_data = NULL;
#endif

	c->shrink.count_objects = bch_mca_count;
	c->shrink.scan_objects = bch_mca_scan;
	c->shrink.seeks = 4;
	c->shrink.batch = c->btree_pages * 2;
	register_shrinker(&c->shrink);

	return 0;
}

/* Btree in memory cache - hash table */

static struct hlist_head *mca_hash(struct cache_set *c, struct bkey *k)
{
	return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)];
}

static struct btree *mca_find(struct cache_set *c, struct bkey *k)
{
	struct btree *b;

	rcu_read_lock();
	hlist_for_each_entry_rcu(b, mca_hash(c, k), hash)
		if (PTR_HASH(c, &b->key) == PTR_HASH(c, k))
			goto out;
	b = NULL;
out:
	rcu_read_unlock();
	return b;
}

static int mca_cannibalize_lock(struct cache_set *c, struct btree_op *op)
{
	struct task_struct *old;

	old = cmpxchg(&c->btree_cache_alloc_lock, NULL, current);
	if (old && old != current) {
		if (op)
			prepare_to_wait(&c->btree_cache_wait, &op->wait,
					TASK_UNINTERRUPTIBLE);
		return -EINTR;
	}

	return 0;
}

static struct btree *mca_cannibalize(struct cache_set *c, struct btree_op *op,
				     struct bkey *k)
{
	struct btree *b;

	trace_bcache_btree_cache_cannibalize(c);

	if (mca_cannibalize_lock(c, op))
		return ERR_PTR(-EINTR);

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), false))
			return b;

	list_for_each_entry_reverse(b, &c->btree_cache, list)
		if (!mca_reap(b, btree_order(k), true))
			return b;

	WARN(1, "btree cache cannibalize failed\n");
	return ERR_PTR(-ENOMEM);
}

/*
 * We can only have one thread cannibalizing other cached btree nodes at a time,
 * or we'll deadlock. We use an open coded mutex to ensure that, which a
 * cannibalize_bucket() will take. This means every time we unlock the root of
 * the btree, we need to release this lock if we have it held.
 */
static void bch_cannibalize_unlock(struct cache_set *c)
{
	if (c->btree_cache_alloc_lock == current) {
		c->btree_cache_alloc_lock = NULL;
		wake_up(&c->btree_cache_wait);
	}
}
static struct btree *mca_alloc(struct cache_set *c, struct btree_op *op,
			       struct bkey *k, int level)
{
	struct btree *b;

	BUG_ON(current->bio_list);

	lockdep_assert_held(&c->bucket_lock);

	if (mca_find(c, k))
		return NULL;

	/* btree_free() doesn't free memory; it sticks the node on the end of
	 * the list. Check if there's any freed nodes there:
	 */
	list_for_each_entry(b, &c->btree_cache_freeable, list)
		if (!mca_reap(b, btree_order(k), false))
			goto out;

	/* We never free struct btree itself, just the memory that holds the on
	 * disk node. Check the freed list before allocating a new one:
	 */
	list_for_each_entry(b, &c->btree_cache_freed, list)
		if (!mca_reap(b, 0, false)) {
			mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO);
			if (!b->keys.set[0].data)
				goto err;
			else
				goto out;
		}

	b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO);
	if (!b)
		goto err;

	BUG_ON(!down_write_trylock(&b->lock));
	if (!b->keys.set->data)
		goto err;
out:
	BUG_ON(b->io_mutex.count != 1);

	bkey_copy(&b->key, k);
	list_move(&b->list, &c->btree_cache);
	hlist_del_init_rcu(&b->hash);
	hlist_add_head_rcu(&b->hash, mca_hash(c, k));

	lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_);
	b->parent	= (void *) ~0UL;
	b->flags	= 0;
	b->written	= 0;
	b->level	= level;

	if (!b->level)
		bch_btree_keys_init(&b->keys, &bch_extent_keys_ops,
				    &b->c->expensive_debug_checks);
	else
		bch_btree_keys_init(&b->keys, &bch_btree_keys_ops,
				    &b->c->expensive_debug_checks);

	return b;
err:
	if (b)
		rw_unlock(true, b);

	b = mca_cannibalize(c, op, k);
	if (!IS_ERR(b))
		goto out;

	return b;
}

/**
 * bch_btree_node_get - find a btree node in the cache and lock it, reading it
 * in from disk if necessary.
 *
 * If IO is necessary and running under generic_make_request, returns -EAGAIN.
 *
 * The btree node will have either a read or a write lock held, depending on
 * level and op->lock.
 */
struct btree *bch_btree_node_get(struct cache_set *c, struct btree_op *op,
				 struct bkey *k, int level, bool write)
{
	int i = 0;
	struct btree *b;

	BUG_ON(level < 0);
retry:
	b = mca_find(c, k);

	if (!b) {
		if (current->bio_list)
			return ERR_PTR(-EAGAIN);

		mutex_lock(&c->bucket_lock);
		b = mca_alloc(c, op, k, level);
		mutex_unlock(&c->bucket_lock);

		if (!b)
			goto retry;
		if (IS_ERR(b))
			return b;

		bch_btree_node_read(b);

		if (!write)
			downgrade_write(&b->lock);
	} else {
		rw_lock(write, b, level);
		if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) {
			rw_unlock(write, b);
			goto retry;
		}
		BUG_ON(b->level != level);
	}

	b->accessed = 1;

	for (; i <= b->keys.nsets && b->keys.set[i].size; i++) {
		prefetch(b->keys.set[i].tree);
		prefetch(b->keys.set[i].data);
	}

	for (; i <= b->keys.nsets; i++)
		prefetch(b->keys.set[i].data);

	if (btree_node_io_error(b)) {
		rw_unlock(write, b);
		return ERR_PTR(-EIO);
	}

	BUG_ON(!b->written);

	return b;
}
static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level)
{
	struct btree *b;

	mutex_lock(&c->bucket_lock);
	b = mca_alloc(c, NULL, k, level);
	mutex_unlock(&c->bucket_lock);

	if (!IS_ERR_OR_NULL(b)) {
		bch_btree_node_read(b);
		rw_unlock(true, b);
	}
}

/* Btree alloc */

static void btree_node_free(struct btree *b)
{
	trace_bcache_btree_node_free(b);

	BUG_ON(b == b->c->root);

	mutex_lock(&b->write_lock);

	if (btree_node_dirty(b))
		btree_complete_write(b, btree_current_write(b));
	clear_bit(BTREE_NODE_dirty, &b->flags);

	mutex_unlock(&b->write_lock);

	cancel_delayed_work(&b->work);

	mutex_lock(&b->c->bucket_lock);
	bch_bucket_free(b->c, &b->key);
	mca_bucket_free(b);
	mutex_unlock(&b->c->bucket_lock);
}

struct btree *bch_btree_node_alloc(struct cache_set *c, struct btree_op *op,
				   int level)
{
	BKEY_PADDED(key) k;
	struct btree *b = ERR_PTR(-EAGAIN);

	mutex_lock(&c->bucket_lock);
retry:
	if (__bch_bucket_alloc_set(c, RESERVE_BTREE, &k.key, 1, op != NULL))
		goto err;

	bkey_put(c, &k.key);
	SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS);

	b = mca_alloc(c, op, &k.key, level);
	if (IS_ERR(b))
		goto err_free;

	if (!b) {
		cache_bug(c,
			"Tried to allocate bucket that was in btree cache");
		goto retry;
	}

	b->accessed = 1;
	bch_bset_init_next(&b->keys, b->keys.set->data, bset_magic(&b->c->sb));

	mutex_unlock(&c->bucket_lock);

	trace_bcache_btree_node_alloc(b);
	return b;
err_free:
	bch_bucket_free(c, &k.key);
err:
	mutex_unlock(&c->bucket_lock);

	trace_bcache_btree_node_alloc_fail(b);
	return b;
}

static struct btree *btree_node_alloc_replacement(struct btree *b,
						  struct btree_op *op)
{
	struct btree *n = bch_btree_node_alloc(b->c, op, b->level);
	if (!IS_ERR_OR_NULL(n)) {
		mutex_lock(&n->write_lock);
		bch_btree_sort_into(&b->keys, &n->keys, &b->c->sort);
		bkey_copy_key(&n->key, &b->key);
		mutex_unlock(&n->write_lock);
	}

	return n;
}

static void make_btree_freeing_key(struct btree *b, struct bkey *k)
{
	unsigned i;

	mutex_lock(&b->c->bucket_lock);

	atomic_inc(&b->c->prio_blocked);

	bkey_copy(k, &b->key);
	bkey_copy_key(k, &ZERO_KEY);

	for (i = 0; i < KEY_PTRS(k); i++)
		SET_PTR_GEN(k, i,
			    bch_inc_gen(PTR_CACHE(b->c, &b->key, i),
					PTR_BUCKET(b->c, &b->key, i)));

	mutex_unlock(&b->c->bucket_lock);
}

static int btree_check_reserve(struct btree *b, struct btree_op *op)
{
	struct cache_set *c = b->c;
	struct cache *ca;
	unsigned i, reserve = (c->root->level - b->level) * 2 + 1;

	mutex_lock(&c->bucket_lock);

	for_each_cache(ca, c, i)
		if (fifo_used(&ca->free[RESERVE_BTREE]) < reserve) {
			if (op)
				prepare_to_wait(&c->btree_cache_wait, &op->wait,
						TASK_UNINTERRUPTIBLE);
			mutex_unlock(&c->bucket_lock);
			return -EINTR;
		}

	mutex_unlock(&c->bucket_lock);

	return mca_cannibalize_lock(b->c, op);
}
/* Garbage collection */

static uint8_t __bch_btree_mark_key(struct cache_set *c, int level,
				    struct bkey *k)
{
	uint8_t stale = 0;
	unsigned i;
	struct bucket *g;

	/*
	 * ptr_invalid() can't return true for the keys that mark btree nodes as
	 * freed, but since ptr_bad() returns true we'll never actually use them
	 * for anything and thus we don't want to mark their pointers here
	 */
	if (!bkey_cmp(k, &ZERO_KEY))
		return stale;

	for (i = 0; i < KEY_PTRS(k); i++) {
		if (!ptr_available(c, k, i))
			continue;

		g = PTR_BUCKET(c, k, i);

		if (gen_after(g->last_gc, PTR_GEN(k, i)))
			g->last_gc = PTR_GEN(k, i);

		if (ptr_stale(c, k, i)) {
			stale = max(stale, ptr_stale(c, k, i));
			continue;
		}

		cache_bug_on(GC_MARK(g) &&
			     (GC_MARK(g) == GC_MARK_METADATA) != (level != 0),
			     c, "inconsistent ptrs: mark = %llu, level = %i",
			     GC_MARK(g), level);

		if (level)
			SET_GC_MARK(g, GC_MARK_METADATA);
		else if (KEY_DIRTY(k))
			SET_GC_MARK(g, GC_MARK_DIRTY);
		else if (!GC_MARK(g))
			SET_GC_MARK(g, GC_MARK_RECLAIMABLE);

		/* guard against overflow */
		SET_GC_SECTORS_USED(g, min_t(unsigned,
					     GC_SECTORS_USED(g) + KEY_SIZE(k),
					     MAX_GC_SECTORS_USED));

		BUG_ON(!GC_SECTORS_USED(g));
	}

	return stale;
}

#define btree_mark_key(b, k)	__bch_btree_mark_key(b->c, b->level, k)

void bch_initial_mark_key(struct cache_set *c, int level, struct bkey *k)
{
	unsigned i;

	for (i = 0; i < KEY_PTRS(k); i++)
		if (ptr_available(c, k, i) &&
		    !ptr_stale(c, k, i)) {
			struct bucket *b = PTR_BUCKET(c, k, i);

			b->gen = PTR_GEN(k, i);

			if (level && bkey_cmp(k, &ZERO_KEY))
				b->prio = BTREE_PRIO;
			else if (!level && b->prio == BTREE_PRIO)
				b->prio = INITIAL_PRIO;
		}

	__bch_btree_mark_key(c, level, k);
}
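/*
 * The GC marks set above:
 * GC_MARK_RECLAIMABLE - bucket holds only clean cached data, may be
 *			 invalidated;
 * GC_MARK_DIRTY       - bucket holds dirty data still to be written back;
 * GC_MARK_METADATA    - bucket holds btree nodes or other metadata, never
 *			 reclaimed by gc.
 * A mark of 0 means gc found no live pointers into the bucket at all.
 */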
static bool btree_gc_mark_node(struct btree *b, struct gc_stat *gc)
{
	uint8_t stale = 0;
	unsigned keys = 0, good_keys = 0;
	struct bkey *k;
	struct btree_iter iter;
	struct bset_tree *t;

	gc->nodes++;

	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid) {
		stale = max(stale, btree_mark_key(b, k));
		keys++;

		if (bch_ptr_bad(&b->keys, k))
			continue;

		gc->key_bytes += bkey_u64s(k);
		gc->nkeys++;
		good_keys++;

		gc->data += KEY_SIZE(k);
	}

	for (t = b->keys.set; t <= &b->keys.set[b->keys.nsets]; t++)
		btree_bug_on(t->size &&
			     bset_written(&b->keys, t) &&
			     bkey_cmp(&b->key, &t->end) < 0,
			     b, "found short btree key in gc");

	if (b->c->gc_always_rewrite)
		return true;

	if (stale > 10)
		return true;

	if ((keys - good_keys) * 2 > keys)
		return true;

	return false;
}

#define GC_MERGE_NODES	4U

struct gc_merge_info {
	struct btree	*b;
	unsigned	keys;
};

static int bch_btree_insert_node(struct btree *, struct btree_op *,
				 struct keylist *, atomic_t *, struct bkey *);

static int btree_gc_coalesce(struct btree *b, struct btree_op *op,
			     struct gc_stat *gc, struct gc_merge_info *r)
{
	unsigned i, nodes = 0, keys = 0, blocks;
	struct btree *new_nodes[GC_MERGE_NODES];
	struct keylist keylist;
	struct closure cl;
	struct bkey *k;

	bch_keylist_init(&keylist);

	if (btree_check_reserve(b, NULL))
		return 0;

	memset(new_nodes, 0, sizeof(new_nodes));
	closure_init_stack(&cl);

	while (nodes < GC_MERGE_NODES && !IS_ERR_OR_NULL(r[nodes].b))
		keys += r[nodes++].keys;

	blocks = btree_default_blocks(b->c) * 2 / 3;

	if (nodes < 2 ||
	    __set_blocks(b->keys.set[0].data, keys,
			 block_bytes(b->c)) > blocks * (nodes - 1))
		return 0;

	for (i = 0; i < nodes; i++) {
		new_nodes[i] = btree_node_alloc_replacement(r[i].b, NULL);
		if (IS_ERR_OR_NULL(new_nodes[i]))
			goto out_nocoalesce;
	}

	/*
	 * We have to check the reserve here, after we've allocated our new
	 * nodes, to make sure the insert below will succeed - we also check
	 * before as an optimization to potentially avoid a bunch of expensive
	 * allocs/sorts
	 */
	if (btree_check_reserve(b, NULL))
		goto out_nocoalesce;

	for (i = 0; i < nodes; i++)
		mutex_lock(&new_nodes[i]->write_lock);

	for (i = nodes - 1; i > 0; --i) {
		struct bset *n1 = btree_bset_first(new_nodes[i]);
		struct bset *n2 = btree_bset_first(new_nodes[i - 1]);
		struct bkey *k, *last = NULL;

		keys = 0;

		if (i > 1) {
			for (k = n2->start;
			     k < bset_bkey_last(n2);
			     k = bkey_next(k)) {
				if (__set_blocks(n1, n1->keys + keys +
						 bkey_u64s(k),
						 block_bytes(b->c)) > blocks)
					break;

				last = k;
				keys += bkey_u64s(k);
			}
		} else {
			/*
			 * Last node we're not getting rid of - we're getting
			 * rid of the node at r[0]. Have to try and fit all of
			 * the remaining keys into this node; we can't ensure
			 * they will always fit due to rounding and variable
			 * length keys (shouldn't be possible in practice,
			 * though)
			 */
			if (__set_blocks(n1, n1->keys + n2->keys,
					 block_bytes(b->c)) >
			    btree_blocks(new_nodes[i]))
				goto out_nocoalesce;

			keys = n2->keys;
			/* Take the key of the node we're getting rid of */
			last = &r->b->key;
		}

		BUG_ON(__set_blocks(n1, n1->keys + keys, block_bytes(b->c)) >
		       btree_blocks(new_nodes[i]));

		if (last)
			bkey_copy_key(&new_nodes[i]->key, last);

		memcpy(bset_bkey_last(n1),
		       n2->start,
		       (void *) bset_bkey_idx(n2, keys) - (void *) n2->start);

		n1->keys += keys;
		r[i].keys = n1->keys;

		memmove(n2->start,
			bset_bkey_idx(n2, keys),
			(void *) bset_bkey_last(n2) -
			(void *) bset_bkey_idx(n2, keys));

		n2->keys -= keys;

		if (__bch_keylist_realloc(&keylist,
					  bkey_u64s(&new_nodes[i]->key)))
			goto out_nocoalesce;

		bch_btree_node_write(new_nodes[i], &cl);
		bch_keylist_add(&keylist, &new_nodes[i]->key);
	}

	for (i = 0; i < nodes; i++)
		mutex_unlock(&new_nodes[i]->write_lock);

	closure_sync(&cl);

	/* We emptied out this node */
	BUG_ON(btree_bset_first(new_nodes[0])->keys);
	btree_node_free(new_nodes[0]);
	rw_unlock(true, new_nodes[0]);

	for (i = 0; i < nodes; i++) {
		if (__bch_keylist_realloc(&keylist, bkey_u64s(&r[i].b->key)))
			goto out_nocoalesce;

		make_btree_freeing_key(r[i].b, keylist.top);
		bch_keylist_push(&keylist);
	}

	bch_btree_insert_node(b, op, &keylist, NULL, NULL);
	BUG_ON(!bch_keylist_empty(&keylist));

	for (i = 0; i < nodes; i++) {
		btree_node_free(r[i].b);
		rw_unlock(true, r[i].b);

		r[i].b = new_nodes[i];
	}

	memmove(r, r + 1, sizeof(r[0]) * (nodes - 1));
	r[nodes - 1].b = ERR_PTR(-EINTR);

	trace_bcache_btree_gc_coalesce(nodes);
	gc->nodes--;

	bch_keylist_free(&keylist);

	/* Invalidated our iterator */
	return -EINTR;

out_nocoalesce:
	closure_sync(&cl);

	while ((k = bch_keylist_pop(&keylist)))
		if (!bkey_cmp(k, &ZERO_KEY))
			atomic_dec(&b->c->prio_blocked);

	/* don't free the keylist until we're done popping keys off of it */
	bch_keylist_free(&keylist);

	for (i = 0; i < nodes; i++)
		if (!IS_ERR_OR_NULL(new_nodes[i])) {
			btree_node_free(new_nodes[i]);
			rw_unlock(true, new_nodes[i]);
		}
	return 0;
}
static int btree_gc_rewrite_node(struct btree *b, struct btree_op *op,
				 struct btree *replace)
{
	struct keylist keys;
	struct btree *n;

	if (btree_check_reserve(b, NULL))
		return 0;

	n = btree_node_alloc_replacement(replace, NULL);

	/* recheck reserve after allocating replacement node */
	if (btree_check_reserve(b, NULL)) {
		btree_node_free(n);
		rw_unlock(true, n);
		return 0;
	}

	bch_btree_node_write_sync(n);

	bch_keylist_init(&keys);
	bch_keylist_add(&keys, &n->key);

	make_btree_freeing_key(replace, keys.top);
	bch_keylist_push(&keys);

	bch_btree_insert_node(b, op, &keys, NULL, NULL);
	BUG_ON(!bch_keylist_empty(&keys));

	btree_node_free(replace);
	rw_unlock(true, n);

	/* Invalidated our iterator */
	return -EINTR;
}

static unsigned btree_gc_count_keys(struct btree *b)
{
	struct bkey *k;
	struct btree_iter iter;
	unsigned ret = 0;

	for_each_key_filter(&b->keys, k, &iter, bch_ptr_bad)
		ret += bkey_u64s(k);

	return ret;
}

static int btree_gc_recurse(struct btree *b, struct btree_op *op,
			    struct closure *writes, struct gc_stat *gc)
{
	int ret = 0;
	bool should_rewrite;
	struct bkey *k;
	struct btree_iter iter;
	struct gc_merge_info r[GC_MERGE_NODES];
	struct gc_merge_info *i, *last = r + ARRAY_SIZE(r) - 1;

	bch_btree_iter_init(&b->keys, &iter, &b->c->gc_done);

	for (i = r; i < r + ARRAY_SIZE(r); i++)
		i->b = ERR_PTR(-EINTR);

	while (1) {
		k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad);
		if (k) {
			r->b = bch_btree_node_get(b->c, op, k, b->level - 1,
						  true);
			if (IS_ERR(r->b)) {
				ret = PTR_ERR(r->b);
				break;
			}

			r->keys = btree_gc_count_keys(r->b);

			ret = btree_gc_coalesce(b, op, gc, r);
			if (ret)
				break;
		}

		if (!last->b)
			break;

		if (!IS_ERR(last->b)) {
			should_rewrite = btree_gc_mark_node(last->b, gc);
			if (should_rewrite) {
				ret = btree_gc_rewrite_node(b, op, last->b);
				if (ret)
					break;
			}

			if (last->b->level) {
				ret = btree_gc_recurse(last->b, op, writes, gc);
				if (ret)
					break;
			}

			bkey_copy_key(&b->c->gc_done, &last->b->key);

			/*
			 * Must flush leaf nodes before gc ends, since replace
			 * operations aren't journalled
			 */
			mutex_lock(&last->b->write_lock);
			if (btree_node_dirty(last->b))
				bch_btree_node_write(last->b, writes);
			mutex_unlock(&last->b->write_lock);
			rw_unlock(true, last->b);
		}

		memmove(r + 1, r, sizeof(r[0]) * (GC_MERGE_NODES - 1));
		r->b = NULL;

		if (need_resched()) {
			ret = -EAGAIN;
			break;
		}
	}

	for (i = r; i < r + ARRAY_SIZE(r); i++)
		if (!IS_ERR_OR_NULL(i->b)) {
			mutex_lock(&i->b->write_lock);
			if (btree_node_dirty(i->b))
				bch_btree_node_write(i->b, writes);
			mutex_unlock(&i->b->write_lock);
			rw_unlock(true, i->b);
		}

	return ret;
}

static int bch_btree_gc_root(struct btree *b, struct btree_op *op,
			     struct closure *writes, struct gc_stat *gc)
{
	struct btree *n = NULL;
	int ret = 0;
	bool should_rewrite;

	should_rewrite = btree_gc_mark_node(b, gc);
	if (should_rewrite) {
		n = btree_node_alloc_replacement(b, NULL);

		if (!IS_ERR_OR_NULL(n)) {
			bch_btree_node_write_sync(n);

			bch_btree_set_root(n);
			btree_node_free(b);
			rw_unlock(true, n);

			return -EINTR;
		}
	}

	__bch_btree_mark_key(b->c, b->level + 1, &b->key);

	if (b->level) {
		ret = btree_gc_recurse(b, op, writes, gc);
		if (ret)
			return ret;
	}

	bkey_copy_key(&b->c->gc_done, &b->key);

	return ret;
}

static void btree_gc_start(struct cache_set *c)
{
	struct cache *ca;
	struct bucket *b;
	unsigned i;

	if (!c->gc_mark_valid)
		return;

	mutex_lock(&c->bucket_lock);

	c->gc_mark_valid = 0;
	c->gc_done = ZERO_KEY;

	for_each_cache(ca, c, i)
		for_each_bucket(b, ca) {
			b->last_gc = b->gen;
			if (!atomic_read(&b->pin)) {
				SET_GC_MARK(b, 0);
				SET_GC_SECTORS_USED(b, 0);
			}
		}

	mutex_unlock(&c->bucket_lock);
}

static size_t bch_btree_gc_finish(struct cache_set *c)
{
	size_t available = 0;
	struct bucket *b;
	struct cache *ca;
	unsigned i;

	mutex_lock(&c->bucket_lock);

	set_gc_sectors(c);
	c->gc_mark_valid = 1;
	c->need_gc	= 0;

	for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++)
		SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i),
			    GC_MARK_METADATA);

	/* don't reclaim buckets to which writeback keys point */
	rcu_read_lock();
	for (i = 0; i < c->nr_uuids; i++) {
		struct bcache_device *d = c->devices[i];
		struct cached_dev *dc;
		struct keybuf_key *w, *n;
		unsigned j;

		if (!d || UUID_FLASH_ONLY(&c->uuids[i]))
			continue;
		dc = container_of(d, struct cached_dev, disk);

		spin_lock(&dc->writeback_keys.lock);
		rbtree_postorder_for_each_entry_safe(w, n,
					&dc->writeback_keys.keys, node)
			for (j = 0; j < KEY_PTRS(&w->key); j++)
				SET_GC_MARK(PTR_BUCKET(c, &w->key, j),
					    GC_MARK_DIRTY);
		spin_unlock(&dc->writeback_keys.lock);
	}
	rcu_read_unlock();

	for_each_cache(ca, c, i) {
		uint64_t *i;

		ca->invalidate_needs_gc = 0;

		for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for (i = ca->prio_buckets;
		     i < ca->prio_buckets + prio_buckets(ca) * 2; i++)
			SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA);

		for_each_bucket(b, ca) {
			c->need_gc	= max(c->need_gc, bucket_gc_gen(b));

			if (atomic_read(&b->pin))
				continue;

			BUG_ON(!GC_MARK(b) && GC_SECTORS_USED(b));

			if (!GC_MARK(b) || GC_MARK(b) == GC_MARK_RECLAIMABLE)
				available++;
		}
	}

	mutex_unlock(&c->bucket_lock);
	return available;
}

static void bch_btree_gc(struct cache_set *c)
{
	int ret;
	unsigned long available;
	struct gc_stat stats;
	struct closure writes;
	struct btree_op op;
	uint64_t start_time = local_clock();

	trace_bcache_gc_start(c);

	memset(&stats, 0, sizeof(struct gc_stat));
	closure_init_stack(&writes);
	bch_btree_op_init(&op, SHRT_MAX);

	btree_gc_start(c);

	do {
		ret = btree_root(gc_root, c, &op, &writes, &stats);
		closure_sync(&writes);

		if (ret && ret != -EAGAIN)
			pr_warn("gc failed!");
	} while (ret);

	available = bch_btree_gc_finish(c);
	wake_up_allocators(c);

	bch_time_stats_update(&c->btree_gc_time, start_time);

	stats.key_bytes *= sizeof(uint64_t);
	stats.data	<<= 9;
	stats.in_use	= (c->nbuckets - available) * 100 / c->nbuckets;
	memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat));

	trace_bcache_gc_end(c);

	bch_moving_gc(c);
}
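/*
 * The retry loop in bch_btree_gc() above: the root pass typically returns
 * -EAGAIN when gc yielded because of need_resched() (it resumes from
 * c->gc_done) and -EINTR when a rewrite or coalesce invalidated the
 * iterator. Nodes flushed during the pass are parented to the "writes"
 * closure, which is synced before retrying.
 */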
static int bch_gc_thread(void *arg)
{
	struct cache_set *c = arg;
	struct cache *ca;
	unsigned i;

	while (1) {
again:
		bch_btree_gc(c);

		set_current_state(TASK_INTERRUPTIBLE);
		if (kthread_should_stop())
			break;

		mutex_lock(&c->bucket_lock);

		for_each_cache(ca, c, i)
			if (ca->invalidate_needs_gc) {
				mutex_unlock(&c->bucket_lock);
				set_current_state(TASK_RUNNING);
				goto again;
			}

		mutex_unlock(&c->bucket_lock);

		try_to_freeze();
		schedule();
	}

	return 0;
}

int bch_gc_thread_start(struct cache_set *c)
{
	c->gc_thread = kthread_create(bch_gc_thread, c, "bcache_gc");
	if (IS_ERR(c->gc_thread))
		return PTR_ERR(c->gc_thread);

	set_task_state(c->gc_thread, TASK_INTERRUPTIBLE);
	return 0;
}

/* Initial partial gc */

static int bch_btree_check_recurse(struct btree *b, struct btree_op *op)
{
	int ret = 0;
	struct bkey *k, *p = NULL;
	struct btree_iter iter;

	for_each_key_filter(&b->keys, k, &iter, bch_ptr_invalid)
		bch_initial_mark_key(b->c, b->level, k);

	bch_initial_mark_key(b->c, b->level + 1, &b->key);

	if (b->level) {
		bch_btree_iter_init(&b->keys, &iter, NULL);

		do {
			k = bch_btree_iter_next_filter(&iter, &b->keys,
						       bch_ptr_bad);
			if (k)
				btree_node_prefetch(b->c, k, b->level - 1);

			if (p)
				ret = btree(check_recurse, p, b, op);

			p = k;
		} while (p && !ret);
	}

	return ret;
}

int bch_btree_check(struct cache_set *c)
{
	struct btree_op op;

	bch_btree_op_init(&op, SHRT_MAX);

	return btree_root(check_recurse, c, &op);
}

void bch_initial_gc_finish(struct cache_set *c)
{
	struct cache *ca;
	struct bucket *b;
	unsigned i;

	bch_btree_gc_finish(c);

	mutex_lock(&c->bucket_lock);

	/*
	 * We need to put some unused buckets directly on the prio freelist in
	 * order to get the allocator thread started - it needs freed buckets in
	 * order to rewrite the prios and gens, and it needs to rewrite prios
	 * and gens in order to free buckets.
	 *
	 * This is only safe for buckets that have no live data in them, which
	 * there should always be some of.
	 */
	for_each_cache(ca, c, i) {
		for_each_bucket(b, ca) {
			if (fifo_full(&ca->free[RESERVE_PRIO]))
				break;

			if (bch_can_invalidate_bucket(ca, b) &&
			    !GC_MARK(b)) {
				__bch_invalidate_one_bucket(ca, b);
				fifo_push(&ca->free[RESERVE_PRIO],
					  b - ca->buckets);
			}
		}
	}

	mutex_unlock(&c->bucket_lock);
}

/* Btree insertion */

static bool btree_insert_key(struct btree *b, struct bkey *k,
			     struct bkey *replace_key)
{
	unsigned status;

	BUG_ON(bkey_cmp(k, &b->key) > 0);

	status = bch_btree_insert_key(&b->keys, k, replace_key);
	if (status != BTREE_INSERT_STATUS_NO_INSERT) {
		bch_check_keys(&b->keys, "%u for %s", status,
			       replace_key ? "replace" : "insert");

		trace_bcache_btree_insert_key(b, k, replace_key != NULL,
					      status);
		return true;
	} else
		return false;
}

static size_t insert_u64s_remaining(struct btree *b)
{
	long ret = bch_btree_keys_u64s_remaining(&b->keys);

	/*
	 * Might land in the middle of an existing extent and have to split it
	 */
	if (b->keys.ops->is_extents)
		ret -= KEY_MAX_U64S;

	return max(ret, 0L);
}

static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op,
				  struct keylist *insert_keys,
				  struct bkey *replace_key)
{
	bool ret = false;
	int oldsize = bch_count_data(&b->keys);

	while (!bch_keylist_empty(insert_keys)) {
		struct bkey *k = insert_keys->keys;

		if (bkey_u64s(k) > insert_u64s_remaining(b))
			break;

		if (bkey_cmp(k, &b->key) <= 0) {
			if (!b->level)
				bkey_put(b->c, k);

			ret |= btree_insert_key(b, k, replace_key);
			bch_keylist_pop_front(insert_keys);
		} else if (bkey_cmp(&START_KEY(k), &b->key) < 0) {
			BKEY_PADDED(key) temp;
			bkey_copy(&temp.key, insert_keys->keys);

			bch_cut_back(&b->key, &temp.key);
			bch_cut_front(&b->key, insert_keys->keys);

			ret |= btree_insert_key(b, &temp.key, replace_key);
			break;
		} else {
			break;
		}
	}

	if (!ret)
		op->insert_collision = true;

	BUG_ON(!bch_keylist_empty(insert_keys) && b->level);

	BUG_ON(bch_count_data(&b->keys) < oldsize);
	return ret;
}
static int btree_split(struct btree *b, struct btree_op *op,
		       struct keylist *insert_keys,
		       struct bkey *replace_key)
{
	bool split;
	struct btree *n1, *n2 = NULL, *n3 = NULL;
	uint64_t start_time = local_clock();
	struct closure cl;
	struct keylist parent_keys;

	closure_init_stack(&cl);
	bch_keylist_init(&parent_keys);

	if (btree_check_reserve(b, op)) {
		if (!b->level)
			return -EINTR;
		else
			WARN(1, "insufficient reserve for split\n");
	}

	n1 = btree_node_alloc_replacement(b, op);
	if (IS_ERR(n1))
		goto err;

	split = set_blocks(btree_bset_first(n1),
			   block_bytes(n1->c)) > (btree_blocks(b) * 4) / 5;

	if (split) {
		unsigned keys = 0;

		trace_bcache_btree_node_split(b, btree_bset_first(n1)->keys);

		n2 = bch_btree_node_alloc(b->c, op, b->level);
		if (IS_ERR(n2))
			goto err_free1;

		if (!b->parent) {
			n3 = bch_btree_node_alloc(b->c, op, b->level + 1);
			if (IS_ERR(n3))
				goto err_free2;
		}

		mutex_lock(&n1->write_lock);
		mutex_lock(&n2->write_lock);

		bch_btree_insert_keys(n1, op, insert_keys, replace_key);

		/*
		 * Has to be a linear search because we don't have an auxiliary
		 * search tree yet
		 */

		while (keys < (btree_bset_first(n1)->keys * 3) / 5)
			keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1),
							keys));

		bkey_copy_key(&n1->key,
			      bset_bkey_idx(btree_bset_first(n1), keys));
		keys += bkey_u64s(bset_bkey_idx(btree_bset_first(n1), keys));

		btree_bset_first(n2)->keys = btree_bset_first(n1)->keys - keys;
		btree_bset_first(n1)->keys = keys;

		memcpy(btree_bset_first(n2)->start,
		       bset_bkey_last(btree_bset_first(n1)),
		       btree_bset_first(n2)->keys * sizeof(uint64_t));

		bkey_copy_key(&n2->key, &b->key);

		bch_keylist_add(&parent_keys, &n2->key);
		bch_btree_node_write(n2, &cl);
		mutex_unlock(&n2->write_lock);
		rw_unlock(true, n2);
	} else {
		trace_bcache_btree_node_compact(b, btree_bset_first(n1)->keys);

		mutex_lock(&n1->write_lock);
		bch_btree_insert_keys(n1, op, insert_keys, replace_key);
	}

	bch_keylist_add(&parent_keys, &n1->key);
	bch_btree_node_write(n1, &cl);
	mutex_unlock(&n1->write_lock);

	if (n3) {
		/* Depth increases, make a new root */
		mutex_lock(&n3->write_lock);
		bkey_copy_key(&n3->key, &MAX_KEY);
		bch_btree_insert_keys(n3, op, &parent_keys, NULL);
		bch_btree_node_write(n3, &cl);
		mutex_unlock(&n3->write_lock);

		closure_sync(&cl);
		bch_btree_set_root(n3);
		rw_unlock(true, n3);
	} else if (!b->parent) {
		/* Root filled up but didn't need to be split */
		closure_sync(&cl);
		bch_btree_set_root(n1);
	} else {
		/* Split a non root node */
		closure_sync(&cl);
		make_btree_freeing_key(b, parent_keys.top);
		bch_keylist_push(&parent_keys);

		bch_btree_insert_node(b->parent, op, &parent_keys, NULL, NULL);
		BUG_ON(!bch_keylist_empty(&parent_keys));
	}

	btree_node_free(b);
	rw_unlock(true, n1);

	bch_time_stats_update(&b->c->btree_split_time, start_time);

	return 0;
err_free2:
	bkey_put(b->c, &n2->key);
	btree_node_free(n2);
	rw_unlock(true, n2);
err_free1:
	bkey_put(b->c, &n1->key);
	btree_node_free(n1);
	rw_unlock(true, n1);
err:
	WARN(1, "bcache: btree split failed (level %u)", b->level);

	if (n3 == ERR_PTR(-EAGAIN) ||
	    n2 == ERR_PTR(-EAGAIN) ||
	    n1 == ERR_PTR(-EAGAIN))
		return -EAGAIN;

	return -ENOMEM;
}

static int bch_btree_insert_node(struct btree *b, struct btree_op *op,
				 struct keylist *insert_keys,
				 atomic_t *journal_ref,
				 struct bkey *replace_key)
{
	struct closure cl;

	BUG_ON(b->level && replace_key);

	closure_init_stack(&cl);

	mutex_lock(&b->write_lock);

	if (write_block(b) != btree_bset_last(b) &&
	    b->keys.last_set_unwritten)
		bch_btree_init_next(b); /* just wrote a set */

	if (bch_keylist_nkeys(insert_keys) > insert_u64s_remaining(b)) {
		mutex_unlock(&b->write_lock);
		goto split;
	}

	BUG_ON(write_block(b) != btree_bset_last(b));

	if (bch_btree_insert_keys(b, op, insert_keys, replace_key)) {
		if (!b->level)
			bch_btree_leaf_dirty(b, journal_ref);
		else
			bch_btree_node_write(b, &cl);
	}

	mutex_unlock(&b->write_lock);

	/* wait for btree node write if necessary, after unlock */
	closure_sync(&cl);

	return 0;
split:
	if (current->bio_list) {
		op->lock = b->c->root->level + 1;
		return -EAGAIN;
	} else if (op->lock <= b->c->root->level) {
		op->lock = b->c->root->level + 1;
		return -EINTR;
	} else {
		/* Invalidated all iterators */
		int ret = btree_split(b, op, insert_keys, replace_key);

		if (bch_keylist_empty(insert_keys))
			return 0;
		else if (!ret)
			return -EINTR;
		return ret;
	}
}
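/*
 * Two details of the insert path above, spelled out:
 *
 * - In bch_btree_insert_keys(), a key that straddles the end of a node is
 *   cut in two: with a leaf covering sectors up to 1024 and an extent for
 *   1000-1100, the 1000-1024 piece (temp.key, via bch_cut_back()) is
 *   inserted here and the 1024-1100 remainder (via bch_cut_front()) is left
 *   on insert_keys for the next leaf. (Numbers made up for illustration.)
 *
 * - When bch_btree_insert_node() must split but can't: -EAGAIN means we're
 *   running under generic_make_request() and can't block on a split here;
 *   -EINTR means op->lock wasn't deep enough, and btree_root() will retry
 *   the traversal with the bigger lock.
 */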
int bch_btree_insert_check_key(struct btree *b, struct btree_op *op,
			       struct bkey *check_key)
{
	int ret = -EINTR;
	uint64_t btree_ptr = b->key.ptr[0];
	unsigned long seq = b->seq;
	struct keylist insert;
	bool upgrade = op->lock == -1;

	bch_keylist_init(&insert);

	if (upgrade) {
		rw_unlock(false, b);
		rw_lock(true, b, b->level);

		if (b->key.ptr[0] != btree_ptr ||
		    b->seq != seq + 1)
			goto out;
	}

	SET_KEY_PTRS(check_key, 1);
	get_random_bytes(&check_key->ptr[0], sizeof(uint64_t));

	SET_PTR_DEV(check_key, 0, PTR_CHECK_DEV);

	bch_keylist_add(&insert, check_key);

	ret = bch_btree_insert_node(b, op, &insert, NULL, NULL);

	BUG_ON(!ret && !bch_keylist_empty(&insert));
out:
	if (upgrade)
		downgrade_write(&b->lock);
	return ret;
}

struct btree_insert_op {
	struct btree_op	op;
	struct keylist	*keys;
	atomic_t	*journal_ref;
	struct bkey	*replace_key;
};

static int btree_insert_fn(struct btree_op *b_op, struct btree *b)
{
	struct btree_insert_op *op = container_of(b_op,
					struct btree_insert_op, op);

	int ret = bch_btree_insert_node(b, &op->op, op->keys,
					op->journal_ref, op->replace_key);
	if (ret && !bch_keylist_empty(op->keys))
		return ret;
	else
		return MAP_DONE;
}

int bch_btree_insert(struct cache_set *c, struct keylist *keys,
		     atomic_t *journal_ref, struct bkey *replace_key)
{
	struct btree_insert_op op;
	int ret = 0;

	BUG_ON(current->bio_list);
	BUG_ON(bch_keylist_empty(keys));

	bch_btree_op_init(&op.op, 0);
	op.keys		= keys;
	op.journal_ref	= journal_ref;
	op.replace_key	= replace_key;

	while (!ret && !bch_keylist_empty(keys)) {
		op.op.lock = 0;
		ret = bch_btree_map_leaf_nodes(&op.op, c,
					       &START_KEY(keys->keys),
					       btree_insert_fn);
	}

	if (ret) {
		struct bkey *k;

		pr_err("error %i", ret);

		while ((k = bch_keylist_pop(keys)))
			bkey_put(c, k);
	} else if (op.op.insert_collision)
		ret = -ESRCH;

	return ret;
}

void bch_btree_set_root(struct btree *b)
{
	unsigned i;
	struct closure cl;

	closure_init_stack(&cl);

	trace_bcache_btree_set_root(b);

	BUG_ON(!b->written);

	for (i = 0; i < KEY_PTRS(&b->key); i++)
		BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO);

	mutex_lock(&b->c->bucket_lock);
	list_del_init(&b->list);
	mutex_unlock(&b->c->bucket_lock);

	b->c->root = b;

	bch_journal_meta(b->c, &cl);
	closure_sync(&cl);
}

/* Map across nodes or keys */

static int bch_btree_map_nodes_recurse(struct btree *b, struct btree_op *op,
				       struct bkey *from,
				       btree_map_nodes_fn *fn, int flags)
{
	int ret = MAP_CONTINUE;

	if (b->level) {
		struct bkey *k;
		struct btree_iter iter;

		bch_btree_iter_init(&b->keys, &iter, from);

		while ((k = bch_btree_iter_next_filter(&iter, &b->keys,
						       bch_ptr_bad))) {
			ret = btree(map_nodes_recurse, k, b,
				    op, from, fn, flags);
			from = NULL;

			if (ret != MAP_CONTINUE)
				return ret;
		}
	}

	if (!b->level || flags == MAP_ALL_NODES)
		ret = fn(op, b);

	return ret;
}

int __bch_btree_map_nodes(struct btree_op *op, struct cache_set *c,
			  struct bkey *from, btree_map_nodes_fn *fn, int flags)
{
	return btree_root(map_nodes_recurse, c, op, from, fn, flags);
}

static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
				      struct bkey *from, btree_map_keys_fn *fn,
				      int flags)
{
	int ret = MAP_CONTINUE;
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(&b->keys, &iter, from);

	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
		ret = !b->level
			? fn(op, b, k)
			: btree(map_keys_recurse, k, b, op, from, fn, flags);
		from = NULL;

		if (ret != MAP_CONTINUE)
			return ret;
	}

	if (!b->level && (flags & MAP_END_KEY))
		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
				     KEY_OFFSET(&b->key), 0));

	return ret;
}

int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags)
{
	return btree_root(map_keys_recurse, c, op, from, fn, flags);
}

/* Keybuf code */

static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
	/* Overlapping keys compare equal */
	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
		return -1;
	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
		return 1;
	return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{
	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}
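/*
 * Because keybuf_cmp() treats any two overlapping extents as equal,
 * RB_INSERT() with it fails when a new key overlaps one already in the
 * buffer - refill_keybuf_fn() below relies on this to drop duplicates.
 * keybuf_nonoverlapping_cmp() gives a strict ordering for walking the tree
 * once keys are known not to overlap.
 */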
static int bch_btree_map_keys_recurse(struct btree *b, struct btree_op *op,
				      struct bkey *from, btree_map_keys_fn *fn,
				      int flags)
{
	int ret = MAP_CONTINUE;
	struct bkey *k;
	struct btree_iter iter;

	bch_btree_iter_init(&b->keys, &iter, from);

	while ((k = bch_btree_iter_next_filter(&iter, &b->keys, bch_ptr_bad))) {
		ret = !b->level
			? fn(op, b, k)
			: btree(map_keys_recurse, k, b, op, from, fn, flags);
		from = NULL;

		if (ret != MAP_CONTINUE)
			return ret;
	}

	if (!b->level && (flags & MAP_END_KEY))
		ret = fn(op, b, &KEY(KEY_INODE(&b->key),
				     KEY_OFFSET(&b->key), 0));

	return ret;
}

int bch_btree_map_keys(struct btree_op *op, struct cache_set *c,
		       struct bkey *from, btree_map_keys_fn *fn, int flags)
{
	return btree_root(map_keys_recurse, c, op, from, fn, flags);
}

/* Keybuf code */

static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
	/* Overlapping keys compare equal */
	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
		return -1;
	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
		return 1;
	return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{
	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}

struct refill {
	struct btree_op	op;
	unsigned	nr_found;
	struct keybuf	*buf;
	struct bkey	*end;
	keybuf_pred_fn	*pred;
};

static int refill_keybuf_fn(struct btree_op *op, struct btree *b,
			    struct bkey *k)
{
	struct refill *refill = container_of(op, struct refill, op);
	struct keybuf *buf = refill->buf;
	int ret = MAP_CONTINUE;

	if (bkey_cmp(k, refill->end) >= 0) {
		ret = MAP_DONE;
		goto out;
	}

	if (!KEY_SIZE(k)) /* end key */
		goto out;

	if (refill->pred(buf, k)) {
		struct keybuf_key *w;

		spin_lock(&buf->lock);

		w = array_alloc(&buf->freelist);
		if (!w) {
			spin_unlock(&buf->lock);
			return MAP_DONE;
		}

		w->private = NULL;
		bkey_copy(&w->key, k);

		if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
			array_free(&buf->freelist, w);
		else
			refill->nr_found++;

		if (array_freelist_empty(&buf->freelist))
			ret = MAP_DONE;

		spin_unlock(&buf->lock);
	}
out:
	buf->last_scanned = *k;
	return ret;
}

void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf,
		       struct bkey *end, keybuf_pred_fn *pred)
{
	struct bkey start = buf->last_scanned;
	struct refill refill;

	cond_resched();

	bch_btree_op_init(&refill.op, -1);
	refill.nr_found	= 0;
	refill.buf	= buf;
	refill.end	= end;
	refill.pred	= pred;

	bch_btree_map_keys(&refill.op, c, &buf->last_scanned,
			   refill_keybuf_fn, MAP_END_KEY);

	trace_bcache_keyscan(refill.nr_found,
			     KEY_INODE(&start), KEY_OFFSET(&start),
			     KEY_INODE(&buf->last_scanned),
			     KEY_OFFSET(&buf->last_scanned));

	spin_lock(&buf->lock);

	if (!RB_EMPTY_ROOT(&buf->keys)) {
		struct keybuf_key *w;
		w = RB_FIRST(&buf->keys, struct keybuf_key, node);
		buf->start	= START_KEY(&w->key);

		w = RB_LAST(&buf->keys, struct keybuf_key, node);
		buf->end	= w->key;
	} else {
		buf->start	= MAX_KEY;
		buf->end	= MAX_KEY;
	}

	spin_unlock(&buf->lock);
}
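
/*
 * Illustrative sketch, not part of bcache: a minimal keybuf_pred_fn for
 * bch_refill_keybuf() that picks out dirty keys, similar in spirit to the
 * writeback code's predicate. "example_dirty_pred" is a hypothetical name.
 */
#if 0
static bool example_dirty_pred(struct keybuf *buf, struct bkey *k)
{
	return KEY_DIRTY(k);
}
#endif
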
static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	rb_erase(&w->node, &buf->keys);
	array_free(&buf->freelist, w);
}

void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w)
{
	spin_lock(&buf->lock);
	__bch_keybuf_del(buf, w);
	spin_unlock(&buf->lock);
}

bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start,
				  struct bkey *end)
{
	bool ret = false;
	struct keybuf_key *p, *w, s;
	s.key = *start;

	if (bkey_cmp(end, &buf->start) <= 0 ||
	    bkey_cmp(start, &buf->end) >= 0)
		return false;

	spin_lock(&buf->lock);
	w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp);

	while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) {
		p = w;
		w = RB_NEXT(w, node);

		if (p->private)
			ret = true;
		else
			__bch_keybuf_del(buf, p);
	}

	spin_unlock(&buf->lock);
	return ret;
}

struct keybuf_key *bch_keybuf_next(struct keybuf *buf)
{
	struct keybuf_key *w;
	spin_lock(&buf->lock);

	w = RB_FIRST(&buf->keys, struct keybuf_key, node);

	while (w && w->private)
		w = RB_NEXT(w, node);

	if (w)
		w->private = ERR_PTR(-EINTR);

	spin_unlock(&buf->lock);
	return w;
}

struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c,
					  struct keybuf *buf,
					  struct bkey *end,
					  keybuf_pred_fn *pred)
{
	struct keybuf_key *ret;

	while (1) {
		ret = bch_keybuf_next(buf);
		if (ret)
			break;

		if (bkey_cmp(&buf->last_scanned, end) >= 0) {
			pr_debug("scan finished");
			break;
		}

		bch_refill_keybuf(c, buf, end, pred);
	}

	return ret;
}

void bch_keybuf_init(struct keybuf *buf)
{
	buf->last_scanned	= MAX_KEY;
	buf->keys		= RB_ROOT;

	spin_lock_init(&buf->lock);
	array_allocator_init(&buf->freelist);
}
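
/*
 * Illustrative sketch, not part of bcache: the keybuf consumer pattern,
 * roughly the shape the writeback and moving-GC paths follow.
 * "example_scan" is hypothetical; example_dirty_pred is the sketch
 * predicate from above.
 */
#if 0
static void example_scan(struct cache_set *c, struct keybuf *buf)
{
	struct keybuf_key *w;

	/* bch_keybuf_init() leaves last_scanned at MAX_KEY ("finished"). */
	bch_keybuf_init(buf);
	buf->last_scanned = ZERO_KEY;

	/* Claim one key at a time; the buffer is refilled as it drains. */
	while ((w = bch_keybuf_next_rescan(c, buf, &MAX_KEY,
					   example_dirty_pred))) {
		/*
		 * w->private is now ERR_PTR(-EINTR), so concurrent scans
		 * skip it and bch_keybuf_check_overlapping() reports it
		 * as busy. Do the I/O for w->key here, then release the
		 * slot.
		 */
		bch_keybuf_del(buf, w);
	}
}
#endif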