/*
 * Copyright (C) 2010 Kent Overstreet <kent.overstreet@gmail.com>
 *
 * Uses a block device as cache for other block devices; optimized for SSDs.
 * All allocation is done in buckets, which should match the erase block size
 * of the device.
 *
 * Buckets containing cached data are kept on a heap sorted by priority;
 * bucket priority is increased on cache hit, and periodically all the buckets
 * on the heap have their priority scaled down. This currently is just used as
 * an LRU but in the future should allow for more intelligent heuristics.
 *
 * Buckets have an 8 bit counter; freeing is accomplished by incrementing the
 * counter. Garbage collection is used to remove stale pointers.
 *
 * Indexing is done via a btree; nodes are not necessarily fully sorted, rather
 * as keys are inserted we only sort the pages that have not yet been written.
 * When garbage collection is run, we resort the entire node.
 *
 * All configuration is done via sysfs; see Documentation/bcache.txt.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"
#include "writeback.h"

#include <linux/slab.h>
#include <linux/bitops.h>
#include <linux/hash.h>
#include <linux/prefetch.h>
#include <linux/random.h>
#include <linux/rcupdate.h>
#include <trace/events/bcache.h>

/*
 * Todo:
 * register_bcache: Return errors out to userspace correctly
 *
 * Writeback: don't undirty key until after a cache flush
 *
 * Create an iterator for key pointers
 *
 * On btree write error, mark bucket such that it won't be freed from the cache
 *
 * Journalling:
 *   Check for bad keys in replay
 *   Propagate barriers
 *   Refcount journal entries in journal_replay
 *
 * Garbage collection:
 *   Finish incremental gc
 *   Gc should free old UUIDs, data for invalid UUIDs
 *
 * Provide a way to list backing device UUIDs we have data cached for, and
 * probably how long it's been since we've seen them, and a way to invalidate
 * dirty data for devices that will never be attached again
 *
 * Keep 1 min/5 min/15 min statistics of how busy a block device has been, so
 * that based on that and how much dirty data we have we can keep writeback
 * from being starved
 *
 * Add a tracepoint or somesuch to watch for writeback starvation
 *
 * When btree depth > 1 and splitting an interior node, we have to make sure
 * alloc_bucket() cannot fail. This should be true but is not completely
 * obvious.
 *
 * Make sure all allocations get charged to the root cgroup
 *
 * Plugging?
 *
 * If data write is less than hard sector size of ssd, round up offset in open
 * bucket to the next whole sector
 *
 * Also lookup by cgroup in get_open_bucket()
 *
 * Superblock needs to be fleshed out for multiple cache devices
 *
 * Add a sysfs tunable for the number of writeback IOs in flight
 *
 * Add a sysfs tunable for the number of open data buckets
 *
 * IO tracking: Can we track when one process is doing io on behalf of another?
 * IO tracking: Don't use just an average, weigh more recent stuff higher
 *
 * Test module load/unload
 */

static const char * const op_types[] = {
	"insert", "replace"
};

static const char *op_type(struct btree_op *op)
{
	return op_types[op->type];
}

#define MAX_NEED_GC		64
#define MAX_SAVE_PRIO		72

#define PTR_DIRTY_BIT		(((uint64_t) 1 << 36))

#define PTR_HASH(c, k)						\
	(((k)->ptr[0] >> c->bucket_bits) | PTR_GEN(k, 0))

struct workqueue_struct *bch_gc_wq;
static struct workqueue_struct *btree_io_wq;

void bch_btree_op_init_stack(struct btree_op *op)
{
	memset(op, 0, sizeof(struct btree_op));
	closure_init_stack(&op->cl);
	op->lock = -1;
	bch_keylist_init(&op->keys);
}

/* Btree key manipulation */

static void bkey_put(struct cache_set *c, struct bkey *k, int level)
{
	if ((level && KEY_OFFSET(k)) || !level)
		__bkey_put(c, k);
}

/* Btree IO */

static uint64_t btree_csum_set(struct btree *b, struct bset *i)
{
	uint64_t crc = b->key.ptr[0];
	void *data = (void *) i + 8, *end = end(i);

	crc = bch_crc64_update(crc, data, end - data);
	return crc ^ 0xffffffffffffffffULL;
}

static void bch_btree_node_read_done(struct btree *b)
{
	const char *err = "bad btree header";
	struct bset *i = b->sets[0].data;
	struct btree_iter *iter;

	iter = mempool_alloc(b->c->fill_iter, GFP_NOWAIT);
	iter->size = b->c->sb.bucket_size / b->c->sb.block_size;
	iter->used = 0;

	if (!i->seq)
		goto err;

	for (;
	     b->written < btree_blocks(b) && i->seq == b->sets[0].data->seq;
	     i = write_block(b)) {
		err = "unsupported bset version";
		if (i->version > BCACHE_BSET_VERSION)
			goto err;

		err = "bad btree header";
		if (b->written + set_blocks(i, b->c) > btree_blocks(b))
			goto err;

		err = "bad magic";
		if (i->magic != bset_magic(b->c))
			goto err;

		err = "bad checksum";
		switch (i->version) {
		case 0:
			if (i->csum != csum_set(i))
				goto err;
			break;
		case BCACHE_BSET_VERSION:
			if (i->csum != btree_csum_set(b, i))
				goto err;
			break;
		}

		err = "empty set";
		if (i != b->sets[0].data && !i->keys)
			goto err;

		bch_btree_iter_push(iter, i->start, end(i));

		b->written += set_blocks(i, b->c);
	}

	err = "corrupted btree";
	for (i = write_block(b);
	     index(i, b) < btree_blocks(b);
	     i = ((void *) i) + block_bytes(b->c))
		if (i->seq == b->sets[0].data->seq)
			goto err;

	bch_btree_sort_and_fix_extents(b, iter);

	i = b->sets[0].data;
	err = "short btree key";
	if (b->sets[0].size &&
	    bkey_cmp(&b->key, &b->sets[0].end) < 0)
		goto err;

	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
out:
	mempool_free(iter, b->c->fill_iter);
	return;
err:
	set_btree_node_io_error(b);
	bch_cache_set_error(b->c, "%s at bucket %zu, block %zu, %u keys",
			    err, PTR_BUCKET_NR(b->c, &b->key, 0),
			    index(i, b), i->keys);
	goto out;
}

static void btree_node_read_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	closure_put(cl);
}

void bch_btree_node_read(struct btree *b)
{
	uint64_t start_time = local_clock();
	struct closure cl;
	struct bio *bio;

	trace_bcache_btree_read(b);

	closure_init_stack(&cl);

	bio = bch_bbio_alloc(b->c);
	bio->bi_rw = REQ_META|READ_SYNC;
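	/*
	 * The read covers the whole on disk node in one synchronous IO:
	 * bi_size below is taken from the node's key, we wait on the on-stack
	 * closure for completion, and the individual bsets are then validated
	 * and sorted by bch_btree_node_read_done() above.
	 */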
	bio->bi_size = KEY_SIZE(&b->key) << 9;
	bio->bi_end_io = btree_node_read_endio;
	bio->bi_private = &cl;

	bch_bio_map(bio, b->sets[0].data);

	bch_submit_bbio(bio, b->c, &b->key, 0);
	closure_sync(&cl);

	if (!test_bit(BIO_UPTODATE, &bio->bi_flags))
		set_btree_node_io_error(b);

	bch_bbio_free(bio, b->c);

	if (btree_node_io_error(b))
		goto err;

	bch_btree_node_read_done(b);

	spin_lock(&b->c->btree_read_time_lock);
	bch_time_stats_update(&b->c->btree_read_time, start_time);
	spin_unlock(&b->c->btree_read_time_lock);

	return;
err:
	bch_cache_set_error(b->c, "io error reading bucket %zu",
			    PTR_BUCKET_NR(b->c, &b->key, 0));
}

static void btree_complete_write(struct btree *b, struct btree_write *w)
{
	if (w->prio_blocked &&
	    !atomic_sub_return(w->prio_blocked, &b->c->prio_blocked))
		wake_up_allocators(b->c);

	if (w->journal) {
		atomic_dec_bug(w->journal);
		__closure_wake_up(&b->c->journal.wait);
	}

	w->prio_blocked = 0;
	w->journal = NULL;
}

static void __btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct btree_write *w = btree_prev_write(b);

	bch_bbio_free(b->bio, b->c);
	b->bio = NULL;
	btree_complete_write(b, w);

	if (btree_node_dirty(b))
		queue_delayed_work(btree_io_wq, &b->work,
				   msecs_to_jiffies(30000));

	closure_return(cl);
}

static void btree_node_write_done(struct closure *cl)
{
	struct btree *b = container_of(cl, struct btree, io.cl);
	struct bio_vec *bv;
	int n;

	__bio_for_each_segment(bv, b->bio, n, 0)
		__free_page(bv->bv_page);

	__btree_node_write_done(cl);
}

static void btree_node_write_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct btree *b = container_of(cl, struct btree, io.cl);

	if (error)
		set_btree_node_io_error(b);

	bch_bbio_count_io_errors(b->c, bio, error, "writing btree");
	closure_put(cl);
}

static void do_btree_node_write(struct btree *b)
{
	struct closure *cl = &b->io.cl;
	struct bset *i = b->sets[b->nsets].data;
	BKEY_PADDED(key) k;

	i->version = BCACHE_BSET_VERSION;
	i->csum = btree_csum_set(b, i);

	BUG_ON(b->bio);
	b->bio = bch_bbio_alloc(b->c);

	b->bio->bi_end_io = btree_node_write_endio;
	b->bio->bi_private = &b->io.cl;
	b->bio->bi_rw = REQ_META|WRITE_SYNC|REQ_FUA;
	b->bio->bi_size = set_blocks(i, b->c) * block_bytes(b->c);
	bch_bio_map(b->bio, i);

	/*
	 * If we're appending to a leaf node, we don't technically need FUA -
	 * this write just needs to be persisted before the next journal write,
	 * which will be marked FLUSH|FUA.
	 *
	 * Similarly if we're writing a new btree root - the pointer is going to
	 * be in the next journal entry.
	 *
	 * But if we're writing a new btree node (that isn't a root) or
	 * appending to a non leaf btree node, we need either FUA or a flush
	 * when we write the parent with the new pointer. FUA is cheaper than a
	 * flush, and writes appending to leaf nodes aren't blocking anything so
	 * just make all btree node writes FUA to keep things sane.
	 */

	bkey_copy(&k.key, &b->key);
	SET_PTR_OFFSET(&k.key, 0, PTR_OFFSET(&k.key, 0) + bset_offset(b, i));

	if (!bio_alloc_pages(b->bio, GFP_NOIO)) {
		int j;
		struct bio_vec *bv;
		void *base = (void *) ((unsigned long) i & ~(PAGE_SIZE - 1));

		bio_for_each_segment(bv, b->bio, j)
			memcpy(page_address(bv->bv_page),
			       base + j * PAGE_SIZE, PAGE_SIZE);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		continue_at(cl, btree_node_write_done, NULL);
	} else {
		b->bio->bi_vcnt = 0;
		bch_bio_map(b->bio, i);

		bch_submit_bbio(b->bio, b->c, &k.key, 0);

		closure_sync(cl);
		__btree_node_write_done(cl);
	}
}

void bch_btree_node_write(struct btree *b, struct closure *parent)
{
	struct bset *i = b->sets[b->nsets].data;

	trace_bcache_btree_write(b);

	BUG_ON(current->bio_list);
	BUG_ON(b->written >= btree_blocks(b));
	BUG_ON(b->written && !i->keys);
	BUG_ON(b->sets->data->seq != i->seq);
	bch_check_key_order(b, i);

	cancel_delayed_work(&b->work);

	/* If caller isn't waiting for write, parent refcount is cache set */
	closure_lock(&b->io, parent ?: &b->c->cl);

	clear_bit(BTREE_NODE_dirty, &b->flags);
	change_bit(BTREE_NODE_write_idx, &b->flags);

	do_btree_node_write(b);

	b->written += set_blocks(i, b->c);
	atomic_long_add(set_blocks(i, b->c) * b->c->sb.block_size,
			&PTR_CACHE(b->c, &b->key, 0)->btree_sectors_written);

	bch_btree_sort_lazy(b);

	if (b->written < btree_blocks(b))
		bch_bset_init_next(b);
}

static void btree_node_write_work(struct work_struct *w)
{
	struct btree *b = container_of(to_delayed_work(w), struct btree, work);

	rw_lock(true, b, b->level);

	if (btree_node_dirty(b))
		bch_btree_node_write(b, NULL);
	rw_unlock(true, b);
}

static void bch_btree_leaf_dirty(struct btree *b, struct btree_op *op)
{
	struct bset *i = b->sets[b->nsets].data;
	struct btree_write *w = btree_current_write(b);

	BUG_ON(!b->written);
	BUG_ON(!i->keys);

	if (!btree_node_dirty(b))
		queue_delayed_work(btree_io_wq, &b->work, 30 * HZ);

	set_btree_node_dirty(b);

	if (op && op->journal) {
		if (w->journal &&
		    journal_pin_cmp(b->c, w, op)) {
			atomic_dec_bug(w->journal);
			w->journal = NULL;
		}

		if (!w->journal) {
			w->journal = op->journal;
			atomic_inc(w->journal);
		}
	}

	/* Force write if set is too big */
	if (set_bytes(i) > PAGE_SIZE - 48 &&
	    !current->bio_list)
		bch_btree_node_write(b, NULL);
}

/*
 * Btree in memory cache - allocation/freeing
 * mca -> memory cache
 */

static void mca_reinit(struct btree *b)
{
	unsigned i;

	b->flags = 0;
	b->written = 0;
	b->nsets = 0;

	for (i = 0; i < MAX_BSETS; i++)
		b->sets[i].size = 0;
	/*
	 * Second loop starts at 1 because b->sets[0]->data is the memory we
	 * allocated
	 */
	for (i = 1; i < MAX_BSETS; i++)
		b->sets[i].data = NULL;
}

#define mca_reserve(c)	(((c->root && c->root->level)	\
			  ?
c->root->level : 1) * 8 + 16) 476 #define mca_can_free(c) \ 477 max_t(int, 0, c->bucket_cache_used - mca_reserve(c)) 478 479 static void mca_data_free(struct btree *b) 480 { 481 struct bset_tree *t = b->sets; 482 BUG_ON(!closure_is_unlocked(&b->io.cl)); 483 484 if (bset_prev_bytes(b) < PAGE_SIZE) 485 kfree(t->prev); 486 else 487 free_pages((unsigned long) t->prev, 488 get_order(bset_prev_bytes(b))); 489 490 if (bset_tree_bytes(b) < PAGE_SIZE) 491 kfree(t->tree); 492 else 493 free_pages((unsigned long) t->tree, 494 get_order(bset_tree_bytes(b))); 495 496 free_pages((unsigned long) t->data, b->page_order); 497 498 t->prev = NULL; 499 t->tree = NULL; 500 t->data = NULL; 501 list_move(&b->list, &b->c->btree_cache_freed); 502 b->c->bucket_cache_used--; 503 } 504 505 static void mca_bucket_free(struct btree *b) 506 { 507 BUG_ON(btree_node_dirty(b)); 508 509 b->key.ptr[0] = 0; 510 hlist_del_init_rcu(&b->hash); 511 list_move(&b->list, &b->c->btree_cache_freeable); 512 } 513 514 static unsigned btree_order(struct bkey *k) 515 { 516 return ilog2(KEY_SIZE(k) / PAGE_SECTORS ?: 1); 517 } 518 519 static void mca_data_alloc(struct btree *b, struct bkey *k, gfp_t gfp) 520 { 521 struct bset_tree *t = b->sets; 522 BUG_ON(t->data); 523 524 b->page_order = max_t(unsigned, 525 ilog2(b->c->btree_pages), 526 btree_order(k)); 527 528 t->data = (void *) __get_free_pages(gfp, b->page_order); 529 if (!t->data) 530 goto err; 531 532 t->tree = bset_tree_bytes(b) < PAGE_SIZE 533 ? kmalloc(bset_tree_bytes(b), gfp) 534 : (void *) __get_free_pages(gfp, get_order(bset_tree_bytes(b))); 535 if (!t->tree) 536 goto err; 537 538 t->prev = bset_prev_bytes(b) < PAGE_SIZE 539 ? kmalloc(bset_prev_bytes(b), gfp) 540 : (void *) __get_free_pages(gfp, get_order(bset_prev_bytes(b))); 541 if (!t->prev) 542 goto err; 543 544 list_move(&b->list, &b->c->btree_cache); 545 b->c->bucket_cache_used++; 546 return; 547 err: 548 mca_data_free(b); 549 } 550 551 static struct btree *mca_bucket_alloc(struct cache_set *c, 552 struct bkey *k, gfp_t gfp) 553 { 554 struct btree *b = kzalloc(sizeof(struct btree), gfp); 555 if (!b) 556 return NULL; 557 558 init_rwsem(&b->lock); 559 lockdep_set_novalidate_class(&b->lock); 560 INIT_LIST_HEAD(&b->list); 561 INIT_DELAYED_WORK(&b->work, btree_node_write_work); 562 b->c = c; 563 closure_init_unlocked(&b->io); 564 565 mca_data_alloc(b, k, gfp); 566 return b; 567 } 568 569 static int mca_reap(struct btree *b, struct closure *cl, unsigned min_order) 570 { 571 lockdep_assert_held(&b->c->bucket_lock); 572 573 if (!down_write_trylock(&b->lock)) 574 return -ENOMEM; 575 576 if (b->page_order < min_order) { 577 rw_unlock(true, b); 578 return -ENOMEM; 579 } 580 581 BUG_ON(btree_node_dirty(b) && !b->sets[0].data); 582 583 if (cl && btree_node_dirty(b)) 584 bch_btree_node_write(b, NULL); 585 586 if (cl) 587 closure_wait_event_async(&b->io.wait, cl, 588 atomic_read(&b->io.cl.remaining) == -1); 589 590 if (btree_node_dirty(b) || 591 !closure_is_unlocked(&b->io.cl) || 592 work_pending(&b->work.work)) { 593 rw_unlock(true, b); 594 return -EAGAIN; 595 } 596 597 return 0; 598 } 599 600 static unsigned long bch_mca_scan(struct shrinker *shrink, 601 struct shrink_control *sc) 602 { 603 struct cache_set *c = container_of(shrink, struct cache_set, shrink); 604 struct btree *b, *t; 605 unsigned long i, nr = sc->nr_to_scan; 606 unsigned long freed = 0; 607 608 if (c->shrinker_disabled) 609 return SHRINK_STOP; 610 611 if (c->try_harder) 612 return SHRINK_STOP; 613 614 /* Return -1 if we can't do anything right now */ 615 if 
(sc->gfp_mask & __GFP_IO) 616 mutex_lock(&c->bucket_lock); 617 else if (!mutex_trylock(&c->bucket_lock)) 618 return -1; 619 620 /* 621 * It's _really_ critical that we don't free too many btree nodes - we 622 * have to always leave ourselves a reserve. The reserve is how we 623 * guarantee that allocating memory for a new btree node can always 624 * succeed, so that inserting keys into the btree can always succeed and 625 * IO can always make forward progress: 626 */ 627 nr /= c->btree_pages; 628 nr = min_t(unsigned long, nr, mca_can_free(c)); 629 630 i = 0; 631 list_for_each_entry_safe(b, t, &c->btree_cache_freeable, list) { 632 if (freed >= nr) 633 break; 634 635 if (++i > 3 && 636 !mca_reap(b, NULL, 0)) { 637 mca_data_free(b); 638 rw_unlock(true, b); 639 freed++; 640 } 641 } 642 643 /* 644 * Can happen right when we first start up, before we've read in any 645 * btree nodes 646 */ 647 if (list_empty(&c->btree_cache)) 648 goto out; 649 650 for (i = 0; (nr--) && i < c->bucket_cache_used; i++) { 651 b = list_first_entry(&c->btree_cache, struct btree, list); 652 list_rotate_left(&c->btree_cache); 653 654 if (!b->accessed && 655 !mca_reap(b, NULL, 0)) { 656 mca_bucket_free(b); 657 mca_data_free(b); 658 rw_unlock(true, b); 659 freed++; 660 } else 661 b->accessed = 0; 662 } 663 out: 664 mutex_unlock(&c->bucket_lock); 665 return freed; 666 } 667 668 static unsigned long bch_mca_count(struct shrinker *shrink, 669 struct shrink_control *sc) 670 { 671 struct cache_set *c = container_of(shrink, struct cache_set, shrink); 672 673 if (c->shrinker_disabled) 674 return 0; 675 676 if (c->try_harder) 677 return 0; 678 679 return mca_can_free(c) * c->btree_pages; 680 } 681 682 void bch_btree_cache_free(struct cache_set *c) 683 { 684 struct btree *b; 685 struct closure cl; 686 closure_init_stack(&cl); 687 688 if (c->shrink.list.next) 689 unregister_shrinker(&c->shrink); 690 691 mutex_lock(&c->bucket_lock); 692 693 #ifdef CONFIG_BCACHE_DEBUG 694 if (c->verify_data) 695 list_move(&c->verify_data->list, &c->btree_cache); 696 #endif 697 698 list_splice(&c->btree_cache_freeable, 699 &c->btree_cache); 700 701 while (!list_empty(&c->btree_cache)) { 702 b = list_first_entry(&c->btree_cache, struct btree, list); 703 704 if (btree_node_dirty(b)) 705 btree_complete_write(b, btree_current_write(b)); 706 clear_bit(BTREE_NODE_dirty, &b->flags); 707 708 mca_data_free(b); 709 } 710 711 while (!list_empty(&c->btree_cache_freed)) { 712 b = list_first_entry(&c->btree_cache_freed, 713 struct btree, list); 714 list_del(&b->list); 715 cancel_delayed_work_sync(&b->work); 716 kfree(b); 717 } 718 719 mutex_unlock(&c->bucket_lock); 720 } 721 722 int bch_btree_cache_alloc(struct cache_set *c) 723 { 724 unsigned i; 725 726 /* XXX: doesn't check for errors */ 727 728 closure_init_unlocked(&c->gc); 729 730 for (i = 0; i < mca_reserve(c); i++) 731 mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 732 733 list_splice_init(&c->btree_cache, 734 &c->btree_cache_freeable); 735 736 #ifdef CONFIG_BCACHE_DEBUG 737 mutex_init(&c->verify_lock); 738 739 c->verify_data = mca_bucket_alloc(c, &ZERO_KEY, GFP_KERNEL); 740 741 if (c->verify_data && 742 c->verify_data->sets[0].data) 743 list_del_init(&c->verify_data->list); 744 else 745 c->verify_data = NULL; 746 #endif 747 748 c->shrink.count_objects = bch_mca_count; 749 c->shrink.scan_objects = bch_mca_scan; 750 c->shrink.seeks = 4; 751 c->shrink.batch = c->btree_pages * 2; 752 register_shrinker(&c->shrink); 753 754 return 0; 755 } 756 757 /* Btree in memory cache - hash table */ 758 759 static struct 
hlist_head *mca_hash(struct cache_set *c, struct bkey *k) 760 { 761 return &c->bucket_hash[hash_32(PTR_HASH(c, k), BUCKET_HASH_BITS)]; 762 } 763 764 static struct btree *mca_find(struct cache_set *c, struct bkey *k) 765 { 766 struct btree *b; 767 768 rcu_read_lock(); 769 hlist_for_each_entry_rcu(b, mca_hash(c, k), hash) 770 if (PTR_HASH(c, &b->key) == PTR_HASH(c, k)) 771 goto out; 772 b = NULL; 773 out: 774 rcu_read_unlock(); 775 return b; 776 } 777 778 static struct btree *mca_cannibalize(struct cache_set *c, struct bkey *k, 779 int level, struct closure *cl) 780 { 781 int ret = -ENOMEM; 782 struct btree *i; 783 784 trace_bcache_btree_cache_cannibalize(c); 785 786 if (!cl) 787 return ERR_PTR(-ENOMEM); 788 789 /* 790 * Trying to free up some memory - i.e. reuse some btree nodes - may 791 * require initiating IO to flush the dirty part of the node. If we're 792 * running under generic_make_request(), that IO will never finish and 793 * we would deadlock. Returning -EAGAIN causes the cache lookup code to 794 * punt to workqueue and retry. 795 */ 796 if (current->bio_list) 797 return ERR_PTR(-EAGAIN); 798 799 if (c->try_harder && c->try_harder != cl) { 800 closure_wait_event_async(&c->try_wait, cl, !c->try_harder); 801 return ERR_PTR(-EAGAIN); 802 } 803 804 c->try_harder = cl; 805 c->try_harder_start = local_clock(); 806 retry: 807 list_for_each_entry_reverse(i, &c->btree_cache, list) { 808 int r = mca_reap(i, cl, btree_order(k)); 809 if (!r) 810 return i; 811 if (r != -ENOMEM) 812 ret = r; 813 } 814 815 if (ret == -EAGAIN && 816 closure_blocking(cl)) { 817 mutex_unlock(&c->bucket_lock); 818 closure_sync(cl); 819 mutex_lock(&c->bucket_lock); 820 goto retry; 821 } 822 823 return ERR_PTR(ret); 824 } 825 826 /* 827 * We can only have one thread cannibalizing other cached btree nodes at a time, 828 * or we'll deadlock. We use an open coded mutex to ensure that, which a 829 * cannibalize_bucket() will take. This means every time we unlock the root of 830 * the btree, we need to release this lock if we have it held. 831 */ 832 void bch_cannibalize_unlock(struct cache_set *c, struct closure *cl) 833 { 834 if (c->try_harder == cl) { 835 bch_time_stats_update(&c->try_harder_time, c->try_harder_start); 836 c->try_harder = NULL; 837 __closure_wake_up(&c->try_wait); 838 } 839 } 840 841 static struct btree *mca_alloc(struct cache_set *c, struct bkey *k, 842 int level, struct closure *cl) 843 { 844 struct btree *b; 845 846 lockdep_assert_held(&c->bucket_lock); 847 848 if (mca_find(c, k)) 849 return NULL; 850 851 /* btree_free() doesn't free memory; it sticks the node on the end of 852 * the list. Check if there's any freed nodes there: 853 */ 854 list_for_each_entry(b, &c->btree_cache_freeable, list) 855 if (!mca_reap(b, NULL, btree_order(k))) 856 goto out; 857 858 /* We never free struct btree itself, just the memory that holds the on 859 * disk node. 
Check the freed list before allocating a new one: 860 */ 861 list_for_each_entry(b, &c->btree_cache_freed, list) 862 if (!mca_reap(b, NULL, 0)) { 863 mca_data_alloc(b, k, __GFP_NOWARN|GFP_NOIO); 864 if (!b->sets[0].data) 865 goto err; 866 else 867 goto out; 868 } 869 870 b = mca_bucket_alloc(c, k, __GFP_NOWARN|GFP_NOIO); 871 if (!b) 872 goto err; 873 874 BUG_ON(!down_write_trylock(&b->lock)); 875 if (!b->sets->data) 876 goto err; 877 out: 878 BUG_ON(!closure_is_unlocked(&b->io.cl)); 879 880 bkey_copy(&b->key, k); 881 list_move(&b->list, &c->btree_cache); 882 hlist_del_init_rcu(&b->hash); 883 hlist_add_head_rcu(&b->hash, mca_hash(c, k)); 884 885 lock_set_subclass(&b->lock.dep_map, level + 1, _THIS_IP_); 886 b->level = level; 887 888 mca_reinit(b); 889 890 return b; 891 err: 892 if (b) 893 rw_unlock(true, b); 894 895 b = mca_cannibalize(c, k, level, cl); 896 if (!IS_ERR(b)) 897 goto out; 898 899 return b; 900 } 901 902 /** 903 * bch_btree_node_get - find a btree node in the cache and lock it, reading it 904 * in from disk if necessary. 905 * 906 * If IO is necessary, it uses the closure embedded in struct btree_op to wait; 907 * if that closure is in non blocking mode, will return -EAGAIN. 908 * 909 * The btree node will have either a read or a write lock held, depending on 910 * level and op->lock. 911 */ 912 struct btree *bch_btree_node_get(struct cache_set *c, struct bkey *k, 913 int level, struct btree_op *op) 914 { 915 int i = 0; 916 bool write = level <= op->lock; 917 struct btree *b; 918 919 BUG_ON(level < 0); 920 retry: 921 b = mca_find(c, k); 922 923 if (!b) { 924 if (current->bio_list) 925 return ERR_PTR(-EAGAIN); 926 927 mutex_lock(&c->bucket_lock); 928 b = mca_alloc(c, k, level, &op->cl); 929 mutex_unlock(&c->bucket_lock); 930 931 if (!b) 932 goto retry; 933 if (IS_ERR(b)) 934 return b; 935 936 bch_btree_node_read(b); 937 938 if (!write) 939 downgrade_write(&b->lock); 940 } else { 941 rw_lock(write, b, level); 942 if (PTR_HASH(c, &b->key) != PTR_HASH(c, k)) { 943 rw_unlock(write, b); 944 goto retry; 945 } 946 BUG_ON(b->level != level); 947 } 948 949 b->accessed = 1; 950 951 for (; i <= b->nsets && b->sets[i].size; i++) { 952 prefetch(b->sets[i].tree); 953 prefetch(b->sets[i].data); 954 } 955 956 for (; i <= b->nsets; i++) 957 prefetch(b->sets[i].data); 958 959 if (btree_node_io_error(b)) { 960 rw_unlock(write, b); 961 return ERR_PTR(-EIO); 962 } 963 964 BUG_ON(!b->written); 965 966 return b; 967 } 968 969 static void btree_node_prefetch(struct cache_set *c, struct bkey *k, int level) 970 { 971 struct btree *b; 972 973 mutex_lock(&c->bucket_lock); 974 b = mca_alloc(c, k, level, NULL); 975 mutex_unlock(&c->bucket_lock); 976 977 if (!IS_ERR_OR_NULL(b)) { 978 bch_btree_node_read(b); 979 rw_unlock(true, b); 980 } 981 } 982 983 /* Btree alloc */ 984 985 static void btree_node_free(struct btree *b, struct btree_op *op) 986 { 987 unsigned i; 988 989 trace_bcache_btree_node_free(b); 990 991 /* 992 * The BUG_ON() in btree_node_get() implies that we must have a write 993 * lock on parent to free or even invalidate a node 994 */ 995 BUG_ON(op->lock <= b->level); 996 BUG_ON(b == b->c->root); 997 998 if (btree_node_dirty(b)) 999 btree_complete_write(b, btree_current_write(b)); 1000 clear_bit(BTREE_NODE_dirty, &b->flags); 1001 1002 cancel_delayed_work(&b->work); 1003 1004 mutex_lock(&b->c->bucket_lock); 1005 1006 for (i = 0; i < KEY_PTRS(&b->key); i++) { 1007 BUG_ON(atomic_read(&PTR_BUCKET(b->c, &b->key, i)->pin)); 1008 1009 bch_inc_gen(PTR_CACHE(b->c, &b->key, i), 1010 PTR_BUCKET(b->c, 
&b->key, i)); 1011 } 1012 1013 bch_bucket_free(b->c, &b->key); 1014 mca_bucket_free(b); 1015 mutex_unlock(&b->c->bucket_lock); 1016 } 1017 1018 struct btree *bch_btree_node_alloc(struct cache_set *c, int level, 1019 struct closure *cl) 1020 { 1021 BKEY_PADDED(key) k; 1022 struct btree *b = ERR_PTR(-EAGAIN); 1023 1024 mutex_lock(&c->bucket_lock); 1025 retry: 1026 if (__bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, cl)) 1027 goto err; 1028 1029 SET_KEY_SIZE(&k.key, c->btree_pages * PAGE_SECTORS); 1030 1031 b = mca_alloc(c, &k.key, level, cl); 1032 if (IS_ERR(b)) 1033 goto err_free; 1034 1035 if (!b) { 1036 cache_bug(c, 1037 "Tried to allocate bucket that was in btree cache"); 1038 __bkey_put(c, &k.key); 1039 goto retry; 1040 } 1041 1042 b->accessed = 1; 1043 bch_bset_init_next(b); 1044 1045 mutex_unlock(&c->bucket_lock); 1046 1047 trace_bcache_btree_node_alloc(b); 1048 return b; 1049 err_free: 1050 bch_bucket_free(c, &k.key); 1051 __bkey_put(c, &k.key); 1052 err: 1053 mutex_unlock(&c->bucket_lock); 1054 1055 trace_bcache_btree_node_alloc_fail(b); 1056 return b; 1057 } 1058 1059 static struct btree *btree_node_alloc_replacement(struct btree *b, 1060 struct closure *cl) 1061 { 1062 struct btree *n = bch_btree_node_alloc(b->c, b->level, cl); 1063 if (!IS_ERR_OR_NULL(n)) 1064 bch_btree_sort_into(b, n); 1065 1066 return n; 1067 } 1068 1069 /* Garbage collection */ 1070 1071 uint8_t __bch_btree_mark_key(struct cache_set *c, int level, struct bkey *k) 1072 { 1073 uint8_t stale = 0; 1074 unsigned i; 1075 struct bucket *g; 1076 1077 /* 1078 * ptr_invalid() can't return true for the keys that mark btree nodes as 1079 * freed, but since ptr_bad() returns true we'll never actually use them 1080 * for anything and thus we don't want mark their pointers here 1081 */ 1082 if (!bkey_cmp(k, &ZERO_KEY)) 1083 return stale; 1084 1085 for (i = 0; i < KEY_PTRS(k); i++) { 1086 if (!ptr_available(c, k, i)) 1087 continue; 1088 1089 g = PTR_BUCKET(c, k, i); 1090 1091 if (gen_after(g->gc_gen, PTR_GEN(k, i))) 1092 g->gc_gen = PTR_GEN(k, i); 1093 1094 if (ptr_stale(c, k, i)) { 1095 stale = max(stale, ptr_stale(c, k, i)); 1096 continue; 1097 } 1098 1099 cache_bug_on(GC_MARK(g) && 1100 (GC_MARK(g) == GC_MARK_METADATA) != (level != 0), 1101 c, "inconsistent ptrs: mark = %llu, level = %i", 1102 GC_MARK(g), level); 1103 1104 if (level) 1105 SET_GC_MARK(g, GC_MARK_METADATA); 1106 else if (KEY_DIRTY(k)) 1107 SET_GC_MARK(g, GC_MARK_DIRTY); 1108 1109 /* guard against overflow */ 1110 SET_GC_SECTORS_USED(g, min_t(unsigned, 1111 GC_SECTORS_USED(g) + KEY_SIZE(k), 1112 (1 << 14) - 1)); 1113 1114 BUG_ON(!GC_SECTORS_USED(g)); 1115 } 1116 1117 return stale; 1118 } 1119 1120 #define btree_mark_key(b, k) __bch_btree_mark_key(b->c, b->level, k) 1121 1122 static int btree_gc_mark_node(struct btree *b, unsigned *keys, 1123 struct gc_stat *gc) 1124 { 1125 uint8_t stale = 0; 1126 unsigned last_dev = -1; 1127 struct bcache_device *d = NULL; 1128 struct bkey *k; 1129 struct btree_iter iter; 1130 struct bset_tree *t; 1131 1132 gc->nodes++; 1133 1134 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1135 if (last_dev != KEY_INODE(k)) { 1136 last_dev = KEY_INODE(k); 1137 1138 d = KEY_INODE(k) < b->c->nr_uuids 1139 ? 
b->c->devices[last_dev] 1140 : NULL; 1141 } 1142 1143 stale = max(stale, btree_mark_key(b, k)); 1144 1145 if (bch_ptr_bad(b, k)) 1146 continue; 1147 1148 *keys += bkey_u64s(k); 1149 1150 gc->key_bytes += bkey_u64s(k); 1151 gc->nkeys++; 1152 1153 gc->data += KEY_SIZE(k); 1154 if (KEY_DIRTY(k)) 1155 gc->dirty += KEY_SIZE(k); 1156 } 1157 1158 for (t = b->sets; t <= &b->sets[b->nsets]; t++) 1159 btree_bug_on(t->size && 1160 bset_written(b, t) && 1161 bkey_cmp(&b->key, &t->end) < 0, 1162 b, "found short btree key in gc"); 1163 1164 return stale; 1165 } 1166 1167 static struct btree *btree_gc_alloc(struct btree *b, struct bkey *k, 1168 struct btree_op *op) 1169 { 1170 /* 1171 * We block priorities from being written for the duration of garbage 1172 * collection, so we can't sleep in btree_alloc() -> 1173 * bch_bucket_alloc_set(), or we'd risk deadlock - so we don't pass it 1174 * our closure. 1175 */ 1176 struct btree *n = btree_node_alloc_replacement(b, NULL); 1177 1178 if (!IS_ERR_OR_NULL(n)) { 1179 swap(b, n); 1180 __bkey_put(b->c, &b->key); 1181 1182 memcpy(k->ptr, b->key.ptr, 1183 sizeof(uint64_t) * KEY_PTRS(&b->key)); 1184 1185 btree_node_free(n, op); 1186 up_write(&n->lock); 1187 } 1188 1189 return b; 1190 } 1191 1192 /* 1193 * Leaving this at 2 until we've got incremental garbage collection done; it 1194 * could be higher (and has been tested with 4) except that garbage collection 1195 * could take much longer, adversely affecting latency. 1196 */ 1197 #define GC_MERGE_NODES 2U 1198 1199 struct gc_merge_info { 1200 struct btree *b; 1201 struct bkey *k; 1202 unsigned keys; 1203 }; 1204 1205 static void btree_gc_coalesce(struct btree *b, struct btree_op *op, 1206 struct gc_stat *gc, struct gc_merge_info *r) 1207 { 1208 unsigned nodes = 0, keys = 0, blocks; 1209 int i; 1210 1211 while (nodes < GC_MERGE_NODES && r[nodes].b) 1212 keys += r[nodes++].keys; 1213 1214 blocks = btree_default_blocks(b->c) * 2 / 3; 1215 1216 if (nodes < 2 || 1217 __set_blocks(b->sets[0].data, keys, b->c) > blocks * (nodes - 1)) 1218 return; 1219 1220 for (i = nodes - 1; i >= 0; --i) { 1221 if (r[i].b->written) 1222 r[i].b = btree_gc_alloc(r[i].b, r[i].k, op); 1223 1224 if (r[i].b->written) 1225 return; 1226 } 1227 1228 for (i = nodes - 1; i > 0; --i) { 1229 struct bset *n1 = r[i].b->sets->data; 1230 struct bset *n2 = r[i - 1].b->sets->data; 1231 struct bkey *k, *last = NULL; 1232 1233 keys = 0; 1234 1235 if (i == 1) { 1236 /* 1237 * Last node we're not getting rid of - we're getting 1238 * rid of the node at r[0]. 
Have to try and fit all of 1239 * the remaining keys into this node; we can't ensure 1240 * they will always fit due to rounding and variable 1241 * length keys (shouldn't be possible in practice, 1242 * though) 1243 */ 1244 if (__set_blocks(n1, n1->keys + r->keys, 1245 b->c) > btree_blocks(r[i].b)) 1246 return; 1247 1248 keys = n2->keys; 1249 last = &r->b->key; 1250 } else 1251 for (k = n2->start; 1252 k < end(n2); 1253 k = bkey_next(k)) { 1254 if (__set_blocks(n1, n1->keys + keys + 1255 bkey_u64s(k), b->c) > blocks) 1256 break; 1257 1258 last = k; 1259 keys += bkey_u64s(k); 1260 } 1261 1262 BUG_ON(__set_blocks(n1, n1->keys + keys, 1263 b->c) > btree_blocks(r[i].b)); 1264 1265 if (last) { 1266 bkey_copy_key(&r[i].b->key, last); 1267 bkey_copy_key(r[i].k, last); 1268 } 1269 1270 memcpy(end(n1), 1271 n2->start, 1272 (void *) node(n2, keys) - (void *) n2->start); 1273 1274 n1->keys += keys; 1275 1276 memmove(n2->start, 1277 node(n2, keys), 1278 (void *) end(n2) - (void *) node(n2, keys)); 1279 1280 n2->keys -= keys; 1281 1282 r[i].keys = n1->keys; 1283 r[i - 1].keys = n2->keys; 1284 } 1285 1286 btree_node_free(r->b, op); 1287 up_write(&r->b->lock); 1288 1289 trace_bcache_btree_gc_coalesce(nodes); 1290 1291 gc->nodes--; 1292 nodes--; 1293 1294 memmove(&r[0], &r[1], sizeof(struct gc_merge_info) * nodes); 1295 memset(&r[nodes], 0, sizeof(struct gc_merge_info)); 1296 } 1297 1298 static int btree_gc_recurse(struct btree *b, struct btree_op *op, 1299 struct closure *writes, struct gc_stat *gc) 1300 { 1301 void write(struct btree *r) 1302 { 1303 if (!r->written) 1304 bch_btree_node_write(r, &op->cl); 1305 else if (btree_node_dirty(r)) 1306 bch_btree_node_write(r, writes); 1307 1308 up_write(&r->lock); 1309 } 1310 1311 int ret = 0, stale; 1312 unsigned i; 1313 struct gc_merge_info r[GC_MERGE_NODES]; 1314 1315 memset(r, 0, sizeof(r)); 1316 1317 while ((r->k = bch_next_recurse_key(b, &b->c->gc_done))) { 1318 r->b = bch_btree_node_get(b->c, r->k, b->level - 1, op); 1319 1320 if (IS_ERR(r->b)) { 1321 ret = PTR_ERR(r->b); 1322 break; 1323 } 1324 1325 r->keys = 0; 1326 stale = btree_gc_mark_node(r->b, &r->keys, gc); 1327 1328 if (!b->written && 1329 (r->b->level || stale > 10 || 1330 b->c->gc_always_rewrite)) 1331 r->b = btree_gc_alloc(r->b, r->k, op); 1332 1333 if (r->b->level) 1334 ret = btree_gc_recurse(r->b, op, writes, gc); 1335 1336 if (ret) { 1337 write(r->b); 1338 break; 1339 } 1340 1341 bkey_copy_key(&b->c->gc_done, r->k); 1342 1343 if (!b->written) 1344 btree_gc_coalesce(b, op, gc, r); 1345 1346 if (r[GC_MERGE_NODES - 1].b) 1347 write(r[GC_MERGE_NODES - 1].b); 1348 1349 memmove(&r[1], &r[0], 1350 sizeof(struct gc_merge_info) * (GC_MERGE_NODES - 1)); 1351 1352 /* When we've got incremental GC working, we'll want to do 1353 * if (should_resched()) 1354 * return -EAGAIN; 1355 */ 1356 cond_resched(); 1357 #if 0 1358 if (need_resched()) { 1359 ret = -EAGAIN; 1360 break; 1361 } 1362 #endif 1363 } 1364 1365 for (i = 1; i < GC_MERGE_NODES && r[i].b; i++) 1366 write(r[i].b); 1367 1368 /* Might have freed some children, must remove their keys */ 1369 if (!b->written) 1370 bch_btree_sort(b); 1371 1372 return ret; 1373 } 1374 1375 static int bch_btree_gc_root(struct btree *b, struct btree_op *op, 1376 struct closure *writes, struct gc_stat *gc) 1377 { 1378 struct btree *n = NULL; 1379 unsigned keys = 0; 1380 int ret = 0, stale = btree_gc_mark_node(b, &keys, gc); 1381 1382 if (b->level || stale > 10) 1383 n = btree_node_alloc_replacement(b, NULL); 1384 1385 if (!IS_ERR_OR_NULL(n)) 1386 swap(b, n); 1387 1388 
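	/*
	 * If a replacement node was allocated above, the swap() means b now
	 * points at the new node and n at the old root: we recurse into the
	 * children through b, write it out if needed, and only then install
	 * it as the new root and free the old node.
	 */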
if (b->level) 1389 ret = btree_gc_recurse(b, op, writes, gc); 1390 1391 if (!b->written || btree_node_dirty(b)) { 1392 bch_btree_node_write(b, n ? &op->cl : NULL); 1393 } 1394 1395 if (!IS_ERR_OR_NULL(n)) { 1396 closure_sync(&op->cl); 1397 bch_btree_set_root(b); 1398 btree_node_free(n, op); 1399 rw_unlock(true, b); 1400 } 1401 1402 return ret; 1403 } 1404 1405 static void btree_gc_start(struct cache_set *c) 1406 { 1407 struct cache *ca; 1408 struct bucket *b; 1409 unsigned i; 1410 1411 if (!c->gc_mark_valid) 1412 return; 1413 1414 mutex_lock(&c->bucket_lock); 1415 1416 c->gc_mark_valid = 0; 1417 c->gc_done = ZERO_KEY; 1418 1419 for_each_cache(ca, c, i) 1420 for_each_bucket(b, ca) { 1421 b->gc_gen = b->gen; 1422 if (!atomic_read(&b->pin)) { 1423 SET_GC_MARK(b, GC_MARK_RECLAIMABLE); 1424 SET_GC_SECTORS_USED(b, 0); 1425 } 1426 } 1427 1428 mutex_unlock(&c->bucket_lock); 1429 } 1430 1431 size_t bch_btree_gc_finish(struct cache_set *c) 1432 { 1433 size_t available = 0; 1434 struct bucket *b; 1435 struct cache *ca; 1436 unsigned i; 1437 1438 mutex_lock(&c->bucket_lock); 1439 1440 set_gc_sectors(c); 1441 c->gc_mark_valid = 1; 1442 c->need_gc = 0; 1443 1444 if (c->root) 1445 for (i = 0; i < KEY_PTRS(&c->root->key); i++) 1446 SET_GC_MARK(PTR_BUCKET(c, &c->root->key, i), 1447 GC_MARK_METADATA); 1448 1449 for (i = 0; i < KEY_PTRS(&c->uuid_bucket); i++) 1450 SET_GC_MARK(PTR_BUCKET(c, &c->uuid_bucket, i), 1451 GC_MARK_METADATA); 1452 1453 for_each_cache(ca, c, i) { 1454 uint64_t *i; 1455 1456 ca->invalidate_needs_gc = 0; 1457 1458 for (i = ca->sb.d; i < ca->sb.d + ca->sb.keys; i++) 1459 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1460 1461 for (i = ca->prio_buckets; 1462 i < ca->prio_buckets + prio_buckets(ca) * 2; i++) 1463 SET_GC_MARK(ca->buckets + *i, GC_MARK_METADATA); 1464 1465 for_each_bucket(b, ca) { 1466 b->last_gc = b->gc_gen; 1467 c->need_gc = max(c->need_gc, bucket_gc_gen(b)); 1468 1469 if (!atomic_read(&b->pin) && 1470 GC_MARK(b) == GC_MARK_RECLAIMABLE) { 1471 available++; 1472 if (!GC_SECTORS_USED(b)) 1473 bch_bucket_add_unused(ca, b); 1474 } 1475 } 1476 } 1477 1478 mutex_unlock(&c->bucket_lock); 1479 return available; 1480 } 1481 1482 static void bch_btree_gc(struct closure *cl) 1483 { 1484 struct cache_set *c = container_of(cl, struct cache_set, gc.cl); 1485 int ret; 1486 unsigned long available; 1487 struct gc_stat stats; 1488 struct closure writes; 1489 struct btree_op op; 1490 uint64_t start_time = local_clock(); 1491 1492 trace_bcache_gc_start(c); 1493 1494 memset(&stats, 0, sizeof(struct gc_stat)); 1495 closure_init_stack(&writes); 1496 bch_btree_op_init_stack(&op); 1497 op.lock = SHRT_MAX; 1498 1499 btree_gc_start(c); 1500 1501 atomic_inc(&c->prio_blocked); 1502 1503 ret = btree_root(gc_root, c, &op, &writes, &stats); 1504 closure_sync(&op.cl); 1505 closure_sync(&writes); 1506 1507 if (ret) { 1508 pr_warn("gc failed!"); 1509 continue_at(cl, bch_btree_gc, bch_gc_wq); 1510 } 1511 1512 /* Possibly wait for new UUIDs or whatever to hit disk */ 1513 bch_journal_meta(c, &op.cl); 1514 closure_sync(&op.cl); 1515 1516 available = bch_btree_gc_finish(c); 1517 1518 atomic_dec(&c->prio_blocked); 1519 wake_up_allocators(c); 1520 1521 bch_time_stats_update(&c->btree_gc_time, start_time); 1522 1523 stats.key_bytes *= sizeof(uint64_t); 1524 stats.dirty <<= 9; 1525 stats.data <<= 9; 1526 stats.in_use = (c->nbuckets - available) * 100 / c->nbuckets; 1527 memcpy(&c->gc_stats, &stats, sizeof(struct gc_stat)); 1528 1529 trace_bcache_gc_end(c); 1530 1531 continue_at(cl, bch_moving_gc, bch_gc_wq); 
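	/*
	 * continue_at() reschedules this closure onto bch_moving_gc and
	 * returns, so once the in-memory btree pass above finishes, moving
	 * gc runs on the same workqueue.
	 */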
1532 } 1533 1534 void bch_queue_gc(struct cache_set *c) 1535 { 1536 closure_trylock_call(&c->gc.cl, bch_btree_gc, bch_gc_wq, &c->cl); 1537 } 1538 1539 /* Initial partial gc */ 1540 1541 static int bch_btree_check_recurse(struct btree *b, struct btree_op *op, 1542 unsigned long **seen) 1543 { 1544 int ret; 1545 unsigned i; 1546 struct bkey *k; 1547 struct bucket *g; 1548 struct btree_iter iter; 1549 1550 for_each_key_filter(b, k, &iter, bch_ptr_invalid) { 1551 for (i = 0; i < KEY_PTRS(k); i++) { 1552 if (!ptr_available(b->c, k, i)) 1553 continue; 1554 1555 g = PTR_BUCKET(b->c, k, i); 1556 1557 if (!__test_and_set_bit(PTR_BUCKET_NR(b->c, k, i), 1558 seen[PTR_DEV(k, i)]) || 1559 !ptr_stale(b->c, k, i)) { 1560 g->gen = PTR_GEN(k, i); 1561 1562 if (b->level) 1563 g->prio = BTREE_PRIO; 1564 else if (g->prio == BTREE_PRIO) 1565 g->prio = INITIAL_PRIO; 1566 } 1567 } 1568 1569 btree_mark_key(b, k); 1570 } 1571 1572 if (b->level) { 1573 k = bch_next_recurse_key(b, &ZERO_KEY); 1574 1575 while (k) { 1576 struct bkey *p = bch_next_recurse_key(b, k); 1577 if (p) 1578 btree_node_prefetch(b->c, p, b->level - 1); 1579 1580 ret = btree(check_recurse, k, b, op, seen); 1581 if (ret) 1582 return ret; 1583 1584 k = p; 1585 } 1586 } 1587 1588 return 0; 1589 } 1590 1591 int bch_btree_check(struct cache_set *c, struct btree_op *op) 1592 { 1593 int ret = -ENOMEM; 1594 unsigned i; 1595 unsigned long *seen[MAX_CACHES_PER_SET]; 1596 1597 memset(seen, 0, sizeof(seen)); 1598 1599 for (i = 0; c->cache[i]; i++) { 1600 size_t n = DIV_ROUND_UP(c->cache[i]->sb.nbuckets, 8); 1601 seen[i] = kmalloc(n, GFP_KERNEL); 1602 if (!seen[i]) 1603 goto err; 1604 1605 /* Disables the seen array until prio_read() uses it too */ 1606 memset(seen[i], 0xFF, n); 1607 } 1608 1609 ret = btree_root(check_recurse, c, op, seen); 1610 err: 1611 for (i = 0; i < MAX_CACHES_PER_SET; i++) 1612 kfree(seen[i]); 1613 return ret; 1614 } 1615 1616 /* Btree insertion */ 1617 1618 static void shift_keys(struct btree *b, struct bkey *where, struct bkey *insert) 1619 { 1620 struct bset *i = b->sets[b->nsets].data; 1621 1622 memmove((uint64_t *) where + bkey_u64s(insert), 1623 where, 1624 (void *) end(i) - (void *) where); 1625 1626 i->keys += bkey_u64s(insert); 1627 bkey_copy(where, insert); 1628 bch_bset_fix_lookup_table(b, where); 1629 } 1630 1631 static bool fix_overlapping_extents(struct btree *b, 1632 struct bkey *insert, 1633 struct btree_iter *iter, 1634 struct btree_op *op) 1635 { 1636 void subtract_dirty(struct bkey *k, uint64_t offset, int sectors) 1637 { 1638 if (KEY_DIRTY(k)) 1639 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1640 offset, -sectors); 1641 } 1642 1643 uint64_t old_offset; 1644 unsigned old_size, sectors_found = 0; 1645 1646 while (1) { 1647 struct bkey *k = bch_btree_iter_next(iter); 1648 if (!k || 1649 bkey_cmp(&START_KEY(k), insert) >= 0) 1650 break; 1651 1652 if (bkey_cmp(k, &START_KEY(insert)) <= 0) 1653 continue; 1654 1655 old_offset = KEY_START(k); 1656 old_size = KEY_SIZE(k); 1657 1658 /* 1659 * We might overlap with 0 size extents; we can't skip these 1660 * because if they're in the set we're inserting to we have to 1661 * adjust them so they don't overlap with the key we're 1662 * inserting. But we don't want to check them for BTREE_REPLACE 1663 * operations. 
1664 */ 1665 1666 if (op->type == BTREE_REPLACE && 1667 KEY_SIZE(k)) { 1668 /* 1669 * k might have been split since we inserted/found the 1670 * key we're replacing 1671 */ 1672 unsigned i; 1673 uint64_t offset = KEY_START(k) - 1674 KEY_START(&op->replace); 1675 1676 /* But it must be a subset of the replace key */ 1677 if (KEY_START(k) < KEY_START(&op->replace) || 1678 KEY_OFFSET(k) > KEY_OFFSET(&op->replace)) 1679 goto check_failed; 1680 1681 /* We didn't find a key that we were supposed to */ 1682 if (KEY_START(k) > KEY_START(insert) + sectors_found) 1683 goto check_failed; 1684 1685 if (KEY_PTRS(&op->replace) != KEY_PTRS(k)) 1686 goto check_failed; 1687 1688 /* skip past gen */ 1689 offset <<= 8; 1690 1691 BUG_ON(!KEY_PTRS(&op->replace)); 1692 1693 for (i = 0; i < KEY_PTRS(&op->replace); i++) 1694 if (k->ptr[i] != op->replace.ptr[i] + offset) 1695 goto check_failed; 1696 1697 sectors_found = KEY_OFFSET(k) - KEY_START(insert); 1698 } 1699 1700 if (bkey_cmp(insert, k) < 0 && 1701 bkey_cmp(&START_KEY(insert), &START_KEY(k)) > 0) { 1702 /* 1703 * We overlapped in the middle of an existing key: that 1704 * means we have to split the old key. But we have to do 1705 * slightly different things depending on whether the 1706 * old key has been written out yet. 1707 */ 1708 1709 struct bkey *top; 1710 1711 subtract_dirty(k, KEY_START(insert), KEY_SIZE(insert)); 1712 1713 if (bkey_written(b, k)) { 1714 /* 1715 * We insert a new key to cover the top of the 1716 * old key, and the old key is modified in place 1717 * to represent the bottom split. 1718 * 1719 * It's completely arbitrary whether the new key 1720 * is the top or the bottom, but it has to match 1721 * up with what btree_sort_fixup() does - it 1722 * doesn't check for this kind of overlap, it 1723 * depends on us inserting a new key for the top 1724 * here. 
1725 */ 1726 top = bch_bset_search(b, &b->sets[b->nsets], 1727 insert); 1728 shift_keys(b, top, k); 1729 } else { 1730 BKEY_PADDED(key) temp; 1731 bkey_copy(&temp.key, k); 1732 shift_keys(b, k, &temp.key); 1733 top = bkey_next(k); 1734 } 1735 1736 bch_cut_front(insert, top); 1737 bch_cut_back(&START_KEY(insert), k); 1738 bch_bset_fix_invalidated_key(b, k); 1739 return false; 1740 } 1741 1742 if (bkey_cmp(insert, k) < 0) { 1743 bch_cut_front(insert, k); 1744 } else { 1745 if (bkey_written(b, k) && 1746 bkey_cmp(&START_KEY(insert), &START_KEY(k)) <= 0) { 1747 /* 1748 * Completely overwrote, so we don't have to 1749 * invalidate the binary search tree 1750 */ 1751 bch_cut_front(k, k); 1752 } else { 1753 __bch_cut_back(&START_KEY(insert), k); 1754 bch_bset_fix_invalidated_key(b, k); 1755 } 1756 } 1757 1758 subtract_dirty(k, old_offset, old_size - KEY_SIZE(k)); 1759 } 1760 1761 check_failed: 1762 if (op->type == BTREE_REPLACE) { 1763 if (!sectors_found) { 1764 op->insert_collision = true; 1765 return true; 1766 } else if (sectors_found < KEY_SIZE(insert)) { 1767 SET_KEY_OFFSET(insert, KEY_OFFSET(insert) - 1768 (KEY_SIZE(insert) - sectors_found)); 1769 SET_KEY_SIZE(insert, sectors_found); 1770 } 1771 } 1772 1773 return false; 1774 } 1775 1776 static bool btree_insert_key(struct btree *b, struct btree_op *op, 1777 struct bkey *k) 1778 { 1779 struct bset *i = b->sets[b->nsets].data; 1780 struct bkey *m, *prev; 1781 unsigned status = BTREE_INSERT_STATUS_INSERT; 1782 1783 BUG_ON(bkey_cmp(k, &b->key) > 0); 1784 BUG_ON(b->level && !KEY_PTRS(k)); 1785 BUG_ON(!b->level && !KEY_OFFSET(k)); 1786 1787 if (!b->level) { 1788 struct btree_iter iter; 1789 struct bkey search = KEY(KEY_INODE(k), KEY_START(k), 0); 1790 1791 /* 1792 * bset_search() returns the first key that is strictly greater 1793 * than the search key - but for back merging, we want to find 1794 * the first key that is greater than or equal to KEY_START(k) - 1795 * unless KEY_START(k) is 0. 
1796 */ 1797 if (KEY_OFFSET(&search)) 1798 SET_KEY_OFFSET(&search, KEY_OFFSET(&search) - 1); 1799 1800 prev = NULL; 1801 m = bch_btree_iter_init(b, &iter, &search); 1802 1803 if (fix_overlapping_extents(b, k, &iter, op)) 1804 return false; 1805 1806 while (m != end(i) && 1807 bkey_cmp(k, &START_KEY(m)) > 0) 1808 prev = m, m = bkey_next(m); 1809 1810 if (key_merging_disabled(b->c)) 1811 goto insert; 1812 1813 /* prev is in the tree, if we merge we're done */ 1814 status = BTREE_INSERT_STATUS_BACK_MERGE; 1815 if (prev && 1816 bch_bkey_try_merge(b, prev, k)) 1817 goto merged; 1818 1819 status = BTREE_INSERT_STATUS_OVERWROTE; 1820 if (m != end(i) && 1821 KEY_PTRS(m) == KEY_PTRS(k) && !KEY_SIZE(m)) 1822 goto copy; 1823 1824 status = BTREE_INSERT_STATUS_FRONT_MERGE; 1825 if (m != end(i) && 1826 bch_bkey_try_merge(b, k, m)) 1827 goto copy; 1828 } else 1829 m = bch_bset_search(b, &b->sets[b->nsets], k); 1830 1831 insert: shift_keys(b, m, k); 1832 copy: bkey_copy(m, k); 1833 merged: 1834 if (KEY_DIRTY(k)) 1835 bcache_dev_sectors_dirty_add(b->c, KEY_INODE(k), 1836 KEY_START(k), KEY_SIZE(k)); 1837 1838 bch_check_keys(b, "%u for %s", status, op_type(op)); 1839 1840 if (b->level && !KEY_OFFSET(k)) 1841 btree_current_write(b)->prio_blocked++; 1842 1843 trace_bcache_btree_insert_key(b, k, op->type, status); 1844 1845 return true; 1846 } 1847 1848 static bool bch_btree_insert_keys(struct btree *b, struct btree_op *op) 1849 { 1850 bool ret = false; 1851 struct bkey *k; 1852 unsigned oldsize = bch_count_data(b); 1853 1854 while ((k = bch_keylist_pop(&op->keys))) { 1855 bkey_put(b->c, k, b->level); 1856 ret |= btree_insert_key(b, op, k); 1857 } 1858 1859 BUG_ON(bch_count_data(b) < oldsize); 1860 return ret; 1861 } 1862 1863 bool bch_btree_insert_check_key(struct btree *b, struct btree_op *op, 1864 struct bio *bio) 1865 { 1866 bool ret = false; 1867 uint64_t btree_ptr = b->key.ptr[0]; 1868 unsigned long seq = b->seq; 1869 BKEY_PADDED(k) tmp; 1870 1871 rw_unlock(false, b); 1872 rw_lock(true, b, b->level); 1873 1874 if (b->key.ptr[0] != btree_ptr || 1875 b->seq != seq + 1 || 1876 should_split(b)) 1877 goto out; 1878 1879 op->replace = KEY(op->inode, bio_end_sector(bio), bio_sectors(bio)); 1880 1881 SET_KEY_PTRS(&op->replace, 1); 1882 get_random_bytes(&op->replace.ptr[0], sizeof(uint64_t)); 1883 1884 SET_PTR_DEV(&op->replace, 0, PTR_CHECK_DEV); 1885 1886 bkey_copy(&tmp.k, &op->replace); 1887 1888 BUG_ON(op->type != BTREE_INSERT); 1889 BUG_ON(!btree_insert_key(b, op, &tmp.k)); 1890 ret = true; 1891 out: 1892 downgrade_write(&b->lock); 1893 return ret; 1894 } 1895 1896 static int btree_split(struct btree *b, struct btree_op *op) 1897 { 1898 bool split, root = b == b->c->root; 1899 struct btree *n1, *n2 = NULL, *n3 = NULL; 1900 uint64_t start_time = local_clock(); 1901 1902 if (b->level) 1903 set_closure_blocking(&op->cl); 1904 1905 n1 = btree_node_alloc_replacement(b, &op->cl); 1906 if (IS_ERR(n1)) 1907 goto err; 1908 1909 split = set_blocks(n1->sets[0].data, n1->c) > (btree_blocks(b) * 4) / 5; 1910 1911 if (split) { 1912 unsigned keys = 0; 1913 1914 trace_bcache_btree_node_split(b, n1->sets[0].data->keys); 1915 1916 n2 = bch_btree_node_alloc(b->c, b->level, &op->cl); 1917 if (IS_ERR(n2)) 1918 goto err_free1; 1919 1920 if (root) { 1921 n3 = bch_btree_node_alloc(b->c, b->level + 1, &op->cl); 1922 if (IS_ERR(n3)) 1923 goto err_free2; 1924 } 1925 1926 bch_btree_insert_keys(n1, op); 1927 1928 /* Has to be a linear search because we don't have an auxiliary 1929 * search tree yet 1930 */ 1931 1932 while (keys < 
(n1->sets[0].data->keys * 3) / 5) 1933 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1934 1935 bkey_copy_key(&n1->key, node(n1->sets[0].data, keys)); 1936 keys += bkey_u64s(node(n1->sets[0].data, keys)); 1937 1938 n2->sets[0].data->keys = n1->sets[0].data->keys - keys; 1939 n1->sets[0].data->keys = keys; 1940 1941 memcpy(n2->sets[0].data->start, 1942 end(n1->sets[0].data), 1943 n2->sets[0].data->keys * sizeof(uint64_t)); 1944 1945 bkey_copy_key(&n2->key, &b->key); 1946 1947 bch_keylist_add(&op->keys, &n2->key); 1948 bch_btree_node_write(n2, &op->cl); 1949 rw_unlock(true, n2); 1950 } else { 1951 trace_bcache_btree_node_compact(b, n1->sets[0].data->keys); 1952 1953 bch_btree_insert_keys(n1, op); 1954 } 1955 1956 bch_keylist_add(&op->keys, &n1->key); 1957 bch_btree_node_write(n1, &op->cl); 1958 1959 if (n3) { 1960 bkey_copy_key(&n3->key, &MAX_KEY); 1961 bch_btree_insert_keys(n3, op); 1962 bch_btree_node_write(n3, &op->cl); 1963 1964 closure_sync(&op->cl); 1965 bch_btree_set_root(n3); 1966 rw_unlock(true, n3); 1967 } else if (root) { 1968 op->keys.top = op->keys.bottom; 1969 closure_sync(&op->cl); 1970 bch_btree_set_root(n1); 1971 } else { 1972 unsigned i; 1973 1974 bkey_copy(op->keys.top, &b->key); 1975 bkey_copy_key(op->keys.top, &ZERO_KEY); 1976 1977 for (i = 0; i < KEY_PTRS(&b->key); i++) { 1978 uint8_t g = PTR_BUCKET(b->c, &b->key, i)->gen + 1; 1979 1980 SET_PTR_GEN(op->keys.top, i, g); 1981 } 1982 1983 bch_keylist_push(&op->keys); 1984 closure_sync(&op->cl); 1985 atomic_inc(&b->c->prio_blocked); 1986 } 1987 1988 rw_unlock(true, n1); 1989 btree_node_free(b, op); 1990 1991 bch_time_stats_update(&b->c->btree_split_time, start_time); 1992 1993 return 0; 1994 err_free2: 1995 __bkey_put(n2->c, &n2->key); 1996 btree_node_free(n2, op); 1997 rw_unlock(true, n2); 1998 err_free1: 1999 __bkey_put(n1->c, &n1->key); 2000 btree_node_free(n1, op); 2001 rw_unlock(true, n1); 2002 err: 2003 if (n3 == ERR_PTR(-EAGAIN) || 2004 n2 == ERR_PTR(-EAGAIN) || 2005 n1 == ERR_PTR(-EAGAIN)) 2006 return -EAGAIN; 2007 2008 pr_warn("couldn't split"); 2009 return -ENOMEM; 2010 } 2011 2012 static int bch_btree_insert_recurse(struct btree *b, struct btree_op *op, 2013 struct keylist *stack_keys) 2014 { 2015 if (b->level) { 2016 int ret; 2017 struct bkey *insert = op->keys.bottom; 2018 struct bkey *k = bch_next_recurse_key(b, &START_KEY(insert)); 2019 2020 if (!k) { 2021 btree_bug(b, "no key to recurse on at level %i/%i", 2022 b->level, b->c->root->level); 2023 2024 op->keys.top = op->keys.bottom; 2025 return -EIO; 2026 } 2027 2028 if (bkey_cmp(insert, k) > 0) { 2029 unsigned i; 2030 2031 if (op->type == BTREE_REPLACE) { 2032 __bkey_put(b->c, insert); 2033 op->keys.top = op->keys.bottom; 2034 op->insert_collision = true; 2035 return 0; 2036 } 2037 2038 for (i = 0; i < KEY_PTRS(insert); i++) 2039 atomic_inc(&PTR_BUCKET(b->c, insert, i)->pin); 2040 2041 bkey_copy(stack_keys->top, insert); 2042 2043 bch_cut_back(k, insert); 2044 bch_cut_front(k, stack_keys->top); 2045 2046 bch_keylist_push(stack_keys); 2047 } 2048 2049 ret = btree(insert_recurse, k, b, op, stack_keys); 2050 if (ret) 2051 return ret; 2052 } 2053 2054 if (!bch_keylist_empty(&op->keys)) { 2055 if (should_split(b)) { 2056 if (op->lock <= b->c->root->level) { 2057 BUG_ON(b->level); 2058 op->lock = b->c->root->level + 1; 2059 return -EINTR; 2060 } 2061 return btree_split(b, op); 2062 } 2063 2064 BUG_ON(write_block(b) != b->sets[b->nsets].data); 2065 2066 if (bch_btree_insert_keys(b, op)) { 2067 if (!b->level) 2068 bch_btree_leaf_dirty(b, op); 2069 else 2070 
bch_btree_node_write(b, &op->cl); 2071 } 2072 } 2073 2074 return 0; 2075 } 2076 2077 int bch_btree_insert(struct btree_op *op, struct cache_set *c) 2078 { 2079 int ret = 0; 2080 struct keylist stack_keys; 2081 2082 /* 2083 * Don't want to block with the btree locked unless we have to, 2084 * otherwise we get deadlocks with try_harder and between split/gc 2085 */ 2086 clear_closure_blocking(&op->cl); 2087 2088 BUG_ON(bch_keylist_empty(&op->keys)); 2089 bch_keylist_copy(&stack_keys, &op->keys); 2090 bch_keylist_init(&op->keys); 2091 2092 while (!bch_keylist_empty(&stack_keys) || 2093 !bch_keylist_empty(&op->keys)) { 2094 if (bch_keylist_empty(&op->keys)) { 2095 bch_keylist_add(&op->keys, 2096 bch_keylist_pop(&stack_keys)); 2097 op->lock = 0; 2098 } 2099 2100 ret = btree_root(insert_recurse, c, op, &stack_keys); 2101 2102 if (ret == -EAGAIN) { 2103 ret = 0; 2104 closure_sync(&op->cl); 2105 } else if (ret) { 2106 struct bkey *k; 2107 2108 pr_err("error %i trying to insert key for %s", 2109 ret, op_type(op)); 2110 2111 while ((k = bch_keylist_pop(&stack_keys) ?: 2112 bch_keylist_pop(&op->keys))) 2113 bkey_put(c, k, 0); 2114 } 2115 } 2116 2117 bch_keylist_free(&stack_keys); 2118 2119 if (op->journal) 2120 atomic_dec_bug(op->journal); 2121 op->journal = NULL; 2122 return ret; 2123 } 2124 2125 void bch_btree_set_root(struct btree *b) 2126 { 2127 unsigned i; 2128 struct closure cl; 2129 2130 closure_init_stack(&cl); 2131 2132 trace_bcache_btree_set_root(b); 2133 2134 BUG_ON(!b->written); 2135 2136 for (i = 0; i < KEY_PTRS(&b->key); i++) 2137 BUG_ON(PTR_BUCKET(b->c, &b->key, i)->prio != BTREE_PRIO); 2138 2139 mutex_lock(&b->c->bucket_lock); 2140 list_del_init(&b->list); 2141 mutex_unlock(&b->c->bucket_lock); 2142 2143 b->c->root = b; 2144 __bkey_put(b->c, &b->key); 2145 2146 bch_journal_meta(b->c, &cl); 2147 closure_sync(&cl); 2148 } 2149 2150 /* Cache lookup */ 2151 2152 static int submit_partial_cache_miss(struct btree *b, struct btree_op *op, 2153 struct bkey *k) 2154 { 2155 struct search *s = container_of(op, struct search, op); 2156 struct bio *bio = &s->bio.bio; 2157 int ret = 0; 2158 2159 while (!ret && 2160 !op->lookup_done) { 2161 unsigned sectors = INT_MAX; 2162 2163 if (KEY_INODE(k) == op->inode) { 2164 if (KEY_START(k) <= bio->bi_sector) 2165 break; 2166 2167 sectors = min_t(uint64_t, sectors, 2168 KEY_START(k) - bio->bi_sector); 2169 } 2170 2171 ret = s->d->cache_miss(b, s, bio, sectors); 2172 } 2173 2174 return ret; 2175 } 2176 2177 /* 2178 * Read from a single key, handling the initial cache miss if the key starts in 2179 * the middle of the bio 2180 */ 2181 static int submit_partial_cache_hit(struct btree *b, struct btree_op *op, 2182 struct bkey *k) 2183 { 2184 struct search *s = container_of(op, struct search, op); 2185 struct bio *bio = &s->bio.bio; 2186 unsigned ptr; 2187 struct bio *n; 2188 2189 int ret = submit_partial_cache_miss(b, op, k); 2190 if (ret || op->lookup_done) 2191 return ret; 2192 2193 /* XXX: figure out best pointer - for multiple cache devices */ 2194 ptr = 0; 2195 2196 PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO; 2197 2198 while (!op->lookup_done && 2199 KEY_INODE(k) == op->inode && 2200 bio->bi_sector < KEY_OFFSET(k)) { 2201 struct bkey *bio_key; 2202 sector_t sector = PTR_OFFSET(k, ptr) + 2203 (bio->bi_sector - KEY_START(k)); 2204 unsigned sectors = min_t(uint64_t, INT_MAX, 2205 KEY_OFFSET(k) - bio->bi_sector); 2206 2207 n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split); 2208 if (n == bio) 2209 op->lookup_done = true; 2210 2211 bio_key = 

/*
 * Read from a single key, handling the initial cache miss if the key starts in
 * the middle of the bio
 */
static int submit_partial_cache_hit(struct btree *b, struct btree_op *op,
				    struct bkey *k)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *bio = &s->bio.bio;
	unsigned ptr;
	struct bio *n;

	int ret = submit_partial_cache_miss(b, op, k);
	if (ret || op->lookup_done)
		return ret;

	/* XXX: figure out best pointer - for multiple cache devices */
	ptr = 0;

	PTR_BUCKET(b->c, k, ptr)->prio = INITIAL_PRIO;

	while (!op->lookup_done &&
	       KEY_INODE(k) == op->inode &&
	       bio->bi_sector < KEY_OFFSET(k)) {
		struct bkey *bio_key;
		sector_t sector = PTR_OFFSET(k, ptr) +
			(bio->bi_sector - KEY_START(k));
		unsigned sectors = min_t(uint64_t, INT_MAX,
					 KEY_OFFSET(k) - bio->bi_sector);

		n = bch_bio_split(bio, sectors, GFP_NOIO, s->d->bio_split);
		if (n == bio)
			op->lookup_done = true;

		bio_key = &container_of(n, struct bbio, bio)->key;

		/*
		 * The bucket we're reading from might be reused while our bio
		 * is in flight, and we could then end up reading the wrong
		 * data.
		 *
		 * We guard against this by checking (in cache_read_endio()) if
		 * the pointer is stale again; if so, we treat it as an error
		 * and reread from the backing device (but we don't pass that
		 * error up anywhere).
		 */

		bch_bkey_copy_single_ptr(bio_key, k, ptr);
		SET_PTR_OFFSET(bio_key, 0, sector);

		n->bi_end_io = bch_cache_read_endio;
		n->bi_private = &s->cl;

		__bch_submit_bbio(n, b->c);
	}

	return 0;
}

int bch_btree_search_recurse(struct btree *b, struct btree_op *op)
{
	struct search *s = container_of(op, struct search, op);
	struct bio *bio = &s->bio.bio;

	int ret = 0;
	struct bkey *k;
	struct btree_iter iter;
	bch_btree_iter_init(b, &iter, &KEY(op->inode, bio->bi_sector, 0));

	do {
		k = bch_btree_iter_next_filter(&iter, b, bch_ptr_bad);
		if (!k) {
			/*
			 * b->key would be exactly what we want, except that
			 * pointers to btree nodes have nonzero size - we
			 * wouldn't go far enough
			 */

			ret = submit_partial_cache_miss(b, op,
					&KEY(KEY_INODE(&b->key),
					     KEY_OFFSET(&b->key), 0));
			break;
		}

		ret = b->level
			? btree(search_recurse, k, b, op)
			: submit_partial_cache_hit(b, op, k);
	} while (!ret &&
		 !op->lookup_done);

	return ret;
}

/* Keybuf code */

static inline int keybuf_cmp(struct keybuf_key *l, struct keybuf_key *r)
{
	/* Overlapping keys compare equal */
	if (bkey_cmp(&l->key, &START_KEY(&r->key)) <= 0)
		return -1;
	if (bkey_cmp(&START_KEY(&l->key), &r->key) >= 0)
		return 1;
	return 0;
}

static inline int keybuf_nonoverlapping_cmp(struct keybuf_key *l,
					    struct keybuf_key *r)
{
	return clamp_t(int64_t, bkey_cmp(&l->key, &r->key), -1, 1);
}

static int bch_btree_refill_keybuf(struct btree *b, struct btree_op *op,
				   struct keybuf *buf, struct bkey *end,
				   keybuf_pred_fn *pred)
{
	struct btree_iter iter;
	bch_btree_iter_init(b, &iter, &buf->last_scanned);

	while (!array_freelist_empty(&buf->freelist)) {
		struct bkey *k = bch_btree_iter_next_filter(&iter, b,
							    bch_ptr_bad);

		if (!b->level) {
			if (!k) {
				buf->last_scanned = b->key;
				break;
			}

			buf->last_scanned = *k;
			if (bkey_cmp(&buf->last_scanned, end) >= 0)
				break;

			if (pred(buf, k)) {
				struct keybuf_key *w;

				spin_lock(&buf->lock);

				w = array_alloc(&buf->freelist);

				w->private = NULL;
				bkey_copy(&w->key, k);

				if (RB_INSERT(&buf->keys, w, node, keybuf_cmp))
					array_free(&buf->freelist, w);

				spin_unlock(&buf->lock);
			}
		} else {
			if (!k)
				break;

			btree(refill_keybuf, k, b, op, buf, end, pred);
			/*
			 * Might get an error here, but can't really do anything
			 * and it'll get logged elsewhere. Just read what we
			 * can.
			 */

			if (bkey_cmp(&buf->last_scanned, end) >= 0)
				break;

			cond_resched();
		}
	}

	return 0;
}
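
/*
 * The refill scan above only buffers keys the caller's predicate accepts.  A
 * minimal predicate -- a sketch, assuming keybuf_pred_fn matches how pred()
 * is invoked above, with pick_dirty being a purely illustrative name --
 * might be:
 *
 *	static bool pick_dirty(struct keybuf *buf, struct bkey *k)
 *	{
 *		return KEY_DIRTY(k);	// e.g. writeback only wants dirty keys
 *	}
 */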
2333 */ 2334 2335 if (bkey_cmp(&buf->last_scanned, end) >= 0) 2336 break; 2337 2338 cond_resched(); 2339 } 2340 } 2341 2342 return 0; 2343 } 2344 2345 void bch_refill_keybuf(struct cache_set *c, struct keybuf *buf, 2346 struct bkey *end, keybuf_pred_fn *pred) 2347 { 2348 struct bkey start = buf->last_scanned; 2349 struct btree_op op; 2350 bch_btree_op_init_stack(&op); 2351 2352 cond_resched(); 2353 2354 btree_root(refill_keybuf, c, &op, buf, end, pred); 2355 closure_sync(&op.cl); 2356 2357 pr_debug("found %s keys from %llu:%llu to %llu:%llu", 2358 RB_EMPTY_ROOT(&buf->keys) ? "no" : 2359 array_freelist_empty(&buf->freelist) ? "some" : "a few", 2360 KEY_INODE(&start), KEY_OFFSET(&start), 2361 KEY_INODE(&buf->last_scanned), KEY_OFFSET(&buf->last_scanned)); 2362 2363 spin_lock(&buf->lock); 2364 2365 if (!RB_EMPTY_ROOT(&buf->keys)) { 2366 struct keybuf_key *w; 2367 w = RB_FIRST(&buf->keys, struct keybuf_key, node); 2368 buf->start = START_KEY(&w->key); 2369 2370 w = RB_LAST(&buf->keys, struct keybuf_key, node); 2371 buf->end = w->key; 2372 } else { 2373 buf->start = MAX_KEY; 2374 buf->end = MAX_KEY; 2375 } 2376 2377 spin_unlock(&buf->lock); 2378 } 2379 2380 static void __bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) 2381 { 2382 rb_erase(&w->node, &buf->keys); 2383 array_free(&buf->freelist, w); 2384 } 2385 2386 void bch_keybuf_del(struct keybuf *buf, struct keybuf_key *w) 2387 { 2388 spin_lock(&buf->lock); 2389 __bch_keybuf_del(buf, w); 2390 spin_unlock(&buf->lock); 2391 } 2392 2393 bool bch_keybuf_check_overlapping(struct keybuf *buf, struct bkey *start, 2394 struct bkey *end) 2395 { 2396 bool ret = false; 2397 struct keybuf_key *p, *w, s; 2398 s.key = *start; 2399 2400 if (bkey_cmp(end, &buf->start) <= 0 || 2401 bkey_cmp(start, &buf->end) >= 0) 2402 return false; 2403 2404 spin_lock(&buf->lock); 2405 w = RB_GREATER(&buf->keys, s, node, keybuf_nonoverlapping_cmp); 2406 2407 while (w && bkey_cmp(&START_KEY(&w->key), end) < 0) { 2408 p = w; 2409 w = RB_NEXT(w, node); 2410 2411 if (p->private) 2412 ret = true; 2413 else 2414 __bch_keybuf_del(buf, p); 2415 } 2416 2417 spin_unlock(&buf->lock); 2418 return ret; 2419 } 2420 2421 struct keybuf_key *bch_keybuf_next(struct keybuf *buf) 2422 { 2423 struct keybuf_key *w; 2424 spin_lock(&buf->lock); 2425 2426 w = RB_FIRST(&buf->keys, struct keybuf_key, node); 2427 2428 while (w && w->private) 2429 w = RB_NEXT(w, node); 2430 2431 if (w) 2432 w->private = ERR_PTR(-EINTR); 2433 2434 spin_unlock(&buf->lock); 2435 return w; 2436 } 2437 2438 struct keybuf_key *bch_keybuf_next_rescan(struct cache_set *c, 2439 struct keybuf *buf, 2440 struct bkey *end, 2441 keybuf_pred_fn *pred) 2442 { 2443 struct keybuf_key *ret; 2444 2445 while (1) { 2446 ret = bch_keybuf_next(buf); 2447 if (ret) 2448 break; 2449 2450 if (bkey_cmp(&buf->last_scanned, end) >= 0) { 2451 pr_debug("scan finished"); 2452 break; 2453 } 2454 2455 bch_refill_keybuf(c, buf, end, pred); 2456 } 2457 2458 return ret; 2459 } 2460 2461 void bch_keybuf_init(struct keybuf *buf) 2462 { 2463 buf->last_scanned = MAX_KEY; 2464 buf->keys = RB_ROOT; 2465 2466 spin_lock_init(&buf->lock); 2467 array_allocator_init(&buf->freelist); 2468 } 2469 2470 void bch_btree_exit(void) 2471 { 2472 if (btree_io_wq) 2473 destroy_workqueue(btree_io_wq); 2474 if (bch_gc_wq) 2475 destroy_workqueue(bch_gc_wq); 2476 } 2477 2478 int __init bch_btree_init(void) 2479 { 2480 if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) || 2481 !(btree_io_wq = create_singlethread_workqueue("bch_btree_io"))) 2482 return -ENOMEM; 

void bch_btree_exit(void)
{
	if (btree_io_wq)
		destroy_workqueue(btree_io_wq);
	if (bch_gc_wq)
		destroy_workqueue(bch_gc_wq);
}

int __init bch_btree_init(void)
{
	if (!(bch_gc_wq = create_singlethread_workqueue("bch_btree_gc")) ||
	    !(btree_io_wq = create_singlethread_workqueue("bch_btree_io")))
		return -ENOMEM;

	return 0;
}
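
/*
 * bch_btree_init()/bch_btree_exit() are expected to be called from the
 * driver's module init and exit paths (a sketch, not the actual sequencing):
 *
 *	if (bch_btree_init())
 *		goto err;	// unwind whatever was set up before this point
 *
 * Since bch_btree_exit() checks each workqueue pointer before destroying it,
 * it is also safe to call after a partially failed bch_btree_init().
 */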