/*
 * bcache setup/teardown code, and some metadata io - read a superblock and
 * figure out what to do with it.
 *
 * Copyright 2010, 2011 Kent Overstreet <kent.overstreet@gmail.com>
 * Copyright 2012 Google, Inc.
 */

#include "bcache.h"
#include "btree.h"
#include "debug.h"
#include "request.h"
#include "writeback.h"

#include <linux/blkdev.h>
#include <linux/buffer_head.h>
#include <linux/debugfs.h>
#include <linux/genhd.h>
#include <linux/module.h>
#include <linux/random.h>
#include <linux/reboot.h>
#include <linux/sysfs.h>

MODULE_LICENSE("GPL");
MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>");

static const char bcache_magic[] = {
	0xc6, 0x85, 0x73, 0xf6, 0x4e, 0x1a, 0x45, 0xca,
	0x82, 0x65, 0xf5, 0x7f, 0x48, 0xba, 0x6d, 0x81
};

static const char invalid_uuid[] = {
	0xa0, 0x3e, 0xf8, 0xed, 0x3e, 0xe1, 0xb8, 0x78,
	0xc8, 0x50, 0xfc, 0x5e, 0xcb, 0x16, 0xcd, 0x99
};

/* Default is -1; we skip past it for struct cached_dev's cache mode */
const char * const bch_cache_modes[] = {
	"default",
	"writethrough",
	"writeback",
	"writearound",
	"none",
	NULL
};

struct uuid_entry_v0 {
	uint8_t		uuid[16];
	uint8_t		label[32];
	uint32_t	first_reg;
	uint32_t	last_reg;
	uint32_t	invalidated;
	uint32_t	pad;
};

static struct kobject *bcache_kobj;
struct mutex bch_register_lock;
LIST_HEAD(bch_cache_sets);
static LIST_HEAD(uncached_devices);

static int bcache_major, bcache_minor;
static wait_queue_head_t unregister_wait;
struct workqueue_struct *bcache_wq;

#define BTREE_MAX_PAGES		(256 * 1024 / PAGE_SIZE)

static void bio_split_pool_free(struct bio_split_pool *p)
{
	if (p->bio_split_hook)
		mempool_destroy(p->bio_split_hook);

	if (p->bio_split)
		bioset_free(p->bio_split);
}

static int bio_split_pool_init(struct bio_split_pool *p)
{
	p->bio_split = bioset_create(4, 0);
	if (!p->bio_split)
		return -ENOMEM;

	p->bio_split_hook = mempool_create_kmalloc_pool(4,
				sizeof(struct bio_split_hook));
	if (!p->bio_split_hook)
		return -ENOMEM;

	return 0;
}

/* Superblock */

static const char *read_super(struct cache_sb *sb, struct block_device *bdev,
			      struct page **res)
{
	const char *err;
	struct cache_sb *s;
	struct buffer_head *bh = __bread(bdev, 1, SB_SIZE);
	unsigned i;

	if (!bh)
		return "IO error";

	s = (struct cache_sb *) bh->b_data;

	sb->offset = le64_to_cpu(s->offset);
	sb->version = le64_to_cpu(s->version);

	memcpy(sb->magic, s->magic, 16);
	memcpy(sb->uuid, s->uuid, 16);
	memcpy(sb->set_uuid, s->set_uuid, 16);
	memcpy(sb->label, s->label, SB_LABEL_SIZE);

	sb->flags = le64_to_cpu(s->flags);
	sb->seq = le64_to_cpu(s->seq);
	sb->last_mount = le32_to_cpu(s->last_mount);
	sb->first_bucket = le16_to_cpu(s->first_bucket);
	sb->keys = le16_to_cpu(s->keys);

	for (i = 0; i < SB_JOURNAL_BUCKETS; i++)
		sb->d[i] = le64_to_cpu(s->d[i]);

	pr_debug("read sb version %llu, flags %llu, seq %llu, journal size %u",
		 sb->version, sb->flags, sb->seq, sb->keys);

	err = "Not a bcache superblock";
	if (sb->offset != SB_SECTOR)
		goto err;

	if (memcmp(sb->magic, bcache_magic, 16))
		goto err;

	err = "Too many journal buckets";
	if (sb->keys > SB_JOURNAL_BUCKETS)
		goto err;

	err = "Bad checksum";
	if (s->csum != csum_set(s))
		goto err;

	err = "Bad UUID";
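	/*
	 * An all-zero UUID is reserved: it marks an unused slot in the cache
	 * set's uuid array (see uuid_find_empty()), so it can never identify
	 * a real cache or backing device.
	 */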
	if (bch_is_zero(sb->uuid, 16))
		goto err;

	sb->block_size = le16_to_cpu(s->block_size);

	err = "Superblock block size smaller than device block size";
	if (sb->block_size << 9 < bdev_logical_block_size(bdev))
		goto err;

	switch (sb->version) {
	case BCACHE_SB_VERSION_BDEV:
		sb->data_offset = BDEV_DATA_START_DEFAULT;
		break;
	case BCACHE_SB_VERSION_BDEV_WITH_OFFSET:
		sb->data_offset = le64_to_cpu(s->data_offset);

		err = "Bad data offset";
		if (sb->data_offset < BDEV_DATA_START_DEFAULT)
			goto err;

		break;
	case BCACHE_SB_VERSION_CDEV:
	case BCACHE_SB_VERSION_CDEV_WITH_UUID:
		sb->nbuckets = le64_to_cpu(s->nbuckets);
		sb->block_size = le16_to_cpu(s->block_size);
		sb->bucket_size = le16_to_cpu(s->bucket_size);

		sb->nr_in_set = le16_to_cpu(s->nr_in_set);
		sb->nr_this_dev = le16_to_cpu(s->nr_this_dev);

		err = "Too many buckets";
		if (sb->nbuckets > LONG_MAX)
			goto err;

		err = "Not enough buckets";
		if (sb->nbuckets < 1 << 7)
			goto err;

		err = "Bad block/bucket size";
		if (!is_power_of_2(sb->block_size) ||
		    sb->block_size > PAGE_SECTORS ||
		    !is_power_of_2(sb->bucket_size) ||
		    sb->bucket_size < PAGE_SECTORS)
			goto err;

		err = "Invalid superblock: device too small";
		if (get_capacity(bdev->bd_disk) < sb->bucket_size * sb->nbuckets)
			goto err;

		err = "Bad UUID";
		if (bch_is_zero(sb->set_uuid, 16))
			goto err;

		err = "Bad cache device number in set";
		if (!sb->nr_in_set ||
		    sb->nr_in_set <= sb->nr_this_dev ||
		    sb->nr_in_set > MAX_CACHES_PER_SET)
			goto err;

		err = "Journal buckets not sequential";
		for (i = 0; i < sb->keys; i++)
			if (sb->d[i] != sb->first_bucket + i)
				goto err;

		err = "Too many journal buckets";
		if (sb->first_bucket + sb->keys > sb->nbuckets)
			goto err;

		err = "Invalid superblock: first bucket comes before end of super";
		if (sb->first_bucket * sb->bucket_size < 16)
			goto err;

		break;
	default:
		err = "Unsupported superblock version";
		goto err;
	}

	sb->last_mount = get_seconds();
	err = NULL;

	get_page(bh->b_page);
	*res = bh->b_page;
err:
	put_bh(bh);
	return err;
}

static void write_bdev_super_endio(struct bio *bio, int error)
{
	struct cached_dev *dc = bio->bi_private;
	/* XXX: error checking */

	closure_put(&dc->sb_write.cl);
}

static void __write_super(struct cache_sb *sb, struct bio *bio)
{
	struct cache_sb *out = page_address(bio->bi_io_vec[0].bv_page);
	unsigned i;

	bio->bi_sector = SB_SECTOR;
	bio->bi_rw = REQ_SYNC|REQ_META;
	bio->bi_size = SB_SIZE;
	bch_bio_map(bio, NULL);

	out->offset = cpu_to_le64(sb->offset);
	out->version = cpu_to_le64(sb->version);

	memcpy(out->uuid, sb->uuid, 16);
	memcpy(out->set_uuid, sb->set_uuid, 16);
	memcpy(out->label, sb->label, SB_LABEL_SIZE);

	out->flags = cpu_to_le64(sb->flags);
	out->seq = cpu_to_le64(sb->seq);

	out->last_mount = cpu_to_le32(sb->last_mount);
	out->first_bucket = cpu_to_le16(sb->first_bucket);
	out->keys = cpu_to_le16(sb->keys);

	for (i = 0; i < sb->keys; i++)
		out->d[i] = cpu_to_le64(sb->d[i]);

	out->csum = csum_set(out);

	pr_debug("ver %llu, flags %llu, seq %llu",
		 sb->version, sb->flags, sb->seq);

	submit_bio(REQ_WRITE, bio);
}
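
/*
 * Both superblock writers (bch_write_bdev_super() for backing devices and
 * bcache_write_super() for cache devices) go through __write_super(), which
 * serializes on the owner's sb_write closure and submits a REQ_SYNC|REQ_META
 * write of SB_SECTOR.
 */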
void bch_write_bdev_super(struct cached_dev *dc, struct closure *parent)
{
	struct closure *cl = &dc->sb_write.cl;
	struct bio *bio = &dc->sb_bio;

	closure_lock(&dc->sb_write, parent);

	bio_reset(bio);
	bio->bi_bdev = dc->bdev;
	bio->bi_end_io = write_bdev_super_endio;
	bio->bi_private = dc;

	closure_get(cl);
	__write_super(&dc->sb, bio);

	closure_return(cl);
}

static void write_super_endio(struct bio *bio, int error)
{
	struct cache *ca = bio->bi_private;

	bch_count_io_errors(ca, error, "writing superblock");
	closure_put(&ca->set->sb_write.cl);
}

void bcache_write_super(struct cache_set *c)
{
	struct closure *cl = &c->sb_write.cl;
	struct cache *ca;
	unsigned i;

	closure_lock(&c->sb_write, &c->cl);

	c->sb.seq++;

	for_each_cache(ca, c, i) {
		struct bio *bio = &ca->sb_bio;

		ca->sb.version = BCACHE_SB_VERSION_CDEV_WITH_UUID;
		ca->sb.seq = c->sb.seq;
		ca->sb.last_mount = c->sb.last_mount;

		SET_CACHE_SYNC(&ca->sb, CACHE_SYNC(&c->sb));

		bio_reset(bio);
		bio->bi_bdev = ca->bdev;
		bio->bi_end_io = write_super_endio;
		bio->bi_private = ca;

		closure_get(cl);
		__write_super(&ca->sb, bio);
	}

	closure_return(cl);
}

/* UUID io */

static void uuid_endio(struct bio *bio, int error)
{
	struct closure *cl = bio->bi_private;
	struct cache_set *c = container_of(cl, struct cache_set, uuid_write.cl);

	cache_set_err_on(error, c, "accessing uuids");
	bch_bbio_free(bio, c);
	closure_put(cl);
}

static void uuid_io(struct cache_set *c, unsigned long rw,
		    struct bkey *k, struct closure *parent)
{
	struct closure *cl = &c->uuid_write.cl;
	struct uuid_entry *u;
	unsigned i;
	char buf[80];

	BUG_ON(!parent);
	closure_lock(&c->uuid_write, parent);

	for (i = 0; i < KEY_PTRS(k); i++) {
		struct bio *bio = bch_bbio_alloc(c);

		bio->bi_rw = REQ_SYNC|REQ_META|rw;
		bio->bi_size = KEY_SIZE(k) << 9;

		bio->bi_end_io = uuid_endio;
		bio->bi_private = cl;
		bch_bio_map(bio, c->uuids);

		bch_submit_bbio(bio, c, k, i);

		if (!(rw & WRITE))
			break;
	}

	bch_bkey_to_text(buf, sizeof(buf), k);
	pr_debug("%s UUIDs at %s", rw & REQ_WRITE ? "wrote" : "read", buf);

	for (u = c->uuids; u < c->uuids + c->nr_uuids; u++)
		if (!bch_is_zero(u->uuid, 16))
			pr_debug("Slot %zi: %pU: %s: 1st: %u last: %u inv: %u",
				 u - c->uuids, u->uuid, u->label,
				 u->first_reg, u->last_reg, u->invalidated);

	closure_return(cl);
}

static char *uuid_read(struct cache_set *c, struct jset *j, struct closure *cl)
{
	struct bkey *k = &j->uuid_bucket;

	if (__bch_ptr_invalid(c, 1, k))
		return "bad uuid pointer";

	bkey_copy(&c->uuid_bucket, k);
	uuid_io(c, READ_SYNC, k, cl);

	if (j->version < BCACHE_JSET_VERSION_UUIDv1) {
		struct uuid_entry_v0 *u0 = (void *) c->uuids;
		struct uuid_entry *u1 = (void *) c->uuids;
		int i;

		closure_sync(cl);

		/*
		 * Since the new uuid entry is bigger than the old, we have to
		 * convert starting at the highest memory address and work down
		 * in order to do it in place
		 */

		for (i = c->nr_uuids - 1;
		     i >= 0;
		     --i) {
			memcpy(u1[i].uuid, u0[i].uuid, 16);
			memcpy(u1[i].label, u0[i].label, 32);

			u1[i].first_reg = u0[i].first_reg;
			u1[i].last_reg = u0[i].last_reg;
			u1[i].invalidated = u0[i].invalidated;

			u1[i].flags = 0;
			u1[i].sectors = 0;
		}
	}

	return NULL;
}

static int __uuid_write(struct cache_set *c)
{
	BKEY_PADDED(key) k;
	struct closure cl;
	closure_init_stack(&cl);

	lockdep_assert_held(&bch_register_lock);

	if (bch_bucket_alloc_set(c, WATERMARK_METADATA, &k.key, 1, &cl))
		return 1;

	SET_KEY_SIZE(&k.key, c->sb.bucket_size);
	uuid_io(c, REQ_WRITE, &k.key, &cl);
	closure_sync(&cl);

	bkey_copy(&c->uuid_bucket, &k.key);
	__bkey_put(c, &k.key);
	return 0;
}

int bch_uuid_write(struct cache_set *c)
{
	int ret = __uuid_write(c);

	if (!ret)
		bch_journal_meta(c, NULL);

	return ret;
}

static struct uuid_entry *uuid_find(struct cache_set *c, const char *uuid)
{
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids; u++)
		if (!memcmp(u->uuid, uuid, 16))
			return u;

	return NULL;
}

static struct uuid_entry *uuid_find_empty(struct cache_set *c)
{
	static const char zero_uuid[16] = "\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0\0";
	return uuid_find(c, zero_uuid);
}

/*
 * Bucket priorities/gens:
 *
 * For each bucket, we store on disk its
 *    8 bit gen
 *   16 bit priority
 *
 * See alloc.c for an explanation of the gen. The priority is used to implement
 * lru (and in the future other) cache replacement policies; for most purposes
 * it's just an opaque integer.
 *
 * The gens and the priorities don't have a whole lot to do with each other,
 * and it's actually the gens that must be written out at specific times - it's
 * no big deal if the priorities don't get written, if we lose them we just
 * reuse buckets in suboptimal order.
 *
 * On disk they're stored in a packed array, and in as many buckets as are
 * required to fit them all. The buckets we use to store them form a list; the
 * journal header points to the first bucket, the first bucket points to the
 * second bucket, et cetera.
 *
 * This code is used by the allocation code; periodically (whenever it runs out
 * of buckets to allocate from) the allocation code will invalidate some
 * buckets, but it can't use those buckets until their new gens are safely on
 * disk.
 */

static void prio_endio(struct bio *bio, int error)
{
	struct cache *ca = bio->bi_private;

	cache_set_err_on(error, ca->set, "accessing priorities");
	bch_bbio_free(bio, ca->set);
	closure_put(&ca->prio);
}

static void prio_io(struct cache *ca, uint64_t bucket, unsigned long rw)
{
	struct closure *cl = &ca->prio;
	struct bio *bio = bch_bbio_alloc(ca->set);

	closure_init_stack(cl);

	bio->bi_sector = bucket * ca->sb.bucket_size;
	bio->bi_bdev = ca->bdev;
	bio->bi_rw = REQ_SYNC|REQ_META|rw;
	bio->bi_size = bucket_bytes(ca);

	bio->bi_end_io = prio_endio;
	bio->bi_private = ca;
	bch_bio_map(bio, ca->disk_buckets);

	closure_bio_submit(bio, &ca->prio, ca);
	closure_sync(cl);
}

#define buckets_free(c)	"free %zu, free_inc %zu, unused %zu",		\
	fifo_used(&c->free), fifo_used(&c->free_inc), fifo_used(&c->unused)

void bch_prio_write(struct cache *ca)
{
	int i;
	struct bucket *b;
	struct closure cl;

	closure_init_stack(&cl);

	lockdep_assert_held(&ca->set->bucket_lock);

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets; b++)
		b->disk_gen = b->gen;

	ca->disk_buckets->seq++;

	atomic_long_add(ca->sb.bucket_size * prio_buckets(ca),
			&ca->meta_sectors_written);

	pr_debug("free %zu, free_inc %zu, unused %zu", fifo_used(&ca->free),
		 fifo_used(&ca->free_inc), fifo_used(&ca->unused));

	for (i = prio_buckets(ca) - 1; i >= 0; --i) {
		long bucket;
		struct prio_set *p = ca->disk_buckets;
		struct bucket_disk *d = p->data;
		struct bucket_disk *end = d + prios_per_bucket(ca);

		for (b = ca->buckets + i * prios_per_bucket(ca);
		     b < ca->buckets + ca->sb.nbuckets && d < end;
		     b++, d++) {
			d->prio = cpu_to_le16(b->prio);
			d->gen = b->gen;
		}

		p->next_bucket = ca->prio_buckets[i + 1];
		p->magic = pset_magic(ca);
		p->csum = bch_crc64(&p->magic, bucket_bytes(ca) - 8);

		bucket = bch_bucket_alloc(ca, WATERMARK_PRIO, &cl);
		BUG_ON(bucket == -1);

		mutex_unlock(&ca->set->bucket_lock);
		prio_io(ca, bucket, REQ_WRITE);
		mutex_lock(&ca->set->bucket_lock);

		ca->prio_buckets[i] = bucket;
		atomic_dec_bug(&ca->buckets[bucket].pin);
	}

	mutex_unlock(&ca->set->bucket_lock);

	bch_journal_meta(ca->set, &cl);
	closure_sync(&cl);

	mutex_lock(&ca->set->bucket_lock);

	ca->need_save_prio = 0;

	/*
	 * Don't want the old priorities to get garbage collected until after
	 * we finish writing the new ones, and they're journalled
	 */
	for (i = 0; i < prio_buckets(ca); i++)
		ca->prio_last_buckets[i] = ca->prio_buckets[i];
}

static void prio_read(struct cache *ca, uint64_t bucket)
{
	struct prio_set *p = ca->disk_buckets;
	struct bucket_disk *d = p->data + prios_per_bucket(ca), *end = d;
	struct bucket *b;
	unsigned bucket_nr = 0;

	for (b = ca->buckets;
	     b < ca->buckets + ca->sb.nbuckets;
	     b++, d++) {
		if (d == end) {
			ca->prio_buckets[bucket_nr] = bucket;
			ca->prio_last_buckets[bucket_nr] = bucket;
			bucket_nr++;

			prio_io(ca, bucket, READ_SYNC);

			if (p->csum != bch_crc64(&p->magic, bucket_bytes(ca) - 8))
				pr_warn("bad csum reading priorities");

			if (p->magic != pset_magic(ca))
				pr_warn("bad magic reading priorities");

			bucket = p->next_bucket;
			d = p->data;
		}

		b->prio = le16_to_cpu(d->prio);
		b->gen = b->disk_gen = b->last_gc = b->gc_gen = d->gen;
	}
}

/* Bcache device */

static int open_dev(struct block_device *b, fmode_t mode)
{
	struct bcache_device *d = b->bd_disk->private_data;
	if (atomic_read(&d->closing))
		return -ENXIO;

	closure_get(&d->cl);
	return 0;
}

static void release_dev(struct gendisk *b, fmode_t mode)
{
	struct bcache_device *d = b->private_data;
	closure_put(&d->cl);
}

static int ioctl_dev(struct block_device *b, fmode_t mode,
		     unsigned int cmd, unsigned long arg)
{
	struct bcache_device *d = b->bd_disk->private_data;
	return d->ioctl(d, mode, cmd, arg);
}

static const struct block_device_operations bcache_ops = {
	.open		= open_dev,
	.release	= release_dev,
	.ioctl		= ioctl_dev,
	.owner		= THIS_MODULE,
};

void bcache_device_stop(struct bcache_device *d)
{
	if (!atomic_xchg(&d->closing, 1))
		closure_queue(&d->cl);
}

static void bcache_device_unlink(struct bcache_device *d)
{
	unsigned i;
	struct cache *ca;

	sysfs_remove_link(&d->c->kobj, d->name);
	sysfs_remove_link(&d->kobj, "cache");

	for_each_cache(ca, d->c, i)
		bd_unlink_disk_holder(ca->bdev, d->disk);
}

static void bcache_device_link(struct bcache_device *d, struct cache_set *c,
			       const char *name)
{
	unsigned i;
	struct cache *ca;

	for_each_cache(ca, d->c, i)
		bd_link_disk_holder(ca->bdev, d->disk);

	snprintf(d->name, BCACHEDEVNAME_SIZE,
		 "%s%u", name, d->id);

	WARN(sysfs_create_link(&d->kobj, &c->kobj, "cache") ||
	     sysfs_create_link(&c->kobj, &d->kobj, d->name),
	     "Couldn't create device <-> cache set symlinks");
}

static void bcache_device_detach(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	if (atomic_read(&d->detaching)) {
		struct uuid_entry *u = d->c->uuids + d->id;

		SET_UUID_FLASH_ONLY(u, 0);
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		bch_uuid_write(d->c);

		atomic_set(&d->detaching, 0);
	}

	bcache_device_unlink(d);

	d->c->devices[d->id] = NULL;
	closure_put(&d->c->caching);
	d->c = NULL;
}

static void bcache_device_attach(struct bcache_device *d, struct cache_set *c,
				 unsigned id)
{
	BUG_ON(test_bit(CACHE_SET_STOPPING, &c->flags));

	d->id = id;
	d->c = c;
	c->devices[id] = d;

	closure_get(&c->caching);
}

static void bcache_device_free(struct bcache_device *d)
{
	lockdep_assert_held(&bch_register_lock);

	pr_info("%s stopped", d->disk->disk_name);

	if (d->c)
		bcache_device_detach(d);
	if (d->disk && d->disk->flags & GENHD_FL_UP)
		del_gendisk(d->disk);
	if (d->disk && d->disk->queue)
		blk_cleanup_queue(d->disk->queue);
	if (d->disk)
		put_disk(d->disk);

	bio_split_pool_free(&d->bio_split_hook);
	if (d->unaligned_bvec)
		mempool_destroy(d->unaligned_bvec);
	if (d->bio_split)
		bioset_free(d->bio_split);
	if (is_vmalloc_addr(d->stripe_sectors_dirty))
		vfree(d->stripe_sectors_dirty);
	else
		kfree(d->stripe_sectors_dirty);

	closure_debug_destroy(&d->cl);
}

static int bcache_device_init(struct bcache_device *d, unsigned block_size,
			      sector_t sectors)
{
	struct request_queue *q;
	size_t n;

	if (!d->stripe_size_bits)
		d->stripe_size_bits = 31;
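
	/*
	 * The device is divided into stripes of 1 << stripe_size_bits
	 * sectors; stripe_sectors_dirty below holds one atomic counter per
	 * stripe, which the writeback code uses to track how many sectors in
	 * each stripe are dirty.
	 */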
	d->nr_stripes = round_up(sectors, 1 << d->stripe_size_bits) >>
		d->stripe_size_bits;

	if (!d->nr_stripes || d->nr_stripes > SIZE_MAX / sizeof(atomic_t))
		return -ENOMEM;

	n = d->nr_stripes * sizeof(atomic_t);
	d->stripe_sectors_dirty = n < PAGE_SIZE << 6
		? kzalloc(n, GFP_KERNEL)
		: vzalloc(n);
	if (!d->stripe_sectors_dirty)
		return -ENOMEM;

	if (!(d->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(d->unaligned_bvec = mempool_create_kmalloc_pool(1,
				sizeof(struct bio_vec) * BIO_MAX_PAGES)) ||
	    bio_split_pool_init(&d->bio_split_hook) ||
	    !(d->disk = alloc_disk(1)) ||
	    !(q = blk_alloc_queue(GFP_KERNEL)))
		return -ENOMEM;

	set_capacity(d->disk, sectors);
	snprintf(d->disk->disk_name, DISK_NAME_LEN, "bcache%i", bcache_minor);

	d->disk->major = bcache_major;
	d->disk->first_minor = bcache_minor++;
	d->disk->fops = &bcache_ops;
	d->disk->private_data = d;

	blk_queue_make_request(q, NULL);
	d->disk->queue = q;
	q->queuedata = d;
	q->backing_dev_info.congested_data = d;
	q->limits.max_hw_sectors = UINT_MAX;
	q->limits.max_sectors = UINT_MAX;
	q->limits.max_segment_size = UINT_MAX;
	q->limits.max_segments = BIO_MAX_PAGES;
	q->limits.max_discard_sectors = UINT_MAX;
	q->limits.io_min = block_size;
	q->limits.logical_block_size = block_size;
	q->limits.physical_block_size = block_size;
	set_bit(QUEUE_FLAG_NONROT, &d->disk->queue->queue_flags);
	set_bit(QUEUE_FLAG_DISCARD, &d->disk->queue->queue_flags);

	blk_queue_flush(q, REQ_FLUSH|REQ_FUA);

	return 0;
}

/* Cached device */

static void calc_cached_dev_sectors(struct cache_set *c)
{
	uint64_t sectors = 0;
	struct cached_dev *dc;

	list_for_each_entry(dc, &c->cached_devs, list)
		sectors += bdev_sectors(dc->bdev);

	c->cached_dev_sectors = sectors;
}

void bch_cached_dev_run(struct cached_dev *dc)
{
	struct bcache_device *d = &dc->disk;
	char buf[SB_LABEL_SIZE + 1];
	char *env[] = {
		"DRIVER=bcache",
		kasprintf(GFP_KERNEL, "CACHED_UUID=%pU", dc->sb.uuid),
		NULL,
		NULL,
	};

	memcpy(buf, dc->sb.label, SB_LABEL_SIZE);
	buf[SB_LABEL_SIZE] = '\0';
	env[2] = kasprintf(GFP_KERNEL, "CACHED_LABEL=%s", buf);

	if (atomic_xchg(&dc->running, 1))
		return;

	if (!d->c &&
	    BDEV_STATE(&dc->sb) != BDEV_STATE_NONE) {
		struct closure cl;
		closure_init_stack(&cl);

		SET_BDEV_STATE(&dc->sb, BDEV_STATE_STALE);
		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	}

	add_disk(d->disk);
	bd_link_disk_holder(dc->bdev, dc->disk.disk);
	/* won't show up in the uevent file, use udevadm monitor -e instead
	 * only class / kset properties are persistent */
	kobject_uevent_env(&disk_to_dev(d->disk)->kobj, KOBJ_CHANGE, env);
	kfree(env[1]);
	kfree(env[2]);

	if (sysfs_create_link(&d->kobj, &disk_to_dev(d->disk)->kobj, "dev") ||
	    sysfs_create_link(&disk_to_dev(d->disk)->kobj, &d->kobj, "bcache"))
		pr_debug("error creating sysfs link");
}

static void cached_dev_detach_finish(struct work_struct *w)
{
	struct cached_dev *dc = container_of(w, struct cached_dev, detach);
	char buf[BDEVNAME_SIZE];
	struct closure cl;
	closure_init_stack(&cl);

	BUG_ON(!atomic_read(&dc->disk.detaching));
	BUG_ON(atomic_read(&dc->count));

	mutex_lock(&bch_register_lock);

	memset(&dc->sb.set_uuid, 0, 16);
	SET_BDEV_STATE(&dc->sb, BDEV_STATE_NONE);

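	/*
	 * With the set UUID cleared and the state back to BDEV_STATE_NONE,
	 * the superblock written below no longer identifies a cache set, so
	 * the device won't be auto-attached the next time it's registered.
	 */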
	bch_write_bdev_super(dc, &cl);
	closure_sync(&cl);

	bcache_device_detach(&dc->disk);
	list_move(&dc->list, &uncached_devices);

	mutex_unlock(&bch_register_lock);

	pr_info("Caching disabled for %s", bdevname(dc->bdev, buf));

	/* Drop ref we took in cached_dev_detach() */
	closure_put(&dc->disk.cl);
}

void bch_cached_dev_detach(struct cached_dev *dc)
{
	lockdep_assert_held(&bch_register_lock);

	if (atomic_read(&dc->disk.closing))
		return;

	if (atomic_xchg(&dc->disk.detaching, 1))
		return;

	/*
	 * Block the device from being closed and freed until we're finished
	 * detaching
	 */
	closure_get(&dc->disk.cl);

	bch_writeback_queue(dc);
	cached_dev_put(dc);
}

int bch_cached_dev_attach(struct cached_dev *dc, struct cache_set *c)
{
	uint32_t rtime = cpu_to_le32(get_seconds());
	struct uuid_entry *u;
	char buf[BDEVNAME_SIZE];

	bdevname(dc->bdev, buf);

	if (memcmp(dc->sb.set_uuid, c->sb.set_uuid, 16))
		return -ENOENT;

	if (dc->disk.c) {
		pr_err("Can't attach %s: already attached", buf);
		return -EINVAL;
	}

	if (test_bit(CACHE_SET_STOPPING, &c->flags)) {
		pr_err("Can't attach %s: shutting down", buf);
		return -EINVAL;
	}

	if (dc->sb.block_size < c->sb.block_size) {
		/* Will die */
		pr_err("Couldn't attach %s: block size less than set's block size",
		       buf);
		return -EINVAL;
	}

	u = uuid_find(c, dc->sb.uuid);

	if (u &&
	    (BDEV_STATE(&dc->sb) == BDEV_STATE_STALE ||
	     BDEV_STATE(&dc->sb) == BDEV_STATE_NONE)) {
		memcpy(u->uuid, invalid_uuid, 16);
		u->invalidated = cpu_to_le32(get_seconds());
		u = NULL;
	}

	if (!u) {
		if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
			pr_err("Couldn't find uuid for %s in set", buf);
			return -ENOENT;
		}

		u = uuid_find_empty(c);
		if (!u) {
			pr_err("Not caching %s, no room for UUID", buf);
			return -EINVAL;
		}
	}

	/* Deadlocks since we're called via sysfs...
	sysfs_remove_file(&dc->kobj, &sysfs_attach);
	 */

	if (bch_is_zero(u->uuid, 16)) {
		struct closure cl;
		closure_init_stack(&cl);

		memcpy(u->uuid, dc->sb.uuid, 16);
		memcpy(u->label, dc->sb.label, SB_LABEL_SIZE);
		u->first_reg = u->last_reg = rtime;
		bch_uuid_write(c);

		memcpy(dc->sb.set_uuid, c->sb.set_uuid, 16);
		SET_BDEV_STATE(&dc->sb, BDEV_STATE_CLEAN);

		bch_write_bdev_super(dc, &cl);
		closure_sync(&cl);
	} else {
		u->last_reg = rtime;
		bch_uuid_write(c);
	}

	bcache_device_attach(&dc->disk, c, u - c->uuids);
	list_move(&dc->list, &c->cached_devs);
	calc_cached_dev_sectors(c);

	smp_wmb();
	/*
	 * dc->c must be set before dc->count != 0 - paired with the mb in
	 * cached_dev_get()
	 */
	atomic_set(&dc->count, 1);

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_DIRTY) {
		bch_sectors_dirty_init(dc);
		atomic_set(&dc->has_dirty, 1);
		atomic_inc(&dc->count);
		bch_writeback_queue(dc);
	}

	bch_cached_dev_run(dc);
	bcache_device_link(&dc->disk, c, "bdev");

	pr_info("Caching %s as %s on set %pU",
		bdevname(dc->bdev, buf), dc->disk.disk->disk_name,
		dc->disk.c->sb.set_uuid);
	return 0;
}

void bch_cached_dev_release(struct kobject *kobj)
{
	struct cached_dev *dc = container_of(kobj, struct cached_dev,
					     disk.kobj);
	kfree(dc);
	module_put(THIS_MODULE);
}

static void cached_dev_free(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);

	cancel_delayed_work_sync(&dc->writeback_rate_update);

	mutex_lock(&bch_register_lock);

	if (atomic_read(&dc->running))
		bd_unlink_disk_holder(dc->bdev, dc->disk.disk);
	bcache_device_free(&dc->disk);
	list_del(&dc->list);

	mutex_unlock(&bch_register_lock);

	if (!IS_ERR_OR_NULL(dc->bdev)) {
		if (dc->bdev->bd_disk)
			blk_sync_queue(bdev_get_queue(dc->bdev));

		blkdev_put(dc->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
	}

	wake_up(&unregister_wait);

	kobject_put(&dc->disk.kobj);
}

static void cached_dev_flush(struct closure *cl)
{
	struct cached_dev *dc = container_of(cl, struct cached_dev, disk.cl);
	struct bcache_device *d = &dc->disk;

	bch_cache_accounting_destroy(&dc->accounting);
	kobject_del(&d->kobj);

	continue_at(cl, cached_dev_free, system_wq);
}

static int cached_dev_init(struct cached_dev *dc, unsigned block_size)
{
	int ret;
	struct io *io;
	struct request_queue *q = bdev_get_queue(dc->bdev);

	__module_get(THIS_MODULE);
	INIT_LIST_HEAD(&dc->list);
	closure_init(&dc->disk.cl, NULL);
	set_closure_fn(&dc->disk.cl, cached_dev_flush, system_wq);
	kobject_init(&dc->disk.kobj, &bch_cached_dev_ktype);
	INIT_WORK(&dc->detach, cached_dev_detach_finish);
	closure_init_unlocked(&dc->sb_write);
	INIT_LIST_HEAD(&dc->io_lru);
	spin_lock_init(&dc->io_lock);
	bch_cache_accounting_init(&dc->accounting, &dc->disk.cl);

	dc->sequential_merge = true;
	dc->sequential_cutoff = 4 << 20;

	for (io = dc->io; io < dc->io + RECENT_IO; io++) {
		list_add(&io->lru, &dc->io_lru);
		hlist_add_head(&io->hash, dc->io_hash + RECENT_IO);
	}

	ret = bcache_device_init(&dc->disk, block_size,
			dc->bdev->bd_part->nr_sects - dc->sb.data_offset);
	if (ret)
		return ret;

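	/*
	 * The bcache device exposes the backing partition minus
	 * sb.data_offset, i.e. minus the sectors reserved at the start of the
	 * backing device for the bcache superblock.
	 */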
	set_capacity(dc->disk.disk,
		     dc->bdev->bd_part->nr_sects - dc->sb.data_offset);

	dc->disk.disk->queue->backing_dev_info.ra_pages =
		max(dc->disk.disk->queue->backing_dev_info.ra_pages,
		    q->backing_dev_info.ra_pages);

	bch_cached_dev_request_init(dc);
	bch_cached_dev_writeback_init(dc);
	return 0;
}

/* Cached device - bcache superblock */

static void register_bdev(struct cache_sb *sb, struct page *sb_page,
			  struct block_device *bdev,
			  struct cached_dev *dc)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	memcpy(&dc->sb, sb, sizeof(struct cache_sb));
	dc->bdev = bdev;
	dc->bdev->bd_holder = dc;

	bio_init(&dc->sb_bio);
	dc->sb_bio.bi_max_vecs = 1;
	dc->sb_bio.bi_io_vec = dc->sb_bio.bi_inline_vecs;
	dc->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (cached_dev_init(dc, sb->block_size << 9))
		goto err;

	err = "error creating kobject";
	if (kobject_add(&dc->disk.kobj, &part_to_dev(bdev->bd_part)->kobj,
			"bcache"))
		goto err;
	if (bch_cache_accounting_add_kobjs(&dc->accounting, &dc->disk.kobj))
		goto err;

	pr_info("registered backing device %s", bdevname(bdev, name));

	list_add(&dc->list, &uncached_devices);
	list_for_each_entry(c, &bch_cache_sets, list)
		bch_cached_dev_attach(dc, c);

	if (BDEV_STATE(&dc->sb) == BDEV_STATE_NONE ||
	    BDEV_STATE(&dc->sb) == BDEV_STATE_STALE)
		bch_cached_dev_run(dc);

	return;
err:
	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
	bcache_device_stop(&dc->disk);
}

/* Flash only volumes */

void bch_flash_dev_release(struct kobject *kobj)
{
	struct bcache_device *d = container_of(kobj, struct bcache_device,
					       kobj);
	kfree(d);
}

static void flash_dev_free(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);
	bcache_device_free(d);
	kobject_put(&d->kobj);
}

static void flash_dev_flush(struct closure *cl)
{
	struct bcache_device *d = container_of(cl, struct bcache_device, cl);

	bcache_device_unlink(d);
	kobject_del(&d->kobj);
	continue_at(cl, flash_dev_free, system_wq);
}

static int flash_dev_run(struct cache_set *c, struct uuid_entry *u)
{
	struct bcache_device *d = kzalloc(sizeof(struct bcache_device),
					  GFP_KERNEL);
	if (!d)
		return -ENOMEM;

	closure_init(&d->cl, NULL);
	set_closure_fn(&d->cl, flash_dev_flush, system_wq);

	kobject_init(&d->kobj, &bch_flash_dev_ktype);

	if (bcache_device_init(d, block_bytes(c), u->sectors))
		goto err;

	bcache_device_attach(d, c, u - c->uuids);
	bch_flash_dev_request_init(d);
	add_disk(d->disk);

	if (kobject_add(&d->kobj, &disk_to_dev(d->disk)->kobj, "bcache"))
		goto err;

	bcache_device_link(d, c, "volume");

	return 0;
err:
	kobject_put(&d->kobj);
	return -ENOMEM;
}

static int flash_devs_run(struct cache_set *c)
{
	int ret = 0;
	struct uuid_entry *u;

	for (u = c->uuids;
	     u < c->uuids + c->nr_uuids && !ret;
	     u++)
		if (UUID_FLASH_ONLY(u))
			ret = flash_dev_run(c, u);

	return ret;
}

int bch_flash_dev_create(struct cache_set *c, uint64_t size)
{
	struct uuid_entry *u;
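
	/*
	 * Flash-only volumes are backed entirely by the cache set; they are
	 * represented purely by a uuid_entry (random UUID, empty label,
	 * UUID_FLASH_ONLY set) rather than by a backing device superblock.
	 */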
	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return -EINTR;

	u = uuid_find_empty(c);
	if (!u) {
		pr_err("Can't create volume, no room for UUID");
		return -EINVAL;
	}

	get_random_bytes(u->uuid, 16);
	memset(u->label, 0, 32);
	u->first_reg = u->last_reg = cpu_to_le32(get_seconds());

	SET_UUID_FLASH_ONLY(u, 1);
	u->sectors = size >> 9;

	bch_uuid_write(c);

	return flash_dev_run(c, u);
}

/* Cache set */

__printf(2, 3)
bool bch_cache_set_error(struct cache_set *c, const char *fmt, ...)
{
	va_list args;

	if (test_bit(CACHE_SET_STOPPING, &c->flags))
		return false;

	/* XXX: we can be called from atomic context
	acquire_console_sem();
	*/

	printk(KERN_ERR "bcache: error on %pU: ", c->sb.set_uuid);

	va_start(args, fmt);
	vprintk(fmt, args);
	va_end(args);

	printk(", disabling caching\n");

	bch_cache_set_unregister(c);
	return true;
}

void bch_cache_set_release(struct kobject *kobj)
{
	struct cache_set *c = container_of(kobj, struct cache_set, kobj);
	kfree(c);
	module_put(THIS_MODULE);
}

static void cache_set_free(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, cl);
	struct cache *ca;
	unsigned i;

	if (!IS_ERR_OR_NULL(c->debug))
		debugfs_remove(c->debug);

	bch_open_buckets_free(c);
	bch_btree_cache_free(c);
	bch_journal_free(c);

	for_each_cache(ca, c, i)
		if (ca)
			kobject_put(&ca->kobj);

	free_pages((unsigned long) c->uuids, ilog2(bucket_pages(c)));
	free_pages((unsigned long) c->sort, ilog2(bucket_pages(c)));

	if (c->bio_split)
		bioset_free(c->bio_split);
	if (c->fill_iter)
		mempool_destroy(c->fill_iter);
	if (c->bio_meta)
		mempool_destroy(c->bio_meta);
	if (c->search)
		mempool_destroy(c->search);
	kfree(c->devices);

	mutex_lock(&bch_register_lock);
	list_del(&c->list);
	mutex_unlock(&bch_register_lock);

	pr_info("Cache set %pU unregistered", c->sb.set_uuid);
	wake_up(&unregister_wait);

	closure_debug_destroy(&c->cl);
	kobject_put(&c->kobj);
}

static void cache_set_flush(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct btree *b;

	/* Shut down allocator threads */
	set_bit(CACHE_SET_STOPPING_2, &c->flags);
	wake_up_allocators(c);

	bch_cache_accounting_destroy(&c->accounting);

	kobject_put(&c->internal);
	kobject_del(&c->kobj);

	if (!IS_ERR_OR_NULL(c->root))
		list_add(&c->root->list, &c->btree_cache);

	/* Should skip this if we're unregistering because of an error */
	list_for_each_entry(b, &c->btree_cache, list)
		if (btree_node_dirty(b))
			bch_btree_node_write(b, NULL);

	closure_return(cl);
}

static void __cache_set_unregister(struct closure *cl)
{
	struct cache_set *c = container_of(cl, struct cache_set, caching);
	struct cached_dev *dc, *t;
	size_t i;

	mutex_lock(&bch_register_lock);

	if (test_bit(CACHE_SET_UNREGISTERING, &c->flags))
		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
			bch_cached_dev_detach(dc);

	for (i = 0; i < c->nr_uuids; i++)
		if (c->devices[i] && UUID_FLASH_ONLY(&c->uuids[i]))
			bcache_device_stop(c->devices[i]);

	mutex_unlock(&bch_register_lock);

	continue_at(cl, cache_set_flush, system_wq);
}

void bch_cache_set_stop(struct cache_set *c)
{
	if (!test_and_set_bit(CACHE_SET_STOPPING, &c->flags))
		closure_queue(&c->caching);
}

void bch_cache_set_unregister(struct cache_set *c)
{
	set_bit(CACHE_SET_UNREGISTERING, &c->flags);
	bch_cache_set_stop(c);
}

#define alloc_bucket_pages(gfp, c)			\
	((void *) __get_free_pages(__GFP_ZERO|gfp, ilog2(bucket_pages(c))))

struct cache_set *bch_cache_set_alloc(struct cache_sb *sb)
{
	int iter_size;
	struct cache_set *c = kzalloc(sizeof(struct cache_set), GFP_KERNEL);
	if (!c)
		return NULL;

	__module_get(THIS_MODULE);
	closure_init(&c->cl, NULL);
	set_closure_fn(&c->cl, cache_set_free, system_wq);

	closure_init(&c->caching, &c->cl);
	set_closure_fn(&c->caching, __cache_set_unregister, system_wq);

	/* Maybe create continue_at_noreturn() and use it here? */
	closure_set_stopped(&c->cl);
	closure_put(&c->cl);

	kobject_init(&c->kobj, &bch_cache_set_ktype);
	kobject_init(&c->internal, &bch_cache_set_internal_ktype);

	bch_cache_accounting_init(&c->accounting, &c->cl);

	memcpy(c->sb.set_uuid, sb->set_uuid, 16);
	c->sb.block_size = sb->block_size;
	c->sb.bucket_size = sb->bucket_size;
	c->sb.nr_in_set = sb->nr_in_set;
	c->sb.last_mount = sb->last_mount;
	c->bucket_bits = ilog2(sb->bucket_size);
	c->block_bits = ilog2(sb->block_size);
	c->nr_uuids = bucket_bytes(c) / sizeof(struct uuid_entry);

	c->btree_pages = c->sb.bucket_size / PAGE_SECTORS;
	if (c->btree_pages > BTREE_MAX_PAGES)
		c->btree_pages = max_t(int, c->btree_pages / 4,
				       BTREE_MAX_PAGES);

	c->sort_crit_factor = int_sqrt(c->btree_pages);

	mutex_init(&c->bucket_lock);
	mutex_init(&c->sort_lock);
	spin_lock_init(&c->sort_time_lock);
	closure_init_unlocked(&c->sb_write);
	closure_init_unlocked(&c->uuid_write);
	spin_lock_init(&c->btree_read_time_lock);
	bch_moving_init_cache_set(c);

	INIT_LIST_HEAD(&c->list);
	INIT_LIST_HEAD(&c->cached_devs);
	INIT_LIST_HEAD(&c->btree_cache);
	INIT_LIST_HEAD(&c->btree_cache_freeable);
	INIT_LIST_HEAD(&c->btree_cache_freed);
	INIT_LIST_HEAD(&c->data_buckets);

	c->search = mempool_create_slab_pool(32, bch_search_cache);
	if (!c->search)
		goto err;

	iter_size = (sb->bucket_size / sb->block_size + 1) *
		sizeof(struct btree_iter_set);

	if (!(c->devices = kzalloc(c->nr_uuids * sizeof(void *), GFP_KERNEL)) ||
	    !(c->bio_meta = mempool_create_kmalloc_pool(2,
				sizeof(struct bbio) + sizeof(struct bio_vec) *
				bucket_pages(c))) ||
	    !(c->fill_iter = mempool_create_kmalloc_pool(1, iter_size)) ||
	    !(c->bio_split = bioset_create(4, offsetof(struct bbio, bio))) ||
	    !(c->sort = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    !(c->uuids = alloc_bucket_pages(GFP_KERNEL, c)) ||
	    bch_journal_alloc(c) ||
	    bch_btree_cache_alloc(c) ||
	    bch_open_buckets_alloc(c))
		goto err;

	c->congested_read_threshold_us = 2000;
	c->congested_write_threshold_us = 20000;
	c->error_limit = 8 << IO_ERROR_SHIFT;

	return c;
err:
	bch_cache_set_unregister(c);
	return NULL;
}
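
/*
 * Called once every cache device in the set has been registered: either
 * brings up an existing cache set (replaying the journal and reading the
 * priorities, uuids and btree root) or, if CACHE_SYNC isn't set, invalidates
 * whatever is on the devices and initializes a fresh set.
 */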
static void run_cache_set(struct cache_set *c)
{
	const char *err = "cannot allocate memory";
	struct cached_dev *dc, *t;
	struct cache *ca;
	unsigned i;

	struct btree_op op;
	bch_btree_op_init_stack(&op);
	op.lock = SHRT_MAX;

	for_each_cache(ca, c, i)
		c->nbuckets += ca->sb.nbuckets;

	if (CACHE_SYNC(&c->sb)) {
		LIST_HEAD(journal);
		struct bkey *k;
		struct jset *j;

		err = "cannot allocate memory for journal";
		if (bch_journal_read(c, &journal, &op))
			goto err;

		pr_debug("btree_journal_read() done");

		err = "no journal entries found";
		if (list_empty(&journal))
			goto err;

		j = &list_entry(journal.prev, struct journal_replay, list)->j;

		err = "IO error reading priorities";
		for_each_cache(ca, c, i)
			prio_read(ca, j->prio_bucket[ca->sb.nr_this_dev]);

		/*
		 * If prio_read() fails it'll call cache_set_error and we'll
		 * tear everything down right away, but if we perhaps checked
		 * sooner we could avoid journal replay.
		 */

		k = &j->btree_root;

		err = "bad btree root";
		if (__bch_ptr_invalid(c, j->btree_level + 1, k))
			goto err;

		err = "error reading btree root";
		c->root = bch_btree_node_get(c, k, j->btree_level, &op);
		if (IS_ERR_OR_NULL(c->root))
			goto err;

		list_del_init(&c->root->list);
		rw_unlock(true, c->root);

		err = uuid_read(c, j, &op.cl);
		if (err)
			goto err;

		err = "error in recovery";
		if (bch_btree_check(c, &op))
			goto err;

		bch_journal_mark(c, &journal);
		bch_btree_gc_finish(c);
		pr_debug("btree_check() done");

		/*
		 * bcache_journal_next() can't happen sooner, or
		 * btree_gc_finish() will give spurious errors about last_gc >
		 * gc_gen - this is a hack but oh well.
		 */
		bch_journal_next(&c->journal);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		/*
		 * First place it's safe to allocate: btree_check() and
		 * btree_gc_finish() have to run before we have buckets to
		 * allocate, and bch_bucket_alloc_set() might cause a journal
		 * entry to be written so bcache_journal_next() has to be
		 * called first.
		 *
		 * If the uuids were in the old format we have to rewrite them
		 * before the next journal entry is written:
		 */
		if (j->version < BCACHE_JSET_VERSION_UUID)
			__uuid_write(c);

		bch_journal_replay(c, &journal, &op);
	} else {
		pr_notice("invalidating existing data");
		/* Don't want invalidate_buckets() to queue a gc yet */
		closure_lock(&c->gc, NULL);

		for_each_cache(ca, c, i) {
			unsigned j;

			ca->sb.keys = clamp_t(int, ca->sb.nbuckets >> 7,
					      2, SB_JOURNAL_BUCKETS);

			for (j = 0; j < ca->sb.keys; j++)
				ca->sb.d[j] = ca->sb.first_bucket + j;
		}

		bch_btree_gc_finish(c);

		err = "error starting allocator thread";
		for_each_cache(ca, c, i)
			if (bch_cache_allocator_start(ca))
				goto err;

		mutex_lock(&c->bucket_lock);
		for_each_cache(ca, c, i)
			bch_prio_write(ca);
		mutex_unlock(&c->bucket_lock);

		err = "cannot allocate new UUID bucket";
		if (__uuid_write(c))
			goto err_unlock_gc;

		err = "cannot allocate new btree root";
		c->root = bch_btree_node_alloc(c, 0, &op.cl);
		if (IS_ERR_OR_NULL(c->root))
			goto err_unlock_gc;

		bkey_copy_key(&c->root->key, &MAX_KEY);
		bch_btree_node_write(c->root, &op.cl);

		bch_btree_set_root(c->root);
		rw_unlock(true, c->root);

		/*
		 * We don't want to write the first journal entry until
		 * everything is set up - fortunately journal entries won't be
		 * written until the SET_CACHE_SYNC() here:
		 */
		SET_CACHE_SYNC(&c->sb, true);

		bch_journal_next(&c->journal);
		bch_journal_meta(c, &op.cl);

		/* Unlock */
		closure_set_stopped(&c->gc.cl);
		closure_put(&c->gc.cl);
	}

	closure_sync(&op.cl);
	c->sb.last_mount = get_seconds();
	bcache_write_super(c);

	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		bch_cached_dev_attach(dc, c);

	flash_devs_run(c);

	return;
err_unlock_gc:
	closure_set_stopped(&c->gc.cl);
	closure_put(&c->gc.cl);
err:
	closure_sync(&op.cl);
	/* XXX: test this, it's broken */
	bch_cache_set_error(c, err);
}

static bool can_attach_cache(struct cache *ca, struct cache_set *c)
{
	return ca->sb.block_size == c->sb.block_size &&
		ca->sb.bucket_size == c->sb.bucket_size &&
		ca->sb.nr_in_set == c->sb.nr_in_set;
}

static const char *register_cache_set(struct cache *ca)
{
	char buf[12];
	const char *err = "cannot allocate memory";
	struct cache_set *c;

	list_for_each_entry(c, &bch_cache_sets, list)
		if (!memcmp(c->sb.set_uuid, ca->sb.set_uuid, 16)) {
			if (c->cache[ca->sb.nr_this_dev])
				return "duplicate cache set member";

			if (!can_attach_cache(ca, c))
				return "cache sb does not match set";

			if (!CACHE_SYNC(&ca->sb))
				SET_CACHE_SYNC(&c->sb, false);

			goto found;
		}

	c = bch_cache_set_alloc(&ca->sb);
	if (!c)
		return err;

	err = "error creating kobject";
	if (kobject_add(&c->kobj, bcache_kobj, "%pU", c->sb.set_uuid) ||
	    kobject_add(&c->internal, &c->kobj, "internal"))
		goto err;

	if (bch_cache_accounting_add_kobjs(&c->accounting, &c->kobj))
		goto err;

	bch_debug_init_cache_set(c);

	list_add(&c->list, &bch_cache_sets);
found:
	sprintf(buf, "cache%i", ca->sb.nr_this_dev);
	if (sysfs_create_link(&ca->kobj, &c->kobj, "set") ||
	    sysfs_create_link(&c->kobj, &ca->kobj, buf))
		goto err;

	if (ca->sb.seq > c->sb.seq) {
		c->sb.version = ca->sb.version;
		memcpy(c->sb.set_uuid, ca->sb.set_uuid, 16);
		c->sb.flags = ca->sb.flags;
		c->sb.seq = ca->sb.seq;
		pr_debug("set version = %llu", c->sb.version);
	}

	ca->set = c;
	ca->set->cache[ca->sb.nr_this_dev] = ca;
	c->cache_by_alloc[c->caches_loaded++] = ca;

	if (c->caches_loaded == c->sb.nr_in_set)
		run_cache_set(c);

	return NULL;
err:
	bch_cache_set_unregister(c);
	return err;
}

/* Cache device */

void bch_cache_release(struct kobject *kobj)
{
	struct cache *ca = container_of(kobj, struct cache, kobj);

	if (ca->set)
		ca->set->cache[ca->sb.nr_this_dev] = NULL;

	bch_cache_allocator_exit(ca);

	bio_split_pool_free(&ca->bio_split_hook);

	free_pages((unsigned long) ca->disk_buckets, ilog2(bucket_pages(ca)));
	kfree(ca->prio_buckets);
	vfree(ca->buckets);

	free_heap(&ca->heap);
	free_fifo(&ca->unused);
	free_fifo(&ca->free_inc);
	free_fifo(&ca->free);

	if (ca->sb_bio.bi_inline_vecs[0].bv_page)
		put_page(ca->sb_bio.bi_io_vec[0].bv_page);

	if (!IS_ERR_OR_NULL(ca->bdev)) {
		blk_sync_queue(bdev_get_queue(ca->bdev));
		blkdev_put(ca->bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
	}

	kfree(ca);
	module_put(THIS_MODULE);
}

static int cache_alloc(struct cache_sb *sb, struct cache *ca)
{
	size_t free;
	struct bucket *b;

	__module_get(THIS_MODULE);
	kobject_init(&ca->kobj, &bch_cache_ktype);

	INIT_LIST_HEAD(&ca->discards);

	bio_init(&ca->journal.bio);
	ca->journal.bio.bi_max_vecs = 8;
	ca->journal.bio.bi_io_vec = ca->journal.bio.bi_inline_vecs;

	free = roundup_pow_of_two(ca->sb.nbuckets) >> 9;
	free = max_t(size_t, free, (prio_buckets(ca) + 8) * 2);

	if (!init_fifo(&ca->free, free, GFP_KERNEL) ||
	    !init_fifo(&ca->free_inc, free << 2, GFP_KERNEL) ||
	    !init_fifo(&ca->unused, free << 2, GFP_KERNEL) ||
	    !init_heap(&ca->heap, free << 3, GFP_KERNEL) ||
	    !(ca->buckets = vzalloc(sizeof(struct bucket) *
				    ca->sb.nbuckets)) ||
	    !(ca->prio_buckets = kzalloc(sizeof(uint64_t) * prio_buckets(ca) *
					 2, GFP_KERNEL)) ||
	    !(ca->disk_buckets = alloc_bucket_pages(GFP_KERNEL, ca)) ||
	    bio_split_pool_init(&ca->bio_split_hook))
		return -ENOMEM;

	ca->prio_last_buckets = ca->prio_buckets + prio_buckets(ca);

	for_each_bucket(b, ca)
		atomic_set(&b->pin, 0);

	if (bch_cache_allocator_init(ca))
		goto err;

	return 0;
err:
	kobject_put(&ca->kobj);
	return -ENOMEM;
}

static void register_cache(struct cache_sb *sb, struct page *sb_page,
			   struct block_device *bdev, struct cache *ca)
{
	char name[BDEVNAME_SIZE];
	const char *err = "cannot allocate memory";

	memcpy(&ca->sb, sb, sizeof(struct cache_sb));
	ca->bdev = bdev;
	ca->bdev->bd_holder = ca;

	bio_init(&ca->sb_bio);
	ca->sb_bio.bi_max_vecs = 1;
	ca->sb_bio.bi_io_vec = ca->sb_bio.bi_inline_vecs;
	ca->sb_bio.bi_io_vec[0].bv_page = sb_page;
	get_page(sb_page);

	if (blk_queue_discard(bdev_get_queue(ca->bdev)))
		ca->discard = CACHE_DISCARD(&ca->sb);

	if (cache_alloc(sb, ca) != 0)
		goto err;

	err = "error creating kobject";
	if (kobject_add(&ca->kobj, &part_to_dev(bdev->bd_part)->kobj, "bcache"))
		goto err;

	err = register_cache_set(ca);
	if (err)
		goto err;

	pr_info("registered cache device %s", bdevname(bdev, name));
	return;
err:
	pr_notice("error opening %s: %s", bdevname(bdev, name), err);
	kobject_put(&ca->kobj);
}

/* Global interfaces/init */

static ssize_t register_bcache(struct kobject *, struct kobj_attribute *,
			       const char *, size_t);

kobj_attribute_write(register,		register_bcache);
kobj_attribute_write(register_quiet,	register_bcache);

static bool bch_is_open_backing(struct block_device *bdev)
{
	struct cache_set *c, *tc;
	struct cached_dev *dc, *t;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		list_for_each_entry_safe(dc, t, &c->cached_devs, list)
			if (dc->bdev == bdev)
				return true;
	list_for_each_entry_safe(dc, t, &uncached_devices, list)
		if (dc->bdev == bdev)
			return true;
	return false;
}

static bool bch_is_open_cache(struct block_device *bdev)
{
	struct cache_set *c, *tc;
	struct cache *ca;
	unsigned i;

	list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
		for_each_cache(ca, c, i)
			if (ca->bdev == bdev)
				return true;
	return false;
}

static bool bch_is_open(struct block_device *bdev)
{
	return bch_is_open_cache(bdev) || bch_is_open_backing(bdev);
}

static ssize_t register_bcache(struct kobject *k, struct kobj_attribute *attr,
			       const char *buffer, size_t size)
{
	ssize_t ret = size;
	const char *err = "cannot allocate memory";
	char *path = NULL;
	struct cache_sb *sb = NULL;
	struct block_device *bdev = NULL;
	struct page *sb_page = NULL;

	if (!try_module_get(THIS_MODULE))
		return -EBUSY;

	mutex_lock(&bch_register_lock);

	if (!(path = kstrndup(buffer, size, GFP_KERNEL)) ||
	    !(sb = kmalloc(sizeof(struct cache_sb), GFP_KERNEL)))
		goto err;

	err = "failed to open device";
	bdev = blkdev_get_by_path(strim(path),
				  FMODE_READ|FMODE_WRITE|FMODE_EXCL,
				  sb);
	if (IS_ERR(bdev)) {
		if (bdev == ERR_PTR(-EBUSY)) {
			bdev = lookup_bdev(strim(path));
			if (!IS_ERR(bdev) && bch_is_open(bdev))
				err = "device already registered";
			else
				err = "device busy";
		}
		goto err;
	}

	err = "failed to set blocksize";
	if (set_blocksize(bdev, 4096))
		goto err_close;

	err = read_super(sb, bdev, &sb_page);
	if (err)
		goto err_close;

	if (SB_IS_BDEV(sb)) {
		struct cached_dev *dc = kzalloc(sizeof(*dc), GFP_KERNEL);
		if (!dc)
			goto err_close;

		register_bdev(sb, sb_page, bdev, dc);
	} else {
		struct cache *ca = kzalloc(sizeof(*ca), GFP_KERNEL);
		if (!ca)
			goto err_close;

		register_cache(sb, sb_page, bdev, ca);
	}
out:
	if (sb_page)
		put_page(sb_page);
	kfree(sb);
	kfree(path);
	mutex_unlock(&bch_register_lock);
	module_put(THIS_MODULE);
	return ret;

err_close:
	blkdev_put(bdev, FMODE_READ|FMODE_WRITE|FMODE_EXCL);
err:
	if (attr != &ksysfs_register_quiet)
		pr_info("error opening %s: %s", path, err);
	ret = -EINVAL;
	goto out;
}

static int bcache_reboot(struct notifier_block *n, unsigned long code, void *x)
{
	if (code == SYS_DOWN ||
	    code == SYS_HALT ||
	    code == SYS_POWER_OFF) {
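		/*
		 * On shutdown, stop every cache set and registered backing
		 * device, then wait for them to finish closing (bounded by
		 * start + 2 * HZ below) before letting the reboot proceed.
		 */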
		DEFINE_WAIT(wait);
		unsigned long start = jiffies;
		bool stopped = false;

		struct cache_set *c, *tc;
		struct cached_dev *dc, *tdc;

		mutex_lock(&bch_register_lock);

		if (list_empty(&bch_cache_sets) &&
		    list_empty(&uncached_devices))
			goto out;

		pr_info("Stopping all devices:");

		list_for_each_entry_safe(c, tc, &bch_cache_sets, list)
			bch_cache_set_stop(c);

		list_for_each_entry_safe(dc, tdc, &uncached_devices, list)
			bcache_device_stop(&dc->disk);

		/* What's a condition variable? */
		while (1) {
			long timeout = start + 2 * HZ - jiffies;

			stopped = list_empty(&bch_cache_sets) &&
				list_empty(&uncached_devices);

			if (timeout < 0 || stopped)
				break;

			prepare_to_wait(&unregister_wait, &wait,
					TASK_UNINTERRUPTIBLE);

			mutex_unlock(&bch_register_lock);
			schedule_timeout(timeout);
			mutex_lock(&bch_register_lock);
		}

		finish_wait(&unregister_wait, &wait);

		if (stopped)
			pr_info("All devices stopped");
		else
			pr_notice("Timeout waiting for devices to be closed");
out:
		mutex_unlock(&bch_register_lock);
	}

	return NOTIFY_DONE;
}

static struct notifier_block reboot = {
	.notifier_call	= bcache_reboot,
	.priority	= INT_MAX, /* before any real devices */
};

static void bcache_exit(void)
{
	bch_debug_exit();
	bch_writeback_exit();
	bch_request_exit();
	bch_btree_exit();
	if (bcache_kobj)
		kobject_put(bcache_kobj);
	if (bcache_wq)
		destroy_workqueue(bcache_wq);
	unregister_blkdev(bcache_major, "bcache");
	unregister_reboot_notifier(&reboot);
}

static int __init bcache_init(void)
{
	static const struct attribute *files[] = {
		&ksysfs_register.attr,
		&ksysfs_register_quiet.attr,
		NULL
	};

	mutex_init(&bch_register_lock);
	init_waitqueue_head(&unregister_wait);
	register_reboot_notifier(&reboot);
	closure_debug_init();

	bcache_major = register_blkdev(0, "bcache");
	if (bcache_major < 0)
		return bcache_major;

	if (!(bcache_wq = create_workqueue("bcache")) ||
	    !(bcache_kobj = kobject_create_and_add("bcache", fs_kobj)) ||
	    sysfs_create_files(bcache_kobj, files) ||
	    bch_btree_init() ||
	    bch_request_init() ||
	    bch_writeback_init() ||
	    bch_debug_init(bcache_kobj))
		goto err;

	return 0;
err:
	bcache_exit();
	return -ENOMEM;
}

module_exit(bcache_exit);
module_init(bcache_init);