1 /* 2 * Copyright (C) 2011 Red Hat, Inc. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm-thin-metadata.h" 8 #include "persistent-data/dm-btree.h" 9 #include "persistent-data/dm-space-map.h" 10 #include "persistent-data/dm-space-map-disk.h" 11 #include "persistent-data/dm-transaction-manager.h" 12 13 #include <linux/list.h> 14 #include <linux/device-mapper.h> 15 #include <linux/workqueue.h> 16 17 /*-------------------------------------------------------------------------- 18 * As far as the metadata goes, there is: 19 * 20 * - A superblock in block zero, taking up fewer than 512 bytes for 21 * atomic writes. 22 * 23 * - A space map managing the metadata blocks. 24 * 25 * - A space map managing the data blocks. 26 * 27 * - A btree mapping our internal thin dev ids onto struct disk_device_details. 28 * 29 * - A hierarchical btree, with 2 levels which effectively maps (thin 30 * dev id, virtual block) -> block_time. Block time is a 64-bit 31 * field holding the time in the low 24 bits, and block in the top 48 32 * bits. 33 * 34 * BTrees consist solely of btree_nodes, that fill a block. Some are 35 * internal nodes, as such their values are a __le64 pointing to other 36 * nodes. Leaf nodes can store data of any reasonable size (ie. much 37 * smaller than the block size). The nodes consist of the header, 38 * followed by an array of keys, followed by an array of values. We have 39 * to binary search on the keys so they're all held together to help the 40 * cpu cache. 41 * 42 * Space maps have 2 btrees: 43 * 44 * - One maps a uint64_t onto a struct index_entry. Which points to a 45 * bitmap block, and has some details about how many free entries there 46 * are etc. 47 * 48 * - The bitmap blocks have a header (for the checksum). Then the rest 49 * of the block is pairs of bits. With the meaning being: 50 * 51 * 0 - ref count is 0 52 * 1 - ref count is 1 53 * 2 - ref count is 2 54 * 3 - ref count is higher than 2 55 * 56 * - If the count is higher than 2 then the ref count is entered in a 57 * second btree that directly maps the block_address to a uint32_t ref 58 * count. 59 * 60 * The space map metadata variant doesn't have a bitmaps btree. Instead 61 * it has one single blocks worth of index_entries. This avoids 62 * recursive issues with the bitmap btree needing to allocate space in 63 * order to insert. With a small data block size such as 64k the 64 * metadata support data devices that are hundreds of terrabytes. 65 * 66 * The space maps allocate space linearly from front to back. Space that 67 * is freed in a transaction is never recycled within that transaction. 68 * To try and avoid fragmenting _free_ space the allocator always goes 69 * back and fills in gaps. 70 * 71 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks 72 * from the block manager. 73 *--------------------------------------------------------------------------*/ 74 75 #define DM_MSG_PREFIX "thin metadata" 76 77 #define THIN_SUPERBLOCK_MAGIC 27022010 78 #define THIN_SUPERBLOCK_LOCATION 0 79 #define THIN_VERSION 1 80 #define THIN_METADATA_CACHE_SIZE 64 81 #define SECTOR_TO_BLOCK_SHIFT 3 82 83 /* This should be plenty */ 84 #define SPACE_MAP_ROOT_SIZE 128 85 86 /* 87 * Little endian on-disk superblock and device details. 88 */ 89 struct thin_disk_superblock { 90 __le32 csum; /* Checksum of superblock except for this field. */ 91 __le32 flags; 92 __le64 blocknr; /* This block number, dm_block_t. */ 93 94 __u8 uuid[16]; 95 __le64 magic; 96 __le32 version; 97 __le32 time; 98 99 __le64 trans_id; 100 101 /* 102 * Root held by userspace transactions. 103 */ 104 __le64 held_root; 105 106 __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE]; 107 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; 108 109 /* 110 * 2-level btree mapping (dev_id, (dev block, time)) -> data block 111 */ 112 __le64 data_mapping_root; 113 114 /* 115 * Device detail root mapping dev_id -> device_details 116 */ 117 __le64 device_details_root; 118 119 __le32 data_block_size; /* In 512-byte sectors. */ 120 121 __le32 metadata_block_size; /* In 512-byte sectors. */ 122 __le64 metadata_nr_blocks; 123 124 __le32 compat_flags; 125 __le32 compat_ro_flags; 126 __le32 incompat_flags; 127 } __packed; 128 129 struct disk_device_details { 130 __le64 mapped_blocks; 131 __le64 transaction_id; /* When created. */ 132 __le32 creation_time; 133 __le32 snapshotted_time; 134 } __packed; 135 136 struct dm_pool_metadata { 137 struct hlist_node hash; 138 139 struct block_device *bdev; 140 struct dm_block_manager *bm; 141 struct dm_space_map *metadata_sm; 142 struct dm_space_map *data_sm; 143 struct dm_transaction_manager *tm; 144 struct dm_transaction_manager *nb_tm; 145 146 /* 147 * Two-level btree. 148 * First level holds thin_dev_t. 149 * Second level holds mappings. 150 */ 151 struct dm_btree_info info; 152 153 /* 154 * Non-blocking version of the above. 155 */ 156 struct dm_btree_info nb_info; 157 158 /* 159 * Just the top level for deleting whole devices. 160 */ 161 struct dm_btree_info tl_info; 162 163 /* 164 * Just the bottom level for creating new devices. 165 */ 166 struct dm_btree_info bl_info; 167 168 /* 169 * Describes the device details btree. 170 */ 171 struct dm_btree_info details_info; 172 173 struct rw_semaphore root_lock; 174 uint32_t time; 175 int need_commit; 176 dm_block_t root; 177 dm_block_t details_root; 178 struct list_head thin_devices; 179 uint64_t trans_id; 180 unsigned long flags; 181 sector_t data_block_size; 182 }; 183 184 struct dm_thin_device { 185 struct list_head list; 186 struct dm_pool_metadata *pmd; 187 dm_thin_id id; 188 189 int open_count; 190 int changed; 191 uint64_t mapped_blocks; 192 uint64_t transaction_id; 193 uint32_t creation_time; 194 uint32_t snapshotted_time; 195 }; 196 197 /*---------------------------------------------------------------- 198 * superblock validator 199 *--------------------------------------------------------------*/ 200 201 #define SUPERBLOCK_CSUM_XOR 160774 202 203 static void sb_prepare_for_write(struct dm_block_validator *v, 204 struct dm_block *b, 205 size_t block_size) 206 { 207 struct thin_disk_superblock *disk_super = dm_block_data(b); 208 209 disk_super->blocknr = cpu_to_le64(dm_block_location(b)); 210 disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, 211 block_size - sizeof(__le32), 212 SUPERBLOCK_CSUM_XOR)); 213 } 214 215 static int sb_check(struct dm_block_validator *v, 216 struct dm_block *b, 217 size_t block_size) 218 { 219 struct thin_disk_superblock *disk_super = dm_block_data(b); 220 __le32 csum_le; 221 222 if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { 223 DMERR("sb_check failed: blocknr %llu: " 224 "wanted %llu", le64_to_cpu(disk_super->blocknr), 225 (unsigned long long)dm_block_location(b)); 226 return -ENOTBLK; 227 } 228 229 if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) { 230 DMERR("sb_check failed: magic %llu: " 231 "wanted %llu", le64_to_cpu(disk_super->magic), 232 (unsigned long long)THIN_SUPERBLOCK_MAGIC); 233 return -EILSEQ; 234 } 235 236 csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, 237 block_size - sizeof(__le32), 238 SUPERBLOCK_CSUM_XOR)); 239 if (csum_le != disk_super->csum) { 240 DMERR("sb_check failed: csum %u: wanted %u", 241 le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); 242 return -EILSEQ; 243 } 244 245 return 0; 246 } 247 248 static struct dm_block_validator sb_validator = { 249 .name = "superblock", 250 .prepare_for_write = sb_prepare_for_write, 251 .check = sb_check 252 }; 253 254 /*---------------------------------------------------------------- 255 * Methods for the btree value types 256 *--------------------------------------------------------------*/ 257 258 static uint64_t pack_block_time(dm_block_t b, uint32_t t) 259 { 260 return (b << 24) | t; 261 } 262 263 static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) 264 { 265 *b = v >> 24; 266 *t = v & ((1 << 24) - 1); 267 } 268 269 static void data_block_inc(void *context, void *value_le) 270 { 271 struct dm_space_map *sm = context; 272 __le64 v_le; 273 uint64_t b; 274 uint32_t t; 275 276 memcpy(&v_le, value_le, sizeof(v_le)); 277 unpack_block_time(le64_to_cpu(v_le), &b, &t); 278 dm_sm_inc_block(sm, b); 279 } 280 281 static void data_block_dec(void *context, void *value_le) 282 { 283 struct dm_space_map *sm = context; 284 __le64 v_le; 285 uint64_t b; 286 uint32_t t; 287 288 memcpy(&v_le, value_le, sizeof(v_le)); 289 unpack_block_time(le64_to_cpu(v_le), &b, &t); 290 dm_sm_dec_block(sm, b); 291 } 292 293 static int data_block_equal(void *context, void *value1_le, void *value2_le) 294 { 295 __le64 v1_le, v2_le; 296 uint64_t b1, b2; 297 uint32_t t; 298 299 memcpy(&v1_le, value1_le, sizeof(v1_le)); 300 memcpy(&v2_le, value2_le, sizeof(v2_le)); 301 unpack_block_time(le64_to_cpu(v1_le), &b1, &t); 302 unpack_block_time(le64_to_cpu(v2_le), &b2, &t); 303 304 return b1 == b2; 305 } 306 307 static void subtree_inc(void *context, void *value) 308 { 309 struct dm_btree_info *info = context; 310 __le64 root_le; 311 uint64_t root; 312 313 memcpy(&root_le, value, sizeof(root_le)); 314 root = le64_to_cpu(root_le); 315 dm_tm_inc(info->tm, root); 316 } 317 318 static void subtree_dec(void *context, void *value) 319 { 320 struct dm_btree_info *info = context; 321 __le64 root_le; 322 uint64_t root; 323 324 memcpy(&root_le, value, sizeof(root_le)); 325 root = le64_to_cpu(root_le); 326 if (dm_btree_del(info, root)) 327 DMERR("btree delete failed\n"); 328 } 329 330 static int subtree_equal(void *context, void *value1_le, void *value2_le) 331 { 332 __le64 v1_le, v2_le; 333 memcpy(&v1_le, value1_le, sizeof(v1_le)); 334 memcpy(&v2_le, value2_le, sizeof(v2_le)); 335 336 return v1_le == v2_le; 337 } 338 339 /*----------------------------------------------------------------*/ 340 341 static int superblock_all_zeroes(struct dm_block_manager *bm, int *result) 342 { 343 int r; 344 unsigned i; 345 struct dm_block *b; 346 __le64 *data_le, zero = cpu_to_le64(0); 347 unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64); 348 349 /* 350 * We can't use a validator here - it may be all zeroes. 351 */ 352 r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b); 353 if (r) 354 return r; 355 356 data_le = dm_block_data(b); 357 *result = 1; 358 for (i = 0; i < block_size; i++) { 359 if (data_le[i] != zero) { 360 *result = 0; 361 break; 362 } 363 } 364 365 return dm_bm_unlock(b); 366 } 367 368 static int init_pmd(struct dm_pool_metadata *pmd, 369 struct dm_block_manager *bm, 370 dm_block_t nr_blocks, int create) 371 { 372 int r; 373 struct dm_space_map *sm, *data_sm; 374 struct dm_transaction_manager *tm; 375 struct dm_block *sblock; 376 377 if (create) { 378 r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION, 379 &sb_validator, &tm, &sm, &sblock); 380 if (r < 0) { 381 DMERR("tm_create_with_sm failed"); 382 return r; 383 } 384 385 data_sm = dm_sm_disk_create(tm, nr_blocks); 386 if (IS_ERR(data_sm)) { 387 DMERR("sm_disk_create failed"); 388 dm_tm_unlock(tm, sblock); 389 r = PTR_ERR(data_sm); 390 goto bad; 391 } 392 } else { 393 struct thin_disk_superblock *disk_super = NULL; 394 size_t space_map_root_offset = 395 offsetof(struct thin_disk_superblock, metadata_space_map_root); 396 397 r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION, 398 &sb_validator, space_map_root_offset, 399 SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock); 400 if (r < 0) { 401 DMERR("tm_open_with_sm failed"); 402 return r; 403 } 404 405 disk_super = dm_block_data(sblock); 406 data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root, 407 sizeof(disk_super->data_space_map_root)); 408 if (IS_ERR(data_sm)) { 409 DMERR("sm_disk_open failed"); 410 r = PTR_ERR(data_sm); 411 goto bad; 412 } 413 } 414 415 416 r = dm_tm_unlock(tm, sblock); 417 if (r < 0) { 418 DMERR("couldn't unlock superblock"); 419 goto bad_data_sm; 420 } 421 422 pmd->bm = bm; 423 pmd->metadata_sm = sm; 424 pmd->data_sm = data_sm; 425 pmd->tm = tm; 426 pmd->nb_tm = dm_tm_create_non_blocking_clone(tm); 427 if (!pmd->nb_tm) { 428 DMERR("could not create clone tm"); 429 r = -ENOMEM; 430 goto bad_data_sm; 431 } 432 433 pmd->info.tm = tm; 434 pmd->info.levels = 2; 435 pmd->info.value_type.context = pmd->data_sm; 436 pmd->info.value_type.size = sizeof(__le64); 437 pmd->info.value_type.inc = data_block_inc; 438 pmd->info.value_type.dec = data_block_dec; 439 pmd->info.value_type.equal = data_block_equal; 440 441 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); 442 pmd->nb_info.tm = pmd->nb_tm; 443 444 pmd->tl_info.tm = tm; 445 pmd->tl_info.levels = 1; 446 pmd->tl_info.value_type.context = &pmd->info; 447 pmd->tl_info.value_type.size = sizeof(__le64); 448 pmd->tl_info.value_type.inc = subtree_inc; 449 pmd->tl_info.value_type.dec = subtree_dec; 450 pmd->tl_info.value_type.equal = subtree_equal; 451 452 pmd->bl_info.tm = tm; 453 pmd->bl_info.levels = 1; 454 pmd->bl_info.value_type.context = pmd->data_sm; 455 pmd->bl_info.value_type.size = sizeof(__le64); 456 pmd->bl_info.value_type.inc = data_block_inc; 457 pmd->bl_info.value_type.dec = data_block_dec; 458 pmd->bl_info.value_type.equal = data_block_equal; 459 460 pmd->details_info.tm = tm; 461 pmd->details_info.levels = 1; 462 pmd->details_info.value_type.context = NULL; 463 pmd->details_info.value_type.size = sizeof(struct disk_device_details); 464 pmd->details_info.value_type.inc = NULL; 465 pmd->details_info.value_type.dec = NULL; 466 pmd->details_info.value_type.equal = NULL; 467 468 pmd->root = 0; 469 470 init_rwsem(&pmd->root_lock); 471 pmd->time = 0; 472 pmd->need_commit = 0; 473 pmd->details_root = 0; 474 pmd->trans_id = 0; 475 pmd->flags = 0; 476 INIT_LIST_HEAD(&pmd->thin_devices); 477 478 return 0; 479 480 bad_data_sm: 481 dm_sm_destroy(data_sm); 482 bad: 483 dm_tm_destroy(tm); 484 dm_sm_destroy(sm); 485 486 return r; 487 } 488 489 static int __begin_transaction(struct dm_pool_metadata *pmd) 490 { 491 int r; 492 u32 features; 493 struct thin_disk_superblock *disk_super; 494 struct dm_block *sblock; 495 496 /* 497 * __maybe_commit_transaction() resets these 498 */ 499 WARN_ON(pmd->need_commit); 500 501 /* 502 * We re-read the superblock every time. Shouldn't need to do this 503 * really. 504 */ 505 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 506 &sb_validator, &sblock); 507 if (r) 508 return r; 509 510 disk_super = dm_block_data(sblock); 511 pmd->time = le32_to_cpu(disk_super->time); 512 pmd->root = le64_to_cpu(disk_super->data_mapping_root); 513 pmd->details_root = le64_to_cpu(disk_super->device_details_root); 514 pmd->trans_id = le64_to_cpu(disk_super->trans_id); 515 pmd->flags = le32_to_cpu(disk_super->flags); 516 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); 517 518 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP; 519 if (features) { 520 DMERR("could not access metadata due to " 521 "unsupported optional features (%lx).", 522 (unsigned long)features); 523 r = -EINVAL; 524 goto out; 525 } 526 527 /* 528 * Check for read-only metadata to skip the following RDWR checks. 529 */ 530 if (get_disk_ro(pmd->bdev->bd_disk)) 531 goto out; 532 533 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; 534 if (features) { 535 DMERR("could not access metadata RDWR due to " 536 "unsupported optional features (%lx).", 537 (unsigned long)features); 538 r = -EINVAL; 539 } 540 541 out: 542 dm_bm_unlock(sblock); 543 return r; 544 } 545 546 static int __write_changed_details(struct dm_pool_metadata *pmd) 547 { 548 int r; 549 struct dm_thin_device *td, *tmp; 550 struct disk_device_details details; 551 uint64_t key; 552 553 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { 554 if (!td->changed) 555 continue; 556 557 key = td->id; 558 559 details.mapped_blocks = cpu_to_le64(td->mapped_blocks); 560 details.transaction_id = cpu_to_le64(td->transaction_id); 561 details.creation_time = cpu_to_le32(td->creation_time); 562 details.snapshotted_time = cpu_to_le32(td->snapshotted_time); 563 __dm_bless_for_disk(&details); 564 565 r = dm_btree_insert(&pmd->details_info, pmd->details_root, 566 &key, &details, &pmd->details_root); 567 if (r) 568 return r; 569 570 if (td->open_count) 571 td->changed = 0; 572 else { 573 list_del(&td->list); 574 kfree(td); 575 } 576 577 pmd->need_commit = 1; 578 } 579 580 return 0; 581 } 582 583 static int __commit_transaction(struct dm_pool_metadata *pmd) 584 { 585 /* 586 * FIXME: Associated pool should be made read-only on failure. 587 */ 588 int r; 589 size_t metadata_len, data_len; 590 struct thin_disk_superblock *disk_super; 591 struct dm_block *sblock; 592 593 /* 594 * We need to know if the thin_disk_superblock exceeds a 512-byte sector. 595 */ 596 BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); 597 598 r = __write_changed_details(pmd); 599 if (r < 0) 600 goto out; 601 602 if (!pmd->need_commit) 603 goto out; 604 605 r = dm_sm_commit(pmd->data_sm); 606 if (r < 0) 607 goto out; 608 609 r = dm_tm_pre_commit(pmd->tm); 610 if (r < 0) 611 goto out; 612 613 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); 614 if (r < 0) 615 goto out; 616 617 r = dm_sm_root_size(pmd->data_sm, &data_len); 618 if (r < 0) 619 goto out; 620 621 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 622 &sb_validator, &sblock); 623 if (r) 624 goto out; 625 626 disk_super = dm_block_data(sblock); 627 disk_super->time = cpu_to_le32(pmd->time); 628 disk_super->data_mapping_root = cpu_to_le64(pmd->root); 629 disk_super->device_details_root = cpu_to_le64(pmd->details_root); 630 disk_super->trans_id = cpu_to_le64(pmd->trans_id); 631 disk_super->flags = cpu_to_le32(pmd->flags); 632 633 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, 634 metadata_len); 635 if (r < 0) 636 goto out_locked; 637 638 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, 639 data_len); 640 if (r < 0) 641 goto out_locked; 642 643 r = dm_tm_commit(pmd->tm, sblock); 644 if (!r) 645 pmd->need_commit = 0; 646 647 out: 648 return r; 649 650 out_locked: 651 dm_bm_unlock(sblock); 652 return r; 653 } 654 655 struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, 656 sector_t data_block_size) 657 { 658 int r; 659 struct thin_disk_superblock *disk_super; 660 struct dm_pool_metadata *pmd; 661 sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; 662 struct dm_block_manager *bm; 663 int create; 664 struct dm_block *sblock; 665 666 pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); 667 if (!pmd) { 668 DMERR("could not allocate metadata struct"); 669 return ERR_PTR(-ENOMEM); 670 } 671 672 /* 673 * Max hex locks: 674 * 3 for btree insert + 675 * 2 for btree lookup used within space map 676 */ 677 bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE, 678 THIN_METADATA_CACHE_SIZE, 5); 679 if (!bm) { 680 DMERR("could not create block manager"); 681 kfree(pmd); 682 return ERR_PTR(-ENOMEM); 683 } 684 685 r = superblock_all_zeroes(bm, &create); 686 if (r) { 687 dm_block_manager_destroy(bm); 688 kfree(pmd); 689 return ERR_PTR(r); 690 } 691 692 693 r = init_pmd(pmd, bm, 0, create); 694 if (r) { 695 dm_block_manager_destroy(bm); 696 kfree(pmd); 697 return ERR_PTR(r); 698 } 699 pmd->bdev = bdev; 700 701 if (!create) { 702 r = __begin_transaction(pmd); 703 if (r < 0) 704 goto bad; 705 return pmd; 706 } 707 708 /* 709 * Create. 710 */ 711 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 712 &sb_validator, &sblock); 713 if (r) 714 goto bad; 715 716 if (bdev_size > THIN_METADATA_MAX_SECTORS) 717 bdev_size = THIN_METADATA_MAX_SECTORS; 718 719 disk_super = dm_block_data(sblock); 720 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); 721 disk_super->version = cpu_to_le32(THIN_VERSION); 722 disk_super->time = 0; 723 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 724 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); 725 disk_super->data_block_size = cpu_to_le32(data_block_size); 726 727 r = dm_bm_unlock(sblock); 728 if (r < 0) 729 goto bad; 730 731 r = dm_btree_empty(&pmd->info, &pmd->root); 732 if (r < 0) 733 goto bad; 734 735 r = dm_btree_empty(&pmd->details_info, &pmd->details_root); 736 if (r < 0) { 737 DMERR("couldn't create devices root"); 738 goto bad; 739 } 740 741 pmd->flags = 0; 742 pmd->need_commit = 1; 743 r = dm_pool_commit_metadata(pmd); 744 if (r < 0) { 745 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 746 __func__, r); 747 goto bad; 748 } 749 750 return pmd; 751 752 bad: 753 if (dm_pool_metadata_close(pmd) < 0) 754 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 755 return ERR_PTR(r); 756 } 757 758 int dm_pool_metadata_close(struct dm_pool_metadata *pmd) 759 { 760 int r; 761 unsigned open_devices = 0; 762 struct dm_thin_device *td, *tmp; 763 764 down_read(&pmd->root_lock); 765 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { 766 if (td->open_count) 767 open_devices++; 768 else { 769 list_del(&td->list); 770 kfree(td); 771 } 772 } 773 up_read(&pmd->root_lock); 774 775 if (open_devices) { 776 DMERR("attempt to close pmd when %u device(s) are still open", 777 open_devices); 778 return -EBUSY; 779 } 780 781 r = __commit_transaction(pmd); 782 if (r < 0) 783 DMWARN("%s: __commit_transaction() failed, error = %d", 784 __func__, r); 785 786 dm_tm_destroy(pmd->tm); 787 dm_tm_destroy(pmd->nb_tm); 788 dm_block_manager_destroy(pmd->bm); 789 dm_sm_destroy(pmd->metadata_sm); 790 dm_sm_destroy(pmd->data_sm); 791 kfree(pmd); 792 793 return 0; 794 } 795 796 /* 797 * __open_device: Returns @td corresponding to device with id @dev, 798 * creating it if @create is set and incrementing @td->open_count. 799 * On failure, @td is undefined. 800 */ 801 static int __open_device(struct dm_pool_metadata *pmd, 802 dm_thin_id dev, int create, 803 struct dm_thin_device **td) 804 { 805 int r, changed = 0; 806 struct dm_thin_device *td2; 807 uint64_t key = dev; 808 struct disk_device_details details_le; 809 810 /* 811 * If the device is already open, return it. 812 */ 813 list_for_each_entry(td2, &pmd->thin_devices, list) 814 if (td2->id == dev) { 815 /* 816 * May not create an already-open device. 817 */ 818 if (create) 819 return -EEXIST; 820 821 td2->open_count++; 822 *td = td2; 823 return 0; 824 } 825 826 /* 827 * Check the device exists. 828 */ 829 r = dm_btree_lookup(&pmd->details_info, pmd->details_root, 830 &key, &details_le); 831 if (r) { 832 if (r != -ENODATA || !create) 833 return r; 834 835 /* 836 * Create new device. 837 */ 838 changed = 1; 839 details_le.mapped_blocks = 0; 840 details_le.transaction_id = cpu_to_le64(pmd->trans_id); 841 details_le.creation_time = cpu_to_le32(pmd->time); 842 details_le.snapshotted_time = cpu_to_le32(pmd->time); 843 } 844 845 *td = kmalloc(sizeof(**td), GFP_NOIO); 846 if (!*td) 847 return -ENOMEM; 848 849 (*td)->pmd = pmd; 850 (*td)->id = dev; 851 (*td)->open_count = 1; 852 (*td)->changed = changed; 853 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); 854 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); 855 (*td)->creation_time = le32_to_cpu(details_le.creation_time); 856 (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time); 857 858 list_add(&(*td)->list, &pmd->thin_devices); 859 860 return 0; 861 } 862 863 static void __close_device(struct dm_thin_device *td) 864 { 865 --td->open_count; 866 } 867 868 static int __create_thin(struct dm_pool_metadata *pmd, 869 dm_thin_id dev) 870 { 871 int r; 872 dm_block_t dev_root; 873 uint64_t key = dev; 874 struct disk_device_details details_le; 875 struct dm_thin_device *td; 876 __le64 value; 877 878 r = dm_btree_lookup(&pmd->details_info, pmd->details_root, 879 &key, &details_le); 880 if (!r) 881 return -EEXIST; 882 883 /* 884 * Create an empty btree for the mappings. 885 */ 886 r = dm_btree_empty(&pmd->bl_info, &dev_root); 887 if (r) 888 return r; 889 890 /* 891 * Insert it into the main mapping tree. 892 */ 893 value = cpu_to_le64(dev_root); 894 __dm_bless_for_disk(&value); 895 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); 896 if (r) { 897 dm_btree_del(&pmd->bl_info, dev_root); 898 return r; 899 } 900 901 r = __open_device(pmd, dev, 1, &td); 902 if (r) { 903 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 904 dm_btree_del(&pmd->bl_info, dev_root); 905 return r; 906 } 907 __close_device(td); 908 909 return r; 910 } 911 912 int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) 913 { 914 int r; 915 916 down_write(&pmd->root_lock); 917 r = __create_thin(pmd, dev); 918 up_write(&pmd->root_lock); 919 920 return r; 921 } 922 923 static int __set_snapshot_details(struct dm_pool_metadata *pmd, 924 struct dm_thin_device *snap, 925 dm_thin_id origin, uint32_t time) 926 { 927 int r; 928 struct dm_thin_device *td; 929 930 r = __open_device(pmd, origin, 0, &td); 931 if (r) 932 return r; 933 934 td->changed = 1; 935 td->snapshotted_time = time; 936 937 snap->mapped_blocks = td->mapped_blocks; 938 snap->snapshotted_time = time; 939 __close_device(td); 940 941 return 0; 942 } 943 944 static int __create_snap(struct dm_pool_metadata *pmd, 945 dm_thin_id dev, dm_thin_id origin) 946 { 947 int r; 948 dm_block_t origin_root; 949 uint64_t key = origin, dev_key = dev; 950 struct dm_thin_device *td; 951 struct disk_device_details details_le; 952 __le64 value; 953 954 /* check this device is unused */ 955 r = dm_btree_lookup(&pmd->details_info, pmd->details_root, 956 &dev_key, &details_le); 957 if (!r) 958 return -EEXIST; 959 960 /* find the mapping tree for the origin */ 961 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value); 962 if (r) 963 return r; 964 origin_root = le64_to_cpu(value); 965 966 /* clone the origin, an inc will do */ 967 dm_tm_inc(pmd->tm, origin_root); 968 969 /* insert into the main mapping tree */ 970 value = cpu_to_le64(origin_root); 971 __dm_bless_for_disk(&value); 972 key = dev; 973 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); 974 if (r) { 975 dm_tm_dec(pmd->tm, origin_root); 976 return r; 977 } 978 979 pmd->time++; 980 981 r = __open_device(pmd, dev, 1, &td); 982 if (r) 983 goto bad; 984 985 r = __set_snapshot_details(pmd, td, origin, pmd->time); 986 __close_device(td); 987 988 if (r) 989 goto bad; 990 991 return 0; 992 993 bad: 994 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 995 dm_btree_remove(&pmd->details_info, pmd->details_root, 996 &key, &pmd->details_root); 997 return r; 998 } 999 1000 int dm_pool_create_snap(struct dm_pool_metadata *pmd, 1001 dm_thin_id dev, 1002 dm_thin_id origin) 1003 { 1004 int r; 1005 1006 down_write(&pmd->root_lock); 1007 r = __create_snap(pmd, dev, origin); 1008 up_write(&pmd->root_lock); 1009 1010 return r; 1011 } 1012 1013 static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev) 1014 { 1015 int r; 1016 uint64_t key = dev; 1017 struct dm_thin_device *td; 1018 1019 /* TODO: failure should mark the transaction invalid */ 1020 r = __open_device(pmd, dev, 0, &td); 1021 if (r) 1022 return r; 1023 1024 if (td->open_count > 1) { 1025 __close_device(td); 1026 return -EBUSY; 1027 } 1028 1029 list_del(&td->list); 1030 kfree(td); 1031 r = dm_btree_remove(&pmd->details_info, pmd->details_root, 1032 &key, &pmd->details_root); 1033 if (r) 1034 return r; 1035 1036 r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 1037 if (r) 1038 return r; 1039 1040 pmd->need_commit = 1; 1041 1042 return 0; 1043 } 1044 1045 int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, 1046 dm_thin_id dev) 1047 { 1048 int r; 1049 1050 down_write(&pmd->root_lock); 1051 r = __delete_device(pmd, dev); 1052 up_write(&pmd->root_lock); 1053 1054 return r; 1055 } 1056 1057 int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, 1058 uint64_t current_id, 1059 uint64_t new_id) 1060 { 1061 down_write(&pmd->root_lock); 1062 if (pmd->trans_id != current_id) { 1063 up_write(&pmd->root_lock); 1064 DMERR("mismatched transaction id"); 1065 return -EINVAL; 1066 } 1067 1068 pmd->trans_id = new_id; 1069 pmd->need_commit = 1; 1070 up_write(&pmd->root_lock); 1071 1072 return 0; 1073 } 1074 1075 int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, 1076 uint64_t *result) 1077 { 1078 down_read(&pmd->root_lock); 1079 *result = pmd->trans_id; 1080 up_read(&pmd->root_lock); 1081 1082 return 0; 1083 } 1084 1085 static int __reserve_metadata_snap(struct dm_pool_metadata *pmd) 1086 { 1087 int r, inc; 1088 struct thin_disk_superblock *disk_super; 1089 struct dm_block *copy, *sblock; 1090 dm_block_t held_root; 1091 1092 /* 1093 * Copy the superblock. 1094 */ 1095 dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION); 1096 r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION, 1097 &sb_validator, ©, &inc); 1098 if (r) 1099 return r; 1100 1101 BUG_ON(!inc); 1102 1103 held_root = dm_block_location(copy); 1104 disk_super = dm_block_data(copy); 1105 1106 if (le64_to_cpu(disk_super->held_root)) { 1107 DMWARN("Pool metadata snapshot already exists: release this before taking another."); 1108 1109 dm_tm_dec(pmd->tm, held_root); 1110 dm_tm_unlock(pmd->tm, copy); 1111 pmd->need_commit = 1; 1112 1113 return -EBUSY; 1114 } 1115 1116 /* 1117 * Wipe the spacemap since we're not publishing this. 1118 */ 1119 memset(&disk_super->data_space_map_root, 0, 1120 sizeof(disk_super->data_space_map_root)); 1121 memset(&disk_super->metadata_space_map_root, 0, 1122 sizeof(disk_super->metadata_space_map_root)); 1123 1124 /* 1125 * Increment the data structures that need to be preserved. 1126 */ 1127 dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root)); 1128 dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root)); 1129 dm_tm_unlock(pmd->tm, copy); 1130 1131 /* 1132 * Write the held root into the superblock. 1133 */ 1134 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 1135 &sb_validator, &sblock); 1136 if (r) { 1137 dm_tm_dec(pmd->tm, held_root); 1138 pmd->need_commit = 1; 1139 return r; 1140 } 1141 1142 disk_super = dm_block_data(sblock); 1143 disk_super->held_root = cpu_to_le64(held_root); 1144 dm_bm_unlock(sblock); 1145 1146 pmd->need_commit = 1; 1147 1148 return 0; 1149 } 1150 1151 int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd) 1152 { 1153 int r; 1154 1155 down_write(&pmd->root_lock); 1156 r = __reserve_metadata_snap(pmd); 1157 up_write(&pmd->root_lock); 1158 1159 return r; 1160 } 1161 1162 static int __release_metadata_snap(struct dm_pool_metadata *pmd) 1163 { 1164 int r; 1165 struct thin_disk_superblock *disk_super; 1166 struct dm_block *sblock, *copy; 1167 dm_block_t held_root; 1168 1169 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 1170 &sb_validator, &sblock); 1171 if (r) 1172 return r; 1173 1174 disk_super = dm_block_data(sblock); 1175 held_root = le64_to_cpu(disk_super->held_root); 1176 disk_super->held_root = cpu_to_le64(0); 1177 pmd->need_commit = 1; 1178 1179 dm_bm_unlock(sblock); 1180 1181 if (!held_root) { 1182 DMWARN("No pool metadata snapshot found: nothing to release."); 1183 return -EINVAL; 1184 } 1185 1186 r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, ©); 1187 if (r) 1188 return r; 1189 1190 disk_super = dm_block_data(copy); 1191 dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->data_mapping_root)); 1192 dm_sm_dec_block(pmd->metadata_sm, le64_to_cpu(disk_super->device_details_root)); 1193 dm_sm_dec_block(pmd->metadata_sm, held_root); 1194 1195 return dm_tm_unlock(pmd->tm, copy); 1196 } 1197 1198 int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd) 1199 { 1200 int r; 1201 1202 down_write(&pmd->root_lock); 1203 r = __release_metadata_snap(pmd); 1204 up_write(&pmd->root_lock); 1205 1206 return r; 1207 } 1208 1209 static int __get_metadata_snap(struct dm_pool_metadata *pmd, 1210 dm_block_t *result) 1211 { 1212 int r; 1213 struct thin_disk_superblock *disk_super; 1214 struct dm_block *sblock; 1215 1216 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 1217 &sb_validator, &sblock); 1218 if (r) 1219 return r; 1220 1221 disk_super = dm_block_data(sblock); 1222 *result = le64_to_cpu(disk_super->held_root); 1223 1224 return dm_bm_unlock(sblock); 1225 } 1226 1227 int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd, 1228 dm_block_t *result) 1229 { 1230 int r; 1231 1232 down_read(&pmd->root_lock); 1233 r = __get_metadata_snap(pmd, result); 1234 up_read(&pmd->root_lock); 1235 1236 return r; 1237 } 1238 1239 int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, 1240 struct dm_thin_device **td) 1241 { 1242 int r; 1243 1244 down_write(&pmd->root_lock); 1245 r = __open_device(pmd, dev, 0, td); 1246 up_write(&pmd->root_lock); 1247 1248 return r; 1249 } 1250 1251 int dm_pool_close_thin_device(struct dm_thin_device *td) 1252 { 1253 down_write(&td->pmd->root_lock); 1254 __close_device(td); 1255 up_write(&td->pmd->root_lock); 1256 1257 return 0; 1258 } 1259 1260 dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) 1261 { 1262 return td->id; 1263 } 1264 1265 static int __snapshotted_since(struct dm_thin_device *td, uint32_t time) 1266 { 1267 return td->snapshotted_time > time; 1268 } 1269 1270 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, 1271 int can_block, struct dm_thin_lookup_result *result) 1272 { 1273 int r; 1274 uint64_t block_time = 0; 1275 __le64 value; 1276 struct dm_pool_metadata *pmd = td->pmd; 1277 dm_block_t keys[2] = { td->id, block }; 1278 1279 if (can_block) { 1280 down_read(&pmd->root_lock); 1281 r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value); 1282 if (!r) 1283 block_time = le64_to_cpu(value); 1284 up_read(&pmd->root_lock); 1285 1286 } else if (down_read_trylock(&pmd->root_lock)) { 1287 r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value); 1288 if (!r) 1289 block_time = le64_to_cpu(value); 1290 up_read(&pmd->root_lock); 1291 1292 } else 1293 return -EWOULDBLOCK; 1294 1295 if (!r) { 1296 dm_block_t exception_block; 1297 uint32_t exception_time; 1298 unpack_block_time(block_time, &exception_block, 1299 &exception_time); 1300 result->block = exception_block; 1301 result->shared = __snapshotted_since(td, exception_time); 1302 } 1303 1304 return r; 1305 } 1306 1307 static int __insert(struct dm_thin_device *td, dm_block_t block, 1308 dm_block_t data_block) 1309 { 1310 int r, inserted; 1311 __le64 value; 1312 struct dm_pool_metadata *pmd = td->pmd; 1313 dm_block_t keys[2] = { td->id, block }; 1314 1315 pmd->need_commit = 1; 1316 value = cpu_to_le64(pack_block_time(data_block, pmd->time)); 1317 __dm_bless_for_disk(&value); 1318 1319 r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value, 1320 &pmd->root, &inserted); 1321 if (r) 1322 return r; 1323 1324 if (inserted) { 1325 td->mapped_blocks++; 1326 td->changed = 1; 1327 } 1328 1329 return 0; 1330 } 1331 1332 int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, 1333 dm_block_t data_block) 1334 { 1335 int r; 1336 1337 down_write(&td->pmd->root_lock); 1338 r = __insert(td, block, data_block); 1339 up_write(&td->pmd->root_lock); 1340 1341 return r; 1342 } 1343 1344 static int __remove(struct dm_thin_device *td, dm_block_t block) 1345 { 1346 int r; 1347 struct dm_pool_metadata *pmd = td->pmd; 1348 dm_block_t keys[2] = { td->id, block }; 1349 1350 r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root); 1351 if (r) 1352 return r; 1353 1354 td->mapped_blocks--; 1355 td->changed = 1; 1356 pmd->need_commit = 1; 1357 1358 return 0; 1359 } 1360 1361 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) 1362 { 1363 int r; 1364 1365 down_write(&td->pmd->root_lock); 1366 r = __remove(td, block); 1367 up_write(&td->pmd->root_lock); 1368 1369 return r; 1370 } 1371 1372 int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) 1373 { 1374 int r; 1375 1376 down_write(&pmd->root_lock); 1377 1378 r = dm_sm_new_block(pmd->data_sm, result); 1379 pmd->need_commit = 1; 1380 1381 up_write(&pmd->root_lock); 1382 1383 return r; 1384 } 1385 1386 int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) 1387 { 1388 int r; 1389 1390 down_write(&pmd->root_lock); 1391 1392 r = __commit_transaction(pmd); 1393 if (r <= 0) 1394 goto out; 1395 1396 /* 1397 * Open the next transaction. 1398 */ 1399 r = __begin_transaction(pmd); 1400 out: 1401 up_write(&pmd->root_lock); 1402 return r; 1403 } 1404 1405 int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) 1406 { 1407 int r; 1408 1409 down_read(&pmd->root_lock); 1410 r = dm_sm_get_nr_free(pmd->data_sm, result); 1411 up_read(&pmd->root_lock); 1412 1413 return r; 1414 } 1415 1416 int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, 1417 dm_block_t *result) 1418 { 1419 int r; 1420 1421 down_read(&pmd->root_lock); 1422 r = dm_sm_get_nr_free(pmd->metadata_sm, result); 1423 up_read(&pmd->root_lock); 1424 1425 return r; 1426 } 1427 1428 int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, 1429 dm_block_t *result) 1430 { 1431 int r; 1432 1433 down_read(&pmd->root_lock); 1434 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); 1435 up_read(&pmd->root_lock); 1436 1437 return r; 1438 } 1439 1440 int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result) 1441 { 1442 down_read(&pmd->root_lock); 1443 *result = pmd->data_block_size; 1444 up_read(&pmd->root_lock); 1445 1446 return 0; 1447 } 1448 1449 int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) 1450 { 1451 int r; 1452 1453 down_read(&pmd->root_lock); 1454 r = dm_sm_get_nr_blocks(pmd->data_sm, result); 1455 up_read(&pmd->root_lock); 1456 1457 return r; 1458 } 1459 1460 int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) 1461 { 1462 struct dm_pool_metadata *pmd = td->pmd; 1463 1464 down_read(&pmd->root_lock); 1465 *result = td->mapped_blocks; 1466 up_read(&pmd->root_lock); 1467 1468 return 0; 1469 } 1470 1471 static int __highest_block(struct dm_thin_device *td, dm_block_t *result) 1472 { 1473 int r; 1474 __le64 value_le; 1475 dm_block_t thin_root; 1476 struct dm_pool_metadata *pmd = td->pmd; 1477 1478 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le); 1479 if (r) 1480 return r; 1481 1482 thin_root = le64_to_cpu(value_le); 1483 1484 return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result); 1485 } 1486 1487 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, 1488 dm_block_t *result) 1489 { 1490 int r; 1491 struct dm_pool_metadata *pmd = td->pmd; 1492 1493 down_read(&pmd->root_lock); 1494 r = __highest_block(td, result); 1495 up_read(&pmd->root_lock); 1496 1497 return r; 1498 } 1499 1500 static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) 1501 { 1502 int r; 1503 dm_block_t old_count; 1504 1505 r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count); 1506 if (r) 1507 return r; 1508 1509 if (new_count == old_count) 1510 return 0; 1511 1512 if (new_count < old_count) { 1513 DMERR("cannot reduce size of data device"); 1514 return -EINVAL; 1515 } 1516 1517 r = dm_sm_extend(pmd->data_sm, new_count - old_count); 1518 if (!r) 1519 pmd->need_commit = 1; 1520 1521 return r; 1522 } 1523 1524 int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) 1525 { 1526 int r; 1527 1528 down_write(&pmd->root_lock); 1529 r = __resize_data_dev(pmd, new_count); 1530 up_write(&pmd->root_lock); 1531 1532 return r; 1533 } 1534