1 /* 2 * Copyright (C) 2011 Red Hat, Inc. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm-thin-metadata.h" 8 #include "persistent-data/dm-btree.h" 9 #include "persistent-data/dm-space-map.h" 10 #include "persistent-data/dm-space-map-disk.h" 11 #include "persistent-data/dm-transaction-manager.h" 12 13 #include <linux/list.h> 14 #include <linux/device-mapper.h> 15 #include <linux/workqueue.h> 16 17 /*-------------------------------------------------------------------------- 18 * As far as the metadata goes, there is: 19 * 20 * - A superblock in block zero, taking up fewer than 512 bytes for 21 * atomic writes. 22 * 23 * - A space map managing the metadata blocks. 24 * 25 * - A space map managing the data blocks. 26 * 27 * - A btree mapping our internal thin dev ids onto struct disk_device_details. 28 * 29 * - A hierarchical btree, with 2 levels which effectively maps (thin 30 * dev id, virtual block) -> block_time. Block time is a 64-bit 31 * field holding the time in the low 24 bits, and block in the top 48 32 * bits. 33 * 34 * BTrees consist solely of btree_nodes, that fill a block. Some are 35 * internal nodes, as such their values are a __le64 pointing to other 36 * nodes. Leaf nodes can store data of any reasonable size (ie. much 37 * smaller than the block size). The nodes consist of the header, 38 * followed by an array of keys, followed by an array of values. We have 39 * to binary search on the keys so they're all held together to help the 40 * cpu cache. 41 * 42 * Space maps have 2 btrees: 43 * 44 * - One maps a uint64_t onto a struct index_entry. Which points to a 45 * bitmap block, and has some details about how many free entries there 46 * are etc. 47 * 48 * - The bitmap blocks have a header (for the checksum). Then the rest 49 * of the block is pairs of bits. With the meaning being: 50 * 51 * 0 - ref count is 0 52 * 1 - ref count is 1 53 * 2 - ref count is 2 54 * 3 - ref count is higher than 2 55 * 56 * - If the count is higher than 2 then the ref count is entered in a 57 * second btree that directly maps the block_address to a uint32_t ref 58 * count. 59 * 60 * The space map metadata variant doesn't have a bitmaps btree. Instead 61 * it has one single blocks worth of index_entries. This avoids 62 * recursive issues with the bitmap btree needing to allocate space in 63 * order to insert. With a small data block size such as 64k the 64 * metadata support data devices that are hundreds of terrabytes. 65 * 66 * The space maps allocate space linearly from front to back. Space that 67 * is freed in a transaction is never recycled within that transaction. 68 * To try and avoid fragmenting _free_ space the allocator always goes 69 * back and fills in gaps. 70 * 71 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks 72 * from the block manager. 73 *--------------------------------------------------------------------------*/ 74 75 #define DM_MSG_PREFIX "thin metadata" 76 77 #define THIN_SUPERBLOCK_MAGIC 27022010 78 #define THIN_SUPERBLOCK_LOCATION 0 79 #define THIN_VERSION 1 80 #define THIN_METADATA_CACHE_SIZE 64 81 #define SECTOR_TO_BLOCK_SHIFT 3 82 83 /* This should be plenty */ 84 #define SPACE_MAP_ROOT_SIZE 128 85 86 /* 87 * Little endian on-disk superblock and device details. 88 */ 89 struct thin_disk_superblock { 90 __le32 csum; /* Checksum of superblock except for this field. */ 91 __le32 flags; 92 __le64 blocknr; /* This block number, dm_block_t. */ 93 94 __u8 uuid[16]; 95 __le64 magic; 96 __le32 version; 97 __le32 time; 98 99 __le64 trans_id; 100 101 /* 102 * Root held by userspace transactions. 103 */ 104 __le64 held_root; 105 106 __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE]; 107 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; 108 109 /* 110 * 2-level btree mapping (dev_id, (dev block, time)) -> data block 111 */ 112 __le64 data_mapping_root; 113 114 /* 115 * Device detail root mapping dev_id -> device_details 116 */ 117 __le64 device_details_root; 118 119 __le32 data_block_size; /* In 512-byte sectors. */ 120 121 __le32 metadata_block_size; /* In 512-byte sectors. */ 122 __le64 metadata_nr_blocks; 123 124 __le32 compat_flags; 125 __le32 compat_ro_flags; 126 __le32 incompat_flags; 127 } __packed; 128 129 struct disk_device_details { 130 __le64 mapped_blocks; 131 __le64 transaction_id; /* When created. */ 132 __le32 creation_time; 133 __le32 snapshotted_time; 134 } __packed; 135 136 struct dm_pool_metadata { 137 struct hlist_node hash; 138 139 struct block_device *bdev; 140 struct dm_block_manager *bm; 141 struct dm_space_map *metadata_sm; 142 struct dm_space_map *data_sm; 143 struct dm_transaction_manager *tm; 144 struct dm_transaction_manager *nb_tm; 145 146 /* 147 * Two-level btree. 148 * First level holds thin_dev_t. 149 * Second level holds mappings. 150 */ 151 struct dm_btree_info info; 152 153 /* 154 * Non-blocking version of the above. 155 */ 156 struct dm_btree_info nb_info; 157 158 /* 159 * Just the top level for deleting whole devices. 160 */ 161 struct dm_btree_info tl_info; 162 163 /* 164 * Just the bottom level for creating new devices. 165 */ 166 struct dm_btree_info bl_info; 167 168 /* 169 * Describes the device details btree. 170 */ 171 struct dm_btree_info details_info; 172 173 struct rw_semaphore root_lock; 174 uint32_t time; 175 int need_commit; 176 dm_block_t root; 177 dm_block_t details_root; 178 struct list_head thin_devices; 179 uint64_t trans_id; 180 unsigned long flags; 181 sector_t data_block_size; 182 }; 183 184 struct dm_thin_device { 185 struct list_head list; 186 struct dm_pool_metadata *pmd; 187 dm_thin_id id; 188 189 int open_count; 190 int changed; 191 uint64_t mapped_blocks; 192 uint64_t transaction_id; 193 uint32_t creation_time; 194 uint32_t snapshotted_time; 195 }; 196 197 /*---------------------------------------------------------------- 198 * superblock validator 199 *--------------------------------------------------------------*/ 200 201 #define SUPERBLOCK_CSUM_XOR 160774 202 203 static void sb_prepare_for_write(struct dm_block_validator *v, 204 struct dm_block *b, 205 size_t block_size) 206 { 207 struct thin_disk_superblock *disk_super = dm_block_data(b); 208 209 disk_super->blocknr = cpu_to_le64(dm_block_location(b)); 210 disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, 211 block_size - sizeof(__le32), 212 SUPERBLOCK_CSUM_XOR)); 213 } 214 215 static int sb_check(struct dm_block_validator *v, 216 struct dm_block *b, 217 size_t block_size) 218 { 219 struct thin_disk_superblock *disk_super = dm_block_data(b); 220 __le32 csum_le; 221 222 if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { 223 DMERR("sb_check failed: blocknr %llu: " 224 "wanted %llu", le64_to_cpu(disk_super->blocknr), 225 (unsigned long long)dm_block_location(b)); 226 return -ENOTBLK; 227 } 228 229 if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) { 230 DMERR("sb_check failed: magic %llu: " 231 "wanted %llu", le64_to_cpu(disk_super->magic), 232 (unsigned long long)THIN_SUPERBLOCK_MAGIC); 233 return -EILSEQ; 234 } 235 236 csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, 237 block_size - sizeof(__le32), 238 SUPERBLOCK_CSUM_XOR)); 239 if (csum_le != disk_super->csum) { 240 DMERR("sb_check failed: csum %u: wanted %u", 241 le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); 242 return -EILSEQ; 243 } 244 245 return 0; 246 } 247 248 static struct dm_block_validator sb_validator = { 249 .name = "superblock", 250 .prepare_for_write = sb_prepare_for_write, 251 .check = sb_check 252 }; 253 254 /*---------------------------------------------------------------- 255 * Methods for the btree value types 256 *--------------------------------------------------------------*/ 257 258 static uint64_t pack_block_time(dm_block_t b, uint32_t t) 259 { 260 return (b << 24) | t; 261 } 262 263 static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) 264 { 265 *b = v >> 24; 266 *t = v & ((1 << 24) - 1); 267 } 268 269 static void data_block_inc(void *context, void *value_le) 270 { 271 struct dm_space_map *sm = context; 272 __le64 v_le; 273 uint64_t b; 274 uint32_t t; 275 276 memcpy(&v_le, value_le, sizeof(v_le)); 277 unpack_block_time(le64_to_cpu(v_le), &b, &t); 278 dm_sm_inc_block(sm, b); 279 } 280 281 static void data_block_dec(void *context, void *value_le) 282 { 283 struct dm_space_map *sm = context; 284 __le64 v_le; 285 uint64_t b; 286 uint32_t t; 287 288 memcpy(&v_le, value_le, sizeof(v_le)); 289 unpack_block_time(le64_to_cpu(v_le), &b, &t); 290 dm_sm_dec_block(sm, b); 291 } 292 293 static int data_block_equal(void *context, void *value1_le, void *value2_le) 294 { 295 __le64 v1_le, v2_le; 296 uint64_t b1, b2; 297 uint32_t t; 298 299 memcpy(&v1_le, value1_le, sizeof(v1_le)); 300 memcpy(&v2_le, value2_le, sizeof(v2_le)); 301 unpack_block_time(le64_to_cpu(v1_le), &b1, &t); 302 unpack_block_time(le64_to_cpu(v2_le), &b2, &t); 303 304 return b1 == b2; 305 } 306 307 static void subtree_inc(void *context, void *value) 308 { 309 struct dm_btree_info *info = context; 310 __le64 root_le; 311 uint64_t root; 312 313 memcpy(&root_le, value, sizeof(root_le)); 314 root = le64_to_cpu(root_le); 315 dm_tm_inc(info->tm, root); 316 } 317 318 static void subtree_dec(void *context, void *value) 319 { 320 struct dm_btree_info *info = context; 321 __le64 root_le; 322 uint64_t root; 323 324 memcpy(&root_le, value, sizeof(root_le)); 325 root = le64_to_cpu(root_le); 326 if (dm_btree_del(info, root)) 327 DMERR("btree delete failed\n"); 328 } 329 330 static int subtree_equal(void *context, void *value1_le, void *value2_le) 331 { 332 __le64 v1_le, v2_le; 333 memcpy(&v1_le, value1_le, sizeof(v1_le)); 334 memcpy(&v2_le, value2_le, sizeof(v2_le)); 335 336 return v1_le == v2_le; 337 } 338 339 /*----------------------------------------------------------------*/ 340 341 static int superblock_all_zeroes(struct dm_block_manager *bm, int *result) 342 { 343 int r; 344 unsigned i; 345 struct dm_block *b; 346 __le64 *data_le, zero = cpu_to_le64(0); 347 unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64); 348 349 /* 350 * We can't use a validator here - it may be all zeroes. 351 */ 352 r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b); 353 if (r) 354 return r; 355 356 data_le = dm_block_data(b); 357 *result = 1; 358 for (i = 0; i < block_size; i++) { 359 if (data_le[i] != zero) { 360 *result = 0; 361 break; 362 } 363 } 364 365 return dm_bm_unlock(b); 366 } 367 368 static int init_pmd(struct dm_pool_metadata *pmd, 369 struct dm_block_manager *bm, 370 dm_block_t nr_blocks, int create) 371 { 372 int r; 373 struct dm_space_map *sm, *data_sm; 374 struct dm_transaction_manager *tm; 375 struct dm_block *sblock; 376 377 if (create) { 378 r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION, 379 &sb_validator, &tm, &sm, &sblock); 380 if (r < 0) { 381 DMERR("tm_create_with_sm failed"); 382 return r; 383 } 384 385 data_sm = dm_sm_disk_create(tm, nr_blocks); 386 if (IS_ERR(data_sm)) { 387 DMERR("sm_disk_create failed"); 388 r = PTR_ERR(data_sm); 389 goto bad; 390 } 391 } else { 392 struct thin_disk_superblock *disk_super = NULL; 393 size_t space_map_root_offset = 394 offsetof(struct thin_disk_superblock, metadata_space_map_root); 395 396 r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION, 397 &sb_validator, space_map_root_offset, 398 SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock); 399 if (r < 0) { 400 DMERR("tm_open_with_sm failed"); 401 return r; 402 } 403 404 disk_super = dm_block_data(sblock); 405 data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root, 406 sizeof(disk_super->data_space_map_root)); 407 if (IS_ERR(data_sm)) { 408 DMERR("sm_disk_open failed"); 409 r = PTR_ERR(data_sm); 410 goto bad; 411 } 412 } 413 414 415 r = dm_tm_unlock(tm, sblock); 416 if (r < 0) { 417 DMERR("couldn't unlock superblock"); 418 goto bad_data_sm; 419 } 420 421 pmd->bm = bm; 422 pmd->metadata_sm = sm; 423 pmd->data_sm = data_sm; 424 pmd->tm = tm; 425 pmd->nb_tm = dm_tm_create_non_blocking_clone(tm); 426 if (!pmd->nb_tm) { 427 DMERR("could not create clone tm"); 428 r = -ENOMEM; 429 goto bad_data_sm; 430 } 431 432 pmd->info.tm = tm; 433 pmd->info.levels = 2; 434 pmd->info.value_type.context = pmd->data_sm; 435 pmd->info.value_type.size = sizeof(__le64); 436 pmd->info.value_type.inc = data_block_inc; 437 pmd->info.value_type.dec = data_block_dec; 438 pmd->info.value_type.equal = data_block_equal; 439 440 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); 441 pmd->nb_info.tm = pmd->nb_tm; 442 443 pmd->tl_info.tm = tm; 444 pmd->tl_info.levels = 1; 445 pmd->tl_info.value_type.context = &pmd->info; 446 pmd->tl_info.value_type.size = sizeof(__le64); 447 pmd->tl_info.value_type.inc = subtree_inc; 448 pmd->tl_info.value_type.dec = subtree_dec; 449 pmd->tl_info.value_type.equal = subtree_equal; 450 451 pmd->bl_info.tm = tm; 452 pmd->bl_info.levels = 1; 453 pmd->bl_info.value_type.context = pmd->data_sm; 454 pmd->bl_info.value_type.size = sizeof(__le64); 455 pmd->bl_info.value_type.inc = data_block_inc; 456 pmd->bl_info.value_type.dec = data_block_dec; 457 pmd->bl_info.value_type.equal = data_block_equal; 458 459 pmd->details_info.tm = tm; 460 pmd->details_info.levels = 1; 461 pmd->details_info.value_type.context = NULL; 462 pmd->details_info.value_type.size = sizeof(struct disk_device_details); 463 pmd->details_info.value_type.inc = NULL; 464 pmd->details_info.value_type.dec = NULL; 465 pmd->details_info.value_type.equal = NULL; 466 467 pmd->root = 0; 468 469 init_rwsem(&pmd->root_lock); 470 pmd->time = 0; 471 pmd->need_commit = 0; 472 pmd->details_root = 0; 473 pmd->trans_id = 0; 474 pmd->flags = 0; 475 INIT_LIST_HEAD(&pmd->thin_devices); 476 477 return 0; 478 479 bad_data_sm: 480 dm_sm_destroy(data_sm); 481 bad: 482 dm_tm_destroy(tm); 483 dm_sm_destroy(sm); 484 485 return r; 486 } 487 488 static int __begin_transaction(struct dm_pool_metadata *pmd) 489 { 490 int r; 491 u32 features; 492 struct thin_disk_superblock *disk_super; 493 struct dm_block *sblock; 494 495 /* 496 * __maybe_commit_transaction() resets these 497 */ 498 WARN_ON(pmd->need_commit); 499 500 /* 501 * We re-read the superblock every time. Shouldn't need to do this 502 * really. 503 */ 504 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 505 &sb_validator, &sblock); 506 if (r) 507 return r; 508 509 disk_super = dm_block_data(sblock); 510 pmd->time = le32_to_cpu(disk_super->time); 511 pmd->root = le64_to_cpu(disk_super->data_mapping_root); 512 pmd->details_root = le64_to_cpu(disk_super->device_details_root); 513 pmd->trans_id = le64_to_cpu(disk_super->trans_id); 514 pmd->flags = le32_to_cpu(disk_super->flags); 515 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); 516 517 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP; 518 if (features) { 519 DMERR("could not access metadata due to " 520 "unsupported optional features (%lx).", 521 (unsigned long)features); 522 r = -EINVAL; 523 goto out; 524 } 525 526 /* 527 * Check for read-only metadata to skip the following RDWR checks. 528 */ 529 if (get_disk_ro(pmd->bdev->bd_disk)) 530 goto out; 531 532 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; 533 if (features) { 534 DMERR("could not access metadata RDWR due to " 535 "unsupported optional features (%lx).", 536 (unsigned long)features); 537 r = -EINVAL; 538 } 539 540 out: 541 dm_bm_unlock(sblock); 542 return r; 543 } 544 545 static int __write_changed_details(struct dm_pool_metadata *pmd) 546 { 547 int r; 548 struct dm_thin_device *td, *tmp; 549 struct disk_device_details details; 550 uint64_t key; 551 552 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { 553 if (!td->changed) 554 continue; 555 556 key = td->id; 557 558 details.mapped_blocks = cpu_to_le64(td->mapped_blocks); 559 details.transaction_id = cpu_to_le64(td->transaction_id); 560 details.creation_time = cpu_to_le32(td->creation_time); 561 details.snapshotted_time = cpu_to_le32(td->snapshotted_time); 562 __dm_bless_for_disk(&details); 563 564 r = dm_btree_insert(&pmd->details_info, pmd->details_root, 565 &key, &details, &pmd->details_root); 566 if (r) 567 return r; 568 569 if (td->open_count) 570 td->changed = 0; 571 else { 572 list_del(&td->list); 573 kfree(td); 574 } 575 576 pmd->need_commit = 1; 577 } 578 579 return 0; 580 } 581 582 static int __commit_transaction(struct dm_pool_metadata *pmd) 583 { 584 /* 585 * FIXME: Associated pool should be made read-only on failure. 586 */ 587 int r; 588 size_t metadata_len, data_len; 589 struct thin_disk_superblock *disk_super; 590 struct dm_block *sblock; 591 592 /* 593 * We need to know if the thin_disk_superblock exceeds a 512-byte sector. 594 */ 595 BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); 596 597 r = __write_changed_details(pmd); 598 if (r < 0) 599 goto out; 600 601 if (!pmd->need_commit) 602 goto out; 603 604 r = dm_sm_commit(pmd->data_sm); 605 if (r < 0) 606 goto out; 607 608 r = dm_tm_pre_commit(pmd->tm); 609 if (r < 0) 610 goto out; 611 612 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); 613 if (r < 0) 614 goto out; 615 616 r = dm_sm_root_size(pmd->metadata_sm, &data_len); 617 if (r < 0) 618 goto out; 619 620 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 621 &sb_validator, &sblock); 622 if (r) 623 goto out; 624 625 disk_super = dm_block_data(sblock); 626 disk_super->time = cpu_to_le32(pmd->time); 627 disk_super->data_mapping_root = cpu_to_le64(pmd->root); 628 disk_super->device_details_root = cpu_to_le64(pmd->details_root); 629 disk_super->trans_id = cpu_to_le64(pmd->trans_id); 630 disk_super->flags = cpu_to_le32(pmd->flags); 631 632 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, 633 metadata_len); 634 if (r < 0) 635 goto out_locked; 636 637 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, 638 data_len); 639 if (r < 0) 640 goto out_locked; 641 642 r = dm_tm_commit(pmd->tm, sblock); 643 if (!r) 644 pmd->need_commit = 0; 645 646 out: 647 return r; 648 649 out_locked: 650 dm_bm_unlock(sblock); 651 return r; 652 } 653 654 struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, 655 sector_t data_block_size) 656 { 657 int r; 658 struct thin_disk_superblock *disk_super; 659 struct dm_pool_metadata *pmd; 660 sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; 661 struct dm_block_manager *bm; 662 int create; 663 struct dm_block *sblock; 664 665 pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); 666 if (!pmd) { 667 DMERR("could not allocate metadata struct"); 668 return ERR_PTR(-ENOMEM); 669 } 670 671 /* 672 * Max hex locks: 673 * 3 for btree insert + 674 * 2 for btree lookup used within space map 675 */ 676 bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE, 677 THIN_METADATA_CACHE_SIZE, 5); 678 if (!bm) { 679 DMERR("could not create block manager"); 680 kfree(pmd); 681 return ERR_PTR(-ENOMEM); 682 } 683 684 r = superblock_all_zeroes(bm, &create); 685 if (r) { 686 dm_block_manager_destroy(bm); 687 kfree(pmd); 688 return ERR_PTR(r); 689 } 690 691 692 r = init_pmd(pmd, bm, 0, create); 693 if (r) { 694 dm_block_manager_destroy(bm); 695 kfree(pmd); 696 return ERR_PTR(r); 697 } 698 pmd->bdev = bdev; 699 700 if (!create) { 701 r = __begin_transaction(pmd); 702 if (r < 0) 703 goto bad; 704 return pmd; 705 } 706 707 /* 708 * Create. 709 */ 710 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 711 &sb_validator, &sblock); 712 if (r) 713 goto bad; 714 715 disk_super = dm_block_data(sblock); 716 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); 717 disk_super->version = cpu_to_le32(THIN_VERSION); 718 disk_super->time = 0; 719 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 720 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); 721 disk_super->data_block_size = cpu_to_le32(data_block_size); 722 723 r = dm_bm_unlock(sblock); 724 if (r < 0) 725 goto bad; 726 727 r = dm_btree_empty(&pmd->info, &pmd->root); 728 if (r < 0) 729 goto bad; 730 731 r = dm_btree_empty(&pmd->details_info, &pmd->details_root); 732 if (r < 0) { 733 DMERR("couldn't create devices root"); 734 goto bad; 735 } 736 737 pmd->flags = 0; 738 pmd->need_commit = 1; 739 r = dm_pool_commit_metadata(pmd); 740 if (r < 0) { 741 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 742 __func__, r); 743 goto bad; 744 } 745 746 return pmd; 747 748 bad: 749 if (dm_pool_metadata_close(pmd) < 0) 750 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 751 return ERR_PTR(r); 752 } 753 754 int dm_pool_metadata_close(struct dm_pool_metadata *pmd) 755 { 756 int r; 757 unsigned open_devices = 0; 758 struct dm_thin_device *td, *tmp; 759 760 down_read(&pmd->root_lock); 761 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { 762 if (td->open_count) 763 open_devices++; 764 else { 765 list_del(&td->list); 766 kfree(td); 767 } 768 } 769 up_read(&pmd->root_lock); 770 771 if (open_devices) { 772 DMERR("attempt to close pmd when %u device(s) are still open", 773 open_devices); 774 return -EBUSY; 775 } 776 777 r = __commit_transaction(pmd); 778 if (r < 0) 779 DMWARN("%s: __commit_transaction() failed, error = %d", 780 __func__, r); 781 782 dm_tm_destroy(pmd->tm); 783 dm_tm_destroy(pmd->nb_tm); 784 dm_block_manager_destroy(pmd->bm); 785 dm_sm_destroy(pmd->metadata_sm); 786 dm_sm_destroy(pmd->data_sm); 787 kfree(pmd); 788 789 return 0; 790 } 791 792 static int __open_device(struct dm_pool_metadata *pmd, 793 dm_thin_id dev, int create, 794 struct dm_thin_device **td) 795 { 796 int r, changed = 0; 797 struct dm_thin_device *td2; 798 uint64_t key = dev; 799 struct disk_device_details details_le; 800 801 /* 802 * Check the device isn't already open. 803 */ 804 list_for_each_entry(td2, &pmd->thin_devices, list) 805 if (td2->id == dev) { 806 td2->open_count++; 807 *td = td2; 808 return 0; 809 } 810 811 /* 812 * Check the device exists. 813 */ 814 r = dm_btree_lookup(&pmd->details_info, pmd->details_root, 815 &key, &details_le); 816 if (r) { 817 if (r != -ENODATA || !create) 818 return r; 819 820 changed = 1; 821 details_le.mapped_blocks = 0; 822 details_le.transaction_id = cpu_to_le64(pmd->trans_id); 823 details_le.creation_time = cpu_to_le32(pmd->time); 824 details_le.snapshotted_time = cpu_to_le32(pmd->time); 825 } 826 827 *td = kmalloc(sizeof(**td), GFP_NOIO); 828 if (!*td) 829 return -ENOMEM; 830 831 (*td)->pmd = pmd; 832 (*td)->id = dev; 833 (*td)->open_count = 1; 834 (*td)->changed = changed; 835 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); 836 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); 837 (*td)->creation_time = le32_to_cpu(details_le.creation_time); 838 (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time); 839 840 list_add(&(*td)->list, &pmd->thin_devices); 841 842 return 0; 843 } 844 845 static void __close_device(struct dm_thin_device *td) 846 { 847 --td->open_count; 848 } 849 850 static int __create_thin(struct dm_pool_metadata *pmd, 851 dm_thin_id dev) 852 { 853 int r; 854 dm_block_t dev_root; 855 uint64_t key = dev; 856 struct disk_device_details details_le; 857 struct dm_thin_device *td; 858 __le64 value; 859 860 r = dm_btree_lookup(&pmd->details_info, pmd->details_root, 861 &key, &details_le); 862 if (!r) 863 return -EEXIST; 864 865 /* 866 * Create an empty btree for the mappings. 867 */ 868 r = dm_btree_empty(&pmd->bl_info, &dev_root); 869 if (r) 870 return r; 871 872 /* 873 * Insert it into the main mapping tree. 874 */ 875 value = cpu_to_le64(dev_root); 876 __dm_bless_for_disk(&value); 877 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); 878 if (r) { 879 dm_btree_del(&pmd->bl_info, dev_root); 880 return r; 881 } 882 883 r = __open_device(pmd, dev, 1, &td); 884 if (r) { 885 __close_device(td); 886 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 887 dm_btree_del(&pmd->bl_info, dev_root); 888 return r; 889 } 890 td->changed = 1; 891 __close_device(td); 892 893 return r; 894 } 895 896 int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) 897 { 898 int r; 899 900 down_write(&pmd->root_lock); 901 r = __create_thin(pmd, dev); 902 up_write(&pmd->root_lock); 903 904 return r; 905 } 906 907 static int __set_snapshot_details(struct dm_pool_metadata *pmd, 908 struct dm_thin_device *snap, 909 dm_thin_id origin, uint32_t time) 910 { 911 int r; 912 struct dm_thin_device *td; 913 914 r = __open_device(pmd, origin, 0, &td); 915 if (r) 916 return r; 917 918 td->changed = 1; 919 td->snapshotted_time = time; 920 921 snap->mapped_blocks = td->mapped_blocks; 922 snap->snapshotted_time = time; 923 __close_device(td); 924 925 return 0; 926 } 927 928 static int __create_snap(struct dm_pool_metadata *pmd, 929 dm_thin_id dev, dm_thin_id origin) 930 { 931 int r; 932 dm_block_t origin_root; 933 uint64_t key = origin, dev_key = dev; 934 struct dm_thin_device *td; 935 struct disk_device_details details_le; 936 __le64 value; 937 938 /* check this device is unused */ 939 r = dm_btree_lookup(&pmd->details_info, pmd->details_root, 940 &dev_key, &details_le); 941 if (!r) 942 return -EEXIST; 943 944 /* find the mapping tree for the origin */ 945 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value); 946 if (r) 947 return r; 948 origin_root = le64_to_cpu(value); 949 950 /* clone the origin, an inc will do */ 951 dm_tm_inc(pmd->tm, origin_root); 952 953 /* insert into the main mapping tree */ 954 value = cpu_to_le64(origin_root); 955 __dm_bless_for_disk(&value); 956 key = dev; 957 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); 958 if (r) { 959 dm_tm_dec(pmd->tm, origin_root); 960 return r; 961 } 962 963 pmd->time++; 964 965 r = __open_device(pmd, dev, 1, &td); 966 if (r) 967 goto bad; 968 969 r = __set_snapshot_details(pmd, td, origin, pmd->time); 970 if (r) 971 goto bad; 972 973 __close_device(td); 974 return 0; 975 976 bad: 977 __close_device(td); 978 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 979 dm_btree_remove(&pmd->details_info, pmd->details_root, 980 &key, &pmd->details_root); 981 return r; 982 } 983 984 int dm_pool_create_snap(struct dm_pool_metadata *pmd, 985 dm_thin_id dev, 986 dm_thin_id origin) 987 { 988 int r; 989 990 down_write(&pmd->root_lock); 991 r = __create_snap(pmd, dev, origin); 992 up_write(&pmd->root_lock); 993 994 return r; 995 } 996 997 static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev) 998 { 999 int r; 1000 uint64_t key = dev; 1001 struct dm_thin_device *td; 1002 1003 /* TODO: failure should mark the transaction invalid */ 1004 r = __open_device(pmd, dev, 0, &td); 1005 if (r) 1006 return r; 1007 1008 if (td->open_count > 1) { 1009 __close_device(td); 1010 return -EBUSY; 1011 } 1012 1013 list_del(&td->list); 1014 kfree(td); 1015 r = dm_btree_remove(&pmd->details_info, pmd->details_root, 1016 &key, &pmd->details_root); 1017 if (r) 1018 return r; 1019 1020 r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 1021 if (r) 1022 return r; 1023 1024 pmd->need_commit = 1; 1025 1026 return 0; 1027 } 1028 1029 int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, 1030 dm_thin_id dev) 1031 { 1032 int r; 1033 1034 down_write(&pmd->root_lock); 1035 r = __delete_device(pmd, dev); 1036 up_write(&pmd->root_lock); 1037 1038 return r; 1039 } 1040 1041 int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, 1042 uint64_t current_id, 1043 uint64_t new_id) 1044 { 1045 down_write(&pmd->root_lock); 1046 if (pmd->trans_id != current_id) { 1047 up_write(&pmd->root_lock); 1048 DMERR("mismatched transaction id"); 1049 return -EINVAL; 1050 } 1051 1052 pmd->trans_id = new_id; 1053 pmd->need_commit = 1; 1054 up_write(&pmd->root_lock); 1055 1056 return 0; 1057 } 1058 1059 int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, 1060 uint64_t *result) 1061 { 1062 down_read(&pmd->root_lock); 1063 *result = pmd->trans_id; 1064 up_read(&pmd->root_lock); 1065 1066 return 0; 1067 } 1068 1069 static int __get_held_metadata_root(struct dm_pool_metadata *pmd, 1070 dm_block_t *result) 1071 { 1072 int r; 1073 struct thin_disk_superblock *disk_super; 1074 struct dm_block *sblock; 1075 1076 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 1077 &sb_validator, &sblock); 1078 if (r) 1079 return r; 1080 1081 disk_super = dm_block_data(sblock); 1082 *result = le64_to_cpu(disk_super->held_root); 1083 1084 return dm_bm_unlock(sblock); 1085 } 1086 1087 int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, 1088 dm_block_t *result) 1089 { 1090 int r; 1091 1092 down_read(&pmd->root_lock); 1093 r = __get_held_metadata_root(pmd, result); 1094 up_read(&pmd->root_lock); 1095 1096 return r; 1097 } 1098 1099 int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, 1100 struct dm_thin_device **td) 1101 { 1102 int r; 1103 1104 down_write(&pmd->root_lock); 1105 r = __open_device(pmd, dev, 0, td); 1106 up_write(&pmd->root_lock); 1107 1108 return r; 1109 } 1110 1111 int dm_pool_close_thin_device(struct dm_thin_device *td) 1112 { 1113 down_write(&td->pmd->root_lock); 1114 __close_device(td); 1115 up_write(&td->pmd->root_lock); 1116 1117 return 0; 1118 } 1119 1120 dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) 1121 { 1122 return td->id; 1123 } 1124 1125 static int __snapshotted_since(struct dm_thin_device *td, uint32_t time) 1126 { 1127 return td->snapshotted_time > time; 1128 } 1129 1130 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, 1131 int can_block, struct dm_thin_lookup_result *result) 1132 { 1133 int r; 1134 uint64_t block_time = 0; 1135 __le64 value; 1136 struct dm_pool_metadata *pmd = td->pmd; 1137 dm_block_t keys[2] = { td->id, block }; 1138 1139 if (can_block) { 1140 down_read(&pmd->root_lock); 1141 r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value); 1142 if (!r) 1143 block_time = le64_to_cpu(value); 1144 up_read(&pmd->root_lock); 1145 1146 } else if (down_read_trylock(&pmd->root_lock)) { 1147 r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value); 1148 if (!r) 1149 block_time = le64_to_cpu(value); 1150 up_read(&pmd->root_lock); 1151 1152 } else 1153 return -EWOULDBLOCK; 1154 1155 if (!r) { 1156 dm_block_t exception_block; 1157 uint32_t exception_time; 1158 unpack_block_time(block_time, &exception_block, 1159 &exception_time); 1160 result->block = exception_block; 1161 result->shared = __snapshotted_since(td, exception_time); 1162 } 1163 1164 return r; 1165 } 1166 1167 static int __insert(struct dm_thin_device *td, dm_block_t block, 1168 dm_block_t data_block) 1169 { 1170 int r, inserted; 1171 __le64 value; 1172 struct dm_pool_metadata *pmd = td->pmd; 1173 dm_block_t keys[2] = { td->id, block }; 1174 1175 pmd->need_commit = 1; 1176 value = cpu_to_le64(pack_block_time(data_block, pmd->time)); 1177 __dm_bless_for_disk(&value); 1178 1179 r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value, 1180 &pmd->root, &inserted); 1181 if (r) 1182 return r; 1183 1184 if (inserted) { 1185 td->mapped_blocks++; 1186 td->changed = 1; 1187 } 1188 1189 return 0; 1190 } 1191 1192 int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, 1193 dm_block_t data_block) 1194 { 1195 int r; 1196 1197 down_write(&td->pmd->root_lock); 1198 r = __insert(td, block, data_block); 1199 up_write(&td->pmd->root_lock); 1200 1201 return r; 1202 } 1203 1204 static int __remove(struct dm_thin_device *td, dm_block_t block) 1205 { 1206 int r; 1207 struct dm_pool_metadata *pmd = td->pmd; 1208 dm_block_t keys[2] = { td->id, block }; 1209 1210 r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root); 1211 if (r) 1212 return r; 1213 1214 pmd->need_commit = 1; 1215 1216 return 0; 1217 } 1218 1219 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) 1220 { 1221 int r; 1222 1223 down_write(&td->pmd->root_lock); 1224 r = __remove(td, block); 1225 up_write(&td->pmd->root_lock); 1226 1227 return r; 1228 } 1229 1230 int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) 1231 { 1232 int r; 1233 1234 down_write(&pmd->root_lock); 1235 1236 r = dm_sm_new_block(pmd->data_sm, result); 1237 pmd->need_commit = 1; 1238 1239 up_write(&pmd->root_lock); 1240 1241 return r; 1242 } 1243 1244 int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) 1245 { 1246 int r; 1247 1248 down_write(&pmd->root_lock); 1249 1250 r = __commit_transaction(pmd); 1251 if (r <= 0) 1252 goto out; 1253 1254 /* 1255 * Open the next transaction. 1256 */ 1257 r = __begin_transaction(pmd); 1258 out: 1259 up_write(&pmd->root_lock); 1260 return r; 1261 } 1262 1263 int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) 1264 { 1265 int r; 1266 1267 down_read(&pmd->root_lock); 1268 r = dm_sm_get_nr_free(pmd->data_sm, result); 1269 up_read(&pmd->root_lock); 1270 1271 return r; 1272 } 1273 1274 int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, 1275 dm_block_t *result) 1276 { 1277 int r; 1278 1279 down_read(&pmd->root_lock); 1280 r = dm_sm_get_nr_free(pmd->metadata_sm, result); 1281 up_read(&pmd->root_lock); 1282 1283 return r; 1284 } 1285 1286 int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, 1287 dm_block_t *result) 1288 { 1289 int r; 1290 1291 down_read(&pmd->root_lock); 1292 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); 1293 up_read(&pmd->root_lock); 1294 1295 return r; 1296 } 1297 1298 int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result) 1299 { 1300 down_read(&pmd->root_lock); 1301 *result = pmd->data_block_size; 1302 up_read(&pmd->root_lock); 1303 1304 return 0; 1305 } 1306 1307 int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) 1308 { 1309 int r; 1310 1311 down_read(&pmd->root_lock); 1312 r = dm_sm_get_nr_blocks(pmd->data_sm, result); 1313 up_read(&pmd->root_lock); 1314 1315 return r; 1316 } 1317 1318 int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) 1319 { 1320 struct dm_pool_metadata *pmd = td->pmd; 1321 1322 down_read(&pmd->root_lock); 1323 *result = td->mapped_blocks; 1324 up_read(&pmd->root_lock); 1325 1326 return 0; 1327 } 1328 1329 static int __highest_block(struct dm_thin_device *td, dm_block_t *result) 1330 { 1331 int r; 1332 __le64 value_le; 1333 dm_block_t thin_root; 1334 struct dm_pool_metadata *pmd = td->pmd; 1335 1336 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le); 1337 if (r) 1338 return r; 1339 1340 thin_root = le64_to_cpu(value_le); 1341 1342 return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result); 1343 } 1344 1345 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, 1346 dm_block_t *result) 1347 { 1348 int r; 1349 struct dm_pool_metadata *pmd = td->pmd; 1350 1351 down_read(&pmd->root_lock); 1352 r = __highest_block(td, result); 1353 up_read(&pmd->root_lock); 1354 1355 return r; 1356 } 1357 1358 static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) 1359 { 1360 int r; 1361 dm_block_t old_count; 1362 1363 r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count); 1364 if (r) 1365 return r; 1366 1367 if (new_count == old_count) 1368 return 0; 1369 1370 if (new_count < old_count) { 1371 DMERR("cannot reduce size of data device"); 1372 return -EINVAL; 1373 } 1374 1375 r = dm_sm_extend(pmd->data_sm, new_count - old_count); 1376 if (!r) 1377 pmd->need_commit = 1; 1378 1379 return r; 1380 } 1381 1382 int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) 1383 { 1384 int r; 1385 1386 down_write(&pmd->root_lock); 1387 r = __resize_data_dev(pmd, new_count); 1388 up_write(&pmd->root_lock); 1389 1390 return r; 1391 } 1392