1 /* 2 * Copyright (C) 2011 Red Hat, Inc. 3 * 4 * This file is released under the GPL. 5 */ 6 7 #include "dm-thin-metadata.h" 8 #include "persistent-data/dm-btree.h" 9 #include "persistent-data/dm-space-map.h" 10 #include "persistent-data/dm-space-map-disk.h" 11 #include "persistent-data/dm-transaction-manager.h" 12 13 #include <linux/list.h> 14 #include <linux/device-mapper.h> 15 #include <linux/workqueue.h> 16 17 /*-------------------------------------------------------------------------- 18 * As far as the metadata goes, there is: 19 * 20 * - A superblock in block zero, taking up fewer than 512 bytes for 21 * atomic writes. 22 * 23 * - A space map managing the metadata blocks. 24 * 25 * - A space map managing the data blocks. 26 * 27 * - A btree mapping our internal thin dev ids onto struct disk_device_details. 28 * 29 * - A hierarchical btree, with 2 levels which effectively maps (thin 30 * dev id, virtual block) -> block_time. Block time is a 64-bit 31 * field holding the time in the low 24 bits, and block in the top 48 32 * bits. 33 * 34 * BTrees consist solely of btree_nodes, that fill a block. Some are 35 * internal nodes, as such their values are a __le64 pointing to other 36 * nodes. Leaf nodes can store data of any reasonable size (ie. much 37 * smaller than the block size). The nodes consist of the header, 38 * followed by an array of keys, followed by an array of values. We have 39 * to binary search on the keys so they're all held together to help the 40 * cpu cache. 41 * 42 * Space maps have 2 btrees: 43 * 44 * - One maps a uint64_t onto a struct index_entry. Which points to a 45 * bitmap block, and has some details about how many free entries there 46 * are etc. 47 * 48 * - The bitmap blocks have a header (for the checksum). Then the rest 49 * of the block is pairs of bits. With the meaning being: 50 * 51 * 0 - ref count is 0 52 * 1 - ref count is 1 53 * 2 - ref count is 2 54 * 3 - ref count is higher than 2 55 * 56 * - If the count is higher than 2 then the ref count is entered in a 57 * second btree that directly maps the block_address to a uint32_t ref 58 * count. 59 * 60 * The space map metadata variant doesn't have a bitmaps btree. Instead 61 * it has one single blocks worth of index_entries. This avoids 62 * recursive issues with the bitmap btree needing to allocate space in 63 * order to insert. With a small data block size such as 64k the 64 * metadata support data devices that are hundreds of terrabytes. 65 * 66 * The space maps allocate space linearly from front to back. Space that 67 * is freed in a transaction is never recycled within that transaction. 68 * To try and avoid fragmenting _free_ space the allocator always goes 69 * back and fills in gaps. 70 * 71 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks 72 * from the block manager. 73 *--------------------------------------------------------------------------*/ 74 75 #define DM_MSG_PREFIX "thin metadata" 76 77 #define THIN_SUPERBLOCK_MAGIC 27022010 78 #define THIN_SUPERBLOCK_LOCATION 0 79 #define THIN_VERSION 1 80 #define THIN_METADATA_CACHE_SIZE 64 81 #define SECTOR_TO_BLOCK_SHIFT 3 82 83 /* This should be plenty */ 84 #define SPACE_MAP_ROOT_SIZE 128 85 86 /* 87 * Little endian on-disk superblock and device details. 88 */ 89 struct thin_disk_superblock { 90 __le32 csum; /* Checksum of superblock except for this field. */ 91 __le32 flags; 92 __le64 blocknr; /* This block number, dm_block_t. */ 93 94 __u8 uuid[16]; 95 __le64 magic; 96 __le32 version; 97 __le32 time; 98 99 __le64 trans_id; 100 101 /* 102 * Root held by userspace transactions. 103 */ 104 __le64 held_root; 105 106 __u8 data_space_map_root[SPACE_MAP_ROOT_SIZE]; 107 __u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE]; 108 109 /* 110 * 2-level btree mapping (dev_id, (dev block, time)) -> data block 111 */ 112 __le64 data_mapping_root; 113 114 /* 115 * Device detail root mapping dev_id -> device_details 116 */ 117 __le64 device_details_root; 118 119 __le32 data_block_size; /* In 512-byte sectors. */ 120 121 __le32 metadata_block_size; /* In 512-byte sectors. */ 122 __le64 metadata_nr_blocks; 123 124 __le32 compat_flags; 125 __le32 compat_ro_flags; 126 __le32 incompat_flags; 127 } __packed; 128 129 struct disk_device_details { 130 __le64 mapped_blocks; 131 __le64 transaction_id; /* When created. */ 132 __le32 creation_time; 133 __le32 snapshotted_time; 134 } __packed; 135 136 struct dm_pool_metadata { 137 struct hlist_node hash; 138 139 struct block_device *bdev; 140 struct dm_block_manager *bm; 141 struct dm_space_map *metadata_sm; 142 struct dm_space_map *data_sm; 143 struct dm_transaction_manager *tm; 144 struct dm_transaction_manager *nb_tm; 145 146 /* 147 * Two-level btree. 148 * First level holds thin_dev_t. 149 * Second level holds mappings. 150 */ 151 struct dm_btree_info info; 152 153 /* 154 * Non-blocking version of the above. 155 */ 156 struct dm_btree_info nb_info; 157 158 /* 159 * Just the top level for deleting whole devices. 160 */ 161 struct dm_btree_info tl_info; 162 163 /* 164 * Just the bottom level for creating new devices. 165 */ 166 struct dm_btree_info bl_info; 167 168 /* 169 * Describes the device details btree. 170 */ 171 struct dm_btree_info details_info; 172 173 struct rw_semaphore root_lock; 174 uint32_t time; 175 int need_commit; 176 dm_block_t root; 177 dm_block_t details_root; 178 struct list_head thin_devices; 179 uint64_t trans_id; 180 unsigned long flags; 181 sector_t data_block_size; 182 }; 183 184 struct dm_thin_device { 185 struct list_head list; 186 struct dm_pool_metadata *pmd; 187 dm_thin_id id; 188 189 int open_count; 190 int changed; 191 uint64_t mapped_blocks; 192 uint64_t transaction_id; 193 uint32_t creation_time; 194 uint32_t snapshotted_time; 195 }; 196 197 /*---------------------------------------------------------------- 198 * superblock validator 199 *--------------------------------------------------------------*/ 200 201 #define SUPERBLOCK_CSUM_XOR 160774 202 203 static void sb_prepare_for_write(struct dm_block_validator *v, 204 struct dm_block *b, 205 size_t block_size) 206 { 207 struct thin_disk_superblock *disk_super = dm_block_data(b); 208 209 disk_super->blocknr = cpu_to_le64(dm_block_location(b)); 210 disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags, 211 block_size - sizeof(__le32), 212 SUPERBLOCK_CSUM_XOR)); 213 } 214 215 static int sb_check(struct dm_block_validator *v, 216 struct dm_block *b, 217 size_t block_size) 218 { 219 struct thin_disk_superblock *disk_super = dm_block_data(b); 220 __le32 csum_le; 221 222 if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) { 223 DMERR("sb_check failed: blocknr %llu: " 224 "wanted %llu", le64_to_cpu(disk_super->blocknr), 225 (unsigned long long)dm_block_location(b)); 226 return -ENOTBLK; 227 } 228 229 if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) { 230 DMERR("sb_check failed: magic %llu: " 231 "wanted %llu", le64_to_cpu(disk_super->magic), 232 (unsigned long long)THIN_SUPERBLOCK_MAGIC); 233 return -EILSEQ; 234 } 235 236 csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags, 237 block_size - sizeof(__le32), 238 SUPERBLOCK_CSUM_XOR)); 239 if (csum_le != disk_super->csum) { 240 DMERR("sb_check failed: csum %u: wanted %u", 241 le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum)); 242 return -EILSEQ; 243 } 244 245 return 0; 246 } 247 248 static struct dm_block_validator sb_validator = { 249 .name = "superblock", 250 .prepare_for_write = sb_prepare_for_write, 251 .check = sb_check 252 }; 253 254 /*---------------------------------------------------------------- 255 * Methods for the btree value types 256 *--------------------------------------------------------------*/ 257 258 static uint64_t pack_block_time(dm_block_t b, uint32_t t) 259 { 260 return (b << 24) | t; 261 } 262 263 static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t) 264 { 265 *b = v >> 24; 266 *t = v & ((1 << 24) - 1); 267 } 268 269 static void data_block_inc(void *context, void *value_le) 270 { 271 struct dm_space_map *sm = context; 272 __le64 v_le; 273 uint64_t b; 274 uint32_t t; 275 276 memcpy(&v_le, value_le, sizeof(v_le)); 277 unpack_block_time(le64_to_cpu(v_le), &b, &t); 278 dm_sm_inc_block(sm, b); 279 } 280 281 static void data_block_dec(void *context, void *value_le) 282 { 283 struct dm_space_map *sm = context; 284 __le64 v_le; 285 uint64_t b; 286 uint32_t t; 287 288 memcpy(&v_le, value_le, sizeof(v_le)); 289 unpack_block_time(le64_to_cpu(v_le), &b, &t); 290 dm_sm_dec_block(sm, b); 291 } 292 293 static int data_block_equal(void *context, void *value1_le, void *value2_le) 294 { 295 __le64 v1_le, v2_le; 296 uint64_t b1, b2; 297 uint32_t t; 298 299 memcpy(&v1_le, value1_le, sizeof(v1_le)); 300 memcpy(&v2_le, value2_le, sizeof(v2_le)); 301 unpack_block_time(le64_to_cpu(v1_le), &b1, &t); 302 unpack_block_time(le64_to_cpu(v2_le), &b2, &t); 303 304 return b1 == b2; 305 } 306 307 static void subtree_inc(void *context, void *value) 308 { 309 struct dm_btree_info *info = context; 310 __le64 root_le; 311 uint64_t root; 312 313 memcpy(&root_le, value, sizeof(root_le)); 314 root = le64_to_cpu(root_le); 315 dm_tm_inc(info->tm, root); 316 } 317 318 static void subtree_dec(void *context, void *value) 319 { 320 struct dm_btree_info *info = context; 321 __le64 root_le; 322 uint64_t root; 323 324 memcpy(&root_le, value, sizeof(root_le)); 325 root = le64_to_cpu(root_le); 326 if (dm_btree_del(info, root)) 327 DMERR("btree delete failed\n"); 328 } 329 330 static int subtree_equal(void *context, void *value1_le, void *value2_le) 331 { 332 __le64 v1_le, v2_le; 333 memcpy(&v1_le, value1_le, sizeof(v1_le)); 334 memcpy(&v2_le, value2_le, sizeof(v2_le)); 335 336 return v1_le == v2_le; 337 } 338 339 /*----------------------------------------------------------------*/ 340 341 static int superblock_all_zeroes(struct dm_block_manager *bm, int *result) 342 { 343 int r; 344 unsigned i; 345 struct dm_block *b; 346 __le64 *data_le, zero = cpu_to_le64(0); 347 unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64); 348 349 /* 350 * We can't use a validator here - it may be all zeroes. 351 */ 352 r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b); 353 if (r) 354 return r; 355 356 data_le = dm_block_data(b); 357 *result = 1; 358 for (i = 0; i < block_size; i++) { 359 if (data_le[i] != zero) { 360 *result = 0; 361 break; 362 } 363 } 364 365 return dm_bm_unlock(b); 366 } 367 368 static int init_pmd(struct dm_pool_metadata *pmd, 369 struct dm_block_manager *bm, 370 dm_block_t nr_blocks, int create) 371 { 372 int r; 373 struct dm_space_map *sm, *data_sm; 374 struct dm_transaction_manager *tm; 375 struct dm_block *sblock; 376 377 if (create) { 378 r = dm_tm_create_with_sm(bm, THIN_SUPERBLOCK_LOCATION, 379 &sb_validator, &tm, &sm, &sblock); 380 if (r < 0) { 381 DMERR("tm_create_with_sm failed"); 382 return r; 383 } 384 385 data_sm = dm_sm_disk_create(tm, nr_blocks); 386 if (IS_ERR(data_sm)) { 387 DMERR("sm_disk_create failed"); 388 dm_tm_unlock(tm, sblock); 389 r = PTR_ERR(data_sm); 390 goto bad; 391 } 392 } else { 393 struct thin_disk_superblock *disk_super = NULL; 394 size_t space_map_root_offset = 395 offsetof(struct thin_disk_superblock, metadata_space_map_root); 396 397 r = dm_tm_open_with_sm(bm, THIN_SUPERBLOCK_LOCATION, 398 &sb_validator, space_map_root_offset, 399 SPACE_MAP_ROOT_SIZE, &tm, &sm, &sblock); 400 if (r < 0) { 401 DMERR("tm_open_with_sm failed"); 402 return r; 403 } 404 405 disk_super = dm_block_data(sblock); 406 data_sm = dm_sm_disk_open(tm, disk_super->data_space_map_root, 407 sizeof(disk_super->data_space_map_root)); 408 if (IS_ERR(data_sm)) { 409 DMERR("sm_disk_open failed"); 410 r = PTR_ERR(data_sm); 411 goto bad; 412 } 413 } 414 415 416 r = dm_tm_unlock(tm, sblock); 417 if (r < 0) { 418 DMERR("couldn't unlock superblock"); 419 goto bad_data_sm; 420 } 421 422 pmd->bm = bm; 423 pmd->metadata_sm = sm; 424 pmd->data_sm = data_sm; 425 pmd->tm = tm; 426 pmd->nb_tm = dm_tm_create_non_blocking_clone(tm); 427 if (!pmd->nb_tm) { 428 DMERR("could not create clone tm"); 429 r = -ENOMEM; 430 goto bad_data_sm; 431 } 432 433 pmd->info.tm = tm; 434 pmd->info.levels = 2; 435 pmd->info.value_type.context = pmd->data_sm; 436 pmd->info.value_type.size = sizeof(__le64); 437 pmd->info.value_type.inc = data_block_inc; 438 pmd->info.value_type.dec = data_block_dec; 439 pmd->info.value_type.equal = data_block_equal; 440 441 memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info)); 442 pmd->nb_info.tm = pmd->nb_tm; 443 444 pmd->tl_info.tm = tm; 445 pmd->tl_info.levels = 1; 446 pmd->tl_info.value_type.context = &pmd->info; 447 pmd->tl_info.value_type.size = sizeof(__le64); 448 pmd->tl_info.value_type.inc = subtree_inc; 449 pmd->tl_info.value_type.dec = subtree_dec; 450 pmd->tl_info.value_type.equal = subtree_equal; 451 452 pmd->bl_info.tm = tm; 453 pmd->bl_info.levels = 1; 454 pmd->bl_info.value_type.context = pmd->data_sm; 455 pmd->bl_info.value_type.size = sizeof(__le64); 456 pmd->bl_info.value_type.inc = data_block_inc; 457 pmd->bl_info.value_type.dec = data_block_dec; 458 pmd->bl_info.value_type.equal = data_block_equal; 459 460 pmd->details_info.tm = tm; 461 pmd->details_info.levels = 1; 462 pmd->details_info.value_type.context = NULL; 463 pmd->details_info.value_type.size = sizeof(struct disk_device_details); 464 pmd->details_info.value_type.inc = NULL; 465 pmd->details_info.value_type.dec = NULL; 466 pmd->details_info.value_type.equal = NULL; 467 468 pmd->root = 0; 469 470 init_rwsem(&pmd->root_lock); 471 pmd->time = 0; 472 pmd->need_commit = 0; 473 pmd->details_root = 0; 474 pmd->trans_id = 0; 475 pmd->flags = 0; 476 INIT_LIST_HEAD(&pmd->thin_devices); 477 478 return 0; 479 480 bad_data_sm: 481 dm_sm_destroy(data_sm); 482 bad: 483 dm_tm_destroy(tm); 484 dm_sm_destroy(sm); 485 486 return r; 487 } 488 489 static int __begin_transaction(struct dm_pool_metadata *pmd) 490 { 491 int r; 492 u32 features; 493 struct thin_disk_superblock *disk_super; 494 struct dm_block *sblock; 495 496 /* 497 * __maybe_commit_transaction() resets these 498 */ 499 WARN_ON(pmd->need_commit); 500 501 /* 502 * We re-read the superblock every time. Shouldn't need to do this 503 * really. 504 */ 505 r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 506 &sb_validator, &sblock); 507 if (r) 508 return r; 509 510 disk_super = dm_block_data(sblock); 511 pmd->time = le32_to_cpu(disk_super->time); 512 pmd->root = le64_to_cpu(disk_super->data_mapping_root); 513 pmd->details_root = le64_to_cpu(disk_super->device_details_root); 514 pmd->trans_id = le64_to_cpu(disk_super->trans_id); 515 pmd->flags = le32_to_cpu(disk_super->flags); 516 pmd->data_block_size = le32_to_cpu(disk_super->data_block_size); 517 518 features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP; 519 if (features) { 520 DMERR("could not access metadata due to " 521 "unsupported optional features (%lx).", 522 (unsigned long)features); 523 r = -EINVAL; 524 goto out; 525 } 526 527 /* 528 * Check for read-only metadata to skip the following RDWR checks. 529 */ 530 if (get_disk_ro(pmd->bdev->bd_disk)) 531 goto out; 532 533 features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP; 534 if (features) { 535 DMERR("could not access metadata RDWR due to " 536 "unsupported optional features (%lx).", 537 (unsigned long)features); 538 r = -EINVAL; 539 } 540 541 out: 542 dm_bm_unlock(sblock); 543 return r; 544 } 545 546 static int __write_changed_details(struct dm_pool_metadata *pmd) 547 { 548 int r; 549 struct dm_thin_device *td, *tmp; 550 struct disk_device_details details; 551 uint64_t key; 552 553 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { 554 if (!td->changed) 555 continue; 556 557 key = td->id; 558 559 details.mapped_blocks = cpu_to_le64(td->mapped_blocks); 560 details.transaction_id = cpu_to_le64(td->transaction_id); 561 details.creation_time = cpu_to_le32(td->creation_time); 562 details.snapshotted_time = cpu_to_le32(td->snapshotted_time); 563 __dm_bless_for_disk(&details); 564 565 r = dm_btree_insert(&pmd->details_info, pmd->details_root, 566 &key, &details, &pmd->details_root); 567 if (r) 568 return r; 569 570 if (td->open_count) 571 td->changed = 0; 572 else { 573 list_del(&td->list); 574 kfree(td); 575 } 576 577 pmd->need_commit = 1; 578 } 579 580 return 0; 581 } 582 583 static int __commit_transaction(struct dm_pool_metadata *pmd) 584 { 585 /* 586 * FIXME: Associated pool should be made read-only on failure. 587 */ 588 int r; 589 size_t metadata_len, data_len; 590 struct thin_disk_superblock *disk_super; 591 struct dm_block *sblock; 592 593 /* 594 * We need to know if the thin_disk_superblock exceeds a 512-byte sector. 595 */ 596 BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512); 597 598 r = __write_changed_details(pmd); 599 if (r < 0) 600 goto out; 601 602 if (!pmd->need_commit) 603 goto out; 604 605 r = dm_sm_commit(pmd->data_sm); 606 if (r < 0) 607 goto out; 608 609 r = dm_tm_pre_commit(pmd->tm); 610 if (r < 0) 611 goto out; 612 613 r = dm_sm_root_size(pmd->metadata_sm, &metadata_len); 614 if (r < 0) 615 goto out; 616 617 r = dm_sm_root_size(pmd->metadata_sm, &data_len); 618 if (r < 0) 619 goto out; 620 621 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 622 &sb_validator, &sblock); 623 if (r) 624 goto out; 625 626 disk_super = dm_block_data(sblock); 627 disk_super->time = cpu_to_le32(pmd->time); 628 disk_super->data_mapping_root = cpu_to_le64(pmd->root); 629 disk_super->device_details_root = cpu_to_le64(pmd->details_root); 630 disk_super->trans_id = cpu_to_le64(pmd->trans_id); 631 disk_super->flags = cpu_to_le32(pmd->flags); 632 633 r = dm_sm_copy_root(pmd->metadata_sm, &disk_super->metadata_space_map_root, 634 metadata_len); 635 if (r < 0) 636 goto out_locked; 637 638 r = dm_sm_copy_root(pmd->data_sm, &disk_super->data_space_map_root, 639 data_len); 640 if (r < 0) 641 goto out_locked; 642 643 r = dm_tm_commit(pmd->tm, sblock); 644 if (!r) 645 pmd->need_commit = 0; 646 647 out: 648 return r; 649 650 out_locked: 651 dm_bm_unlock(sblock); 652 return r; 653 } 654 655 struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev, 656 sector_t data_block_size) 657 { 658 int r; 659 struct thin_disk_superblock *disk_super; 660 struct dm_pool_metadata *pmd; 661 sector_t bdev_size = i_size_read(bdev->bd_inode) >> SECTOR_SHIFT; 662 struct dm_block_manager *bm; 663 int create; 664 struct dm_block *sblock; 665 666 pmd = kmalloc(sizeof(*pmd), GFP_KERNEL); 667 if (!pmd) { 668 DMERR("could not allocate metadata struct"); 669 return ERR_PTR(-ENOMEM); 670 } 671 672 /* 673 * Max hex locks: 674 * 3 for btree insert + 675 * 2 for btree lookup used within space map 676 */ 677 bm = dm_block_manager_create(bdev, THIN_METADATA_BLOCK_SIZE, 678 THIN_METADATA_CACHE_SIZE, 5); 679 if (!bm) { 680 DMERR("could not create block manager"); 681 kfree(pmd); 682 return ERR_PTR(-ENOMEM); 683 } 684 685 r = superblock_all_zeroes(bm, &create); 686 if (r) { 687 dm_block_manager_destroy(bm); 688 kfree(pmd); 689 return ERR_PTR(r); 690 } 691 692 693 r = init_pmd(pmd, bm, 0, create); 694 if (r) { 695 dm_block_manager_destroy(bm); 696 kfree(pmd); 697 return ERR_PTR(r); 698 } 699 pmd->bdev = bdev; 700 701 if (!create) { 702 r = __begin_transaction(pmd); 703 if (r < 0) 704 goto bad; 705 return pmd; 706 } 707 708 /* 709 * Create. 710 */ 711 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 712 &sb_validator, &sblock); 713 if (r) 714 goto bad; 715 716 disk_super = dm_block_data(sblock); 717 disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC); 718 disk_super->version = cpu_to_le32(THIN_VERSION); 719 disk_super->time = 0; 720 disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE >> SECTOR_SHIFT); 721 disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT); 722 disk_super->data_block_size = cpu_to_le32(data_block_size); 723 724 r = dm_bm_unlock(sblock); 725 if (r < 0) 726 goto bad; 727 728 r = dm_btree_empty(&pmd->info, &pmd->root); 729 if (r < 0) 730 goto bad; 731 732 r = dm_btree_empty(&pmd->details_info, &pmd->details_root); 733 if (r < 0) { 734 DMERR("couldn't create devices root"); 735 goto bad; 736 } 737 738 pmd->flags = 0; 739 pmd->need_commit = 1; 740 r = dm_pool_commit_metadata(pmd); 741 if (r < 0) { 742 DMERR("%s: dm_pool_commit_metadata() failed, error = %d", 743 __func__, r); 744 goto bad; 745 } 746 747 return pmd; 748 749 bad: 750 if (dm_pool_metadata_close(pmd) < 0) 751 DMWARN("%s: dm_pool_metadata_close() failed.", __func__); 752 return ERR_PTR(r); 753 } 754 755 int dm_pool_metadata_close(struct dm_pool_metadata *pmd) 756 { 757 int r; 758 unsigned open_devices = 0; 759 struct dm_thin_device *td, *tmp; 760 761 down_read(&pmd->root_lock); 762 list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) { 763 if (td->open_count) 764 open_devices++; 765 else { 766 list_del(&td->list); 767 kfree(td); 768 } 769 } 770 up_read(&pmd->root_lock); 771 772 if (open_devices) { 773 DMERR("attempt to close pmd when %u device(s) are still open", 774 open_devices); 775 return -EBUSY; 776 } 777 778 r = __commit_transaction(pmd); 779 if (r < 0) 780 DMWARN("%s: __commit_transaction() failed, error = %d", 781 __func__, r); 782 783 dm_tm_destroy(pmd->tm); 784 dm_tm_destroy(pmd->nb_tm); 785 dm_block_manager_destroy(pmd->bm); 786 dm_sm_destroy(pmd->metadata_sm); 787 dm_sm_destroy(pmd->data_sm); 788 kfree(pmd); 789 790 return 0; 791 } 792 793 /* 794 * __open_device: Returns @td corresponding to device with id @dev, 795 * creating it if @create is set and incrementing @td->open_count. 796 * On failure, @td is undefined. 797 */ 798 static int __open_device(struct dm_pool_metadata *pmd, 799 dm_thin_id dev, int create, 800 struct dm_thin_device **td) 801 { 802 int r, changed = 0; 803 struct dm_thin_device *td2; 804 uint64_t key = dev; 805 struct disk_device_details details_le; 806 807 /* 808 * If the device is already open, return it. 809 */ 810 list_for_each_entry(td2, &pmd->thin_devices, list) 811 if (td2->id == dev) { 812 /* 813 * May not create an already-open device. 814 */ 815 if (create) 816 return -EEXIST; 817 818 td2->open_count++; 819 *td = td2; 820 return 0; 821 } 822 823 /* 824 * Check the device exists. 825 */ 826 r = dm_btree_lookup(&pmd->details_info, pmd->details_root, 827 &key, &details_le); 828 if (r) { 829 if (r != -ENODATA || !create) 830 return r; 831 832 /* 833 * Create new device. 834 */ 835 changed = 1; 836 details_le.mapped_blocks = 0; 837 details_le.transaction_id = cpu_to_le64(pmd->trans_id); 838 details_le.creation_time = cpu_to_le32(pmd->time); 839 details_le.snapshotted_time = cpu_to_le32(pmd->time); 840 } 841 842 *td = kmalloc(sizeof(**td), GFP_NOIO); 843 if (!*td) 844 return -ENOMEM; 845 846 (*td)->pmd = pmd; 847 (*td)->id = dev; 848 (*td)->open_count = 1; 849 (*td)->changed = changed; 850 (*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks); 851 (*td)->transaction_id = le64_to_cpu(details_le.transaction_id); 852 (*td)->creation_time = le32_to_cpu(details_le.creation_time); 853 (*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time); 854 855 list_add(&(*td)->list, &pmd->thin_devices); 856 857 return 0; 858 } 859 860 static void __close_device(struct dm_thin_device *td) 861 { 862 --td->open_count; 863 } 864 865 static int __create_thin(struct dm_pool_metadata *pmd, 866 dm_thin_id dev) 867 { 868 int r; 869 dm_block_t dev_root; 870 uint64_t key = dev; 871 struct disk_device_details details_le; 872 struct dm_thin_device *td; 873 __le64 value; 874 875 r = dm_btree_lookup(&pmd->details_info, pmd->details_root, 876 &key, &details_le); 877 if (!r) 878 return -EEXIST; 879 880 /* 881 * Create an empty btree for the mappings. 882 */ 883 r = dm_btree_empty(&pmd->bl_info, &dev_root); 884 if (r) 885 return r; 886 887 /* 888 * Insert it into the main mapping tree. 889 */ 890 value = cpu_to_le64(dev_root); 891 __dm_bless_for_disk(&value); 892 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); 893 if (r) { 894 dm_btree_del(&pmd->bl_info, dev_root); 895 return r; 896 } 897 898 r = __open_device(pmd, dev, 1, &td); 899 if (r) { 900 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 901 dm_btree_del(&pmd->bl_info, dev_root); 902 return r; 903 } 904 __close_device(td); 905 906 return r; 907 } 908 909 int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev) 910 { 911 int r; 912 913 down_write(&pmd->root_lock); 914 r = __create_thin(pmd, dev); 915 up_write(&pmd->root_lock); 916 917 return r; 918 } 919 920 static int __set_snapshot_details(struct dm_pool_metadata *pmd, 921 struct dm_thin_device *snap, 922 dm_thin_id origin, uint32_t time) 923 { 924 int r; 925 struct dm_thin_device *td; 926 927 r = __open_device(pmd, origin, 0, &td); 928 if (r) 929 return r; 930 931 td->changed = 1; 932 td->snapshotted_time = time; 933 934 snap->mapped_blocks = td->mapped_blocks; 935 snap->snapshotted_time = time; 936 __close_device(td); 937 938 return 0; 939 } 940 941 static int __create_snap(struct dm_pool_metadata *pmd, 942 dm_thin_id dev, dm_thin_id origin) 943 { 944 int r; 945 dm_block_t origin_root; 946 uint64_t key = origin, dev_key = dev; 947 struct dm_thin_device *td; 948 struct disk_device_details details_le; 949 __le64 value; 950 951 /* check this device is unused */ 952 r = dm_btree_lookup(&pmd->details_info, pmd->details_root, 953 &dev_key, &details_le); 954 if (!r) 955 return -EEXIST; 956 957 /* find the mapping tree for the origin */ 958 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value); 959 if (r) 960 return r; 961 origin_root = le64_to_cpu(value); 962 963 /* clone the origin, an inc will do */ 964 dm_tm_inc(pmd->tm, origin_root); 965 966 /* insert into the main mapping tree */ 967 value = cpu_to_le64(origin_root); 968 __dm_bless_for_disk(&value); 969 key = dev; 970 r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root); 971 if (r) { 972 dm_tm_dec(pmd->tm, origin_root); 973 return r; 974 } 975 976 pmd->time++; 977 978 r = __open_device(pmd, dev, 1, &td); 979 if (r) 980 goto bad; 981 982 r = __set_snapshot_details(pmd, td, origin, pmd->time); 983 __close_device(td); 984 985 if (r) 986 goto bad; 987 988 return 0; 989 990 bad: 991 dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 992 dm_btree_remove(&pmd->details_info, pmd->details_root, 993 &key, &pmd->details_root); 994 return r; 995 } 996 997 int dm_pool_create_snap(struct dm_pool_metadata *pmd, 998 dm_thin_id dev, 999 dm_thin_id origin) 1000 { 1001 int r; 1002 1003 down_write(&pmd->root_lock); 1004 r = __create_snap(pmd, dev, origin); 1005 up_write(&pmd->root_lock); 1006 1007 return r; 1008 } 1009 1010 static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev) 1011 { 1012 int r; 1013 uint64_t key = dev; 1014 struct dm_thin_device *td; 1015 1016 /* TODO: failure should mark the transaction invalid */ 1017 r = __open_device(pmd, dev, 0, &td); 1018 if (r) 1019 return r; 1020 1021 if (td->open_count > 1) { 1022 __close_device(td); 1023 return -EBUSY; 1024 } 1025 1026 list_del(&td->list); 1027 kfree(td); 1028 r = dm_btree_remove(&pmd->details_info, pmd->details_root, 1029 &key, &pmd->details_root); 1030 if (r) 1031 return r; 1032 1033 r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root); 1034 if (r) 1035 return r; 1036 1037 pmd->need_commit = 1; 1038 1039 return 0; 1040 } 1041 1042 int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd, 1043 dm_thin_id dev) 1044 { 1045 int r; 1046 1047 down_write(&pmd->root_lock); 1048 r = __delete_device(pmd, dev); 1049 up_write(&pmd->root_lock); 1050 1051 return r; 1052 } 1053 1054 int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd, 1055 uint64_t current_id, 1056 uint64_t new_id) 1057 { 1058 down_write(&pmd->root_lock); 1059 if (pmd->trans_id != current_id) { 1060 up_write(&pmd->root_lock); 1061 DMERR("mismatched transaction id"); 1062 return -EINVAL; 1063 } 1064 1065 pmd->trans_id = new_id; 1066 pmd->need_commit = 1; 1067 up_write(&pmd->root_lock); 1068 1069 return 0; 1070 } 1071 1072 int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd, 1073 uint64_t *result) 1074 { 1075 down_read(&pmd->root_lock); 1076 *result = pmd->trans_id; 1077 up_read(&pmd->root_lock); 1078 1079 return 0; 1080 } 1081 1082 static int __get_held_metadata_root(struct dm_pool_metadata *pmd, 1083 dm_block_t *result) 1084 { 1085 int r; 1086 struct thin_disk_superblock *disk_super; 1087 struct dm_block *sblock; 1088 1089 r = dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION, 1090 &sb_validator, &sblock); 1091 if (r) 1092 return r; 1093 1094 disk_super = dm_block_data(sblock); 1095 *result = le64_to_cpu(disk_super->held_root); 1096 1097 return dm_bm_unlock(sblock); 1098 } 1099 1100 int dm_pool_get_held_metadata_root(struct dm_pool_metadata *pmd, 1101 dm_block_t *result) 1102 { 1103 int r; 1104 1105 down_read(&pmd->root_lock); 1106 r = __get_held_metadata_root(pmd, result); 1107 up_read(&pmd->root_lock); 1108 1109 return r; 1110 } 1111 1112 int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev, 1113 struct dm_thin_device **td) 1114 { 1115 int r; 1116 1117 down_write(&pmd->root_lock); 1118 r = __open_device(pmd, dev, 0, td); 1119 up_write(&pmd->root_lock); 1120 1121 return r; 1122 } 1123 1124 int dm_pool_close_thin_device(struct dm_thin_device *td) 1125 { 1126 down_write(&td->pmd->root_lock); 1127 __close_device(td); 1128 up_write(&td->pmd->root_lock); 1129 1130 return 0; 1131 } 1132 1133 dm_thin_id dm_thin_dev_id(struct dm_thin_device *td) 1134 { 1135 return td->id; 1136 } 1137 1138 static int __snapshotted_since(struct dm_thin_device *td, uint32_t time) 1139 { 1140 return td->snapshotted_time > time; 1141 } 1142 1143 int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block, 1144 int can_block, struct dm_thin_lookup_result *result) 1145 { 1146 int r; 1147 uint64_t block_time = 0; 1148 __le64 value; 1149 struct dm_pool_metadata *pmd = td->pmd; 1150 dm_block_t keys[2] = { td->id, block }; 1151 1152 if (can_block) { 1153 down_read(&pmd->root_lock); 1154 r = dm_btree_lookup(&pmd->info, pmd->root, keys, &value); 1155 if (!r) 1156 block_time = le64_to_cpu(value); 1157 up_read(&pmd->root_lock); 1158 1159 } else if (down_read_trylock(&pmd->root_lock)) { 1160 r = dm_btree_lookup(&pmd->nb_info, pmd->root, keys, &value); 1161 if (!r) 1162 block_time = le64_to_cpu(value); 1163 up_read(&pmd->root_lock); 1164 1165 } else 1166 return -EWOULDBLOCK; 1167 1168 if (!r) { 1169 dm_block_t exception_block; 1170 uint32_t exception_time; 1171 unpack_block_time(block_time, &exception_block, 1172 &exception_time); 1173 result->block = exception_block; 1174 result->shared = __snapshotted_since(td, exception_time); 1175 } 1176 1177 return r; 1178 } 1179 1180 static int __insert(struct dm_thin_device *td, dm_block_t block, 1181 dm_block_t data_block) 1182 { 1183 int r, inserted; 1184 __le64 value; 1185 struct dm_pool_metadata *pmd = td->pmd; 1186 dm_block_t keys[2] = { td->id, block }; 1187 1188 pmd->need_commit = 1; 1189 value = cpu_to_le64(pack_block_time(data_block, pmd->time)); 1190 __dm_bless_for_disk(&value); 1191 1192 r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value, 1193 &pmd->root, &inserted); 1194 if (r) 1195 return r; 1196 1197 if (inserted) { 1198 td->mapped_blocks++; 1199 td->changed = 1; 1200 } 1201 1202 return 0; 1203 } 1204 1205 int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block, 1206 dm_block_t data_block) 1207 { 1208 int r; 1209 1210 down_write(&td->pmd->root_lock); 1211 r = __insert(td, block, data_block); 1212 up_write(&td->pmd->root_lock); 1213 1214 return r; 1215 } 1216 1217 static int __remove(struct dm_thin_device *td, dm_block_t block) 1218 { 1219 int r; 1220 struct dm_pool_metadata *pmd = td->pmd; 1221 dm_block_t keys[2] = { td->id, block }; 1222 1223 r = dm_btree_remove(&pmd->info, pmd->root, keys, &pmd->root); 1224 if (r) 1225 return r; 1226 1227 td->mapped_blocks--; 1228 td->changed = 1; 1229 pmd->need_commit = 1; 1230 1231 return 0; 1232 } 1233 1234 int dm_thin_remove_block(struct dm_thin_device *td, dm_block_t block) 1235 { 1236 int r; 1237 1238 down_write(&td->pmd->root_lock); 1239 r = __remove(td, block); 1240 up_write(&td->pmd->root_lock); 1241 1242 return r; 1243 } 1244 1245 int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result) 1246 { 1247 int r; 1248 1249 down_write(&pmd->root_lock); 1250 1251 r = dm_sm_new_block(pmd->data_sm, result); 1252 pmd->need_commit = 1; 1253 1254 up_write(&pmd->root_lock); 1255 1256 return r; 1257 } 1258 1259 int dm_pool_commit_metadata(struct dm_pool_metadata *pmd) 1260 { 1261 int r; 1262 1263 down_write(&pmd->root_lock); 1264 1265 r = __commit_transaction(pmd); 1266 if (r <= 0) 1267 goto out; 1268 1269 /* 1270 * Open the next transaction. 1271 */ 1272 r = __begin_transaction(pmd); 1273 out: 1274 up_write(&pmd->root_lock); 1275 return r; 1276 } 1277 1278 int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result) 1279 { 1280 int r; 1281 1282 down_read(&pmd->root_lock); 1283 r = dm_sm_get_nr_free(pmd->data_sm, result); 1284 up_read(&pmd->root_lock); 1285 1286 return r; 1287 } 1288 1289 int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd, 1290 dm_block_t *result) 1291 { 1292 int r; 1293 1294 down_read(&pmd->root_lock); 1295 r = dm_sm_get_nr_free(pmd->metadata_sm, result); 1296 up_read(&pmd->root_lock); 1297 1298 return r; 1299 } 1300 1301 int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd, 1302 dm_block_t *result) 1303 { 1304 int r; 1305 1306 down_read(&pmd->root_lock); 1307 r = dm_sm_get_nr_blocks(pmd->metadata_sm, result); 1308 up_read(&pmd->root_lock); 1309 1310 return r; 1311 } 1312 1313 int dm_pool_get_data_block_size(struct dm_pool_metadata *pmd, sector_t *result) 1314 { 1315 down_read(&pmd->root_lock); 1316 *result = pmd->data_block_size; 1317 up_read(&pmd->root_lock); 1318 1319 return 0; 1320 } 1321 1322 int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result) 1323 { 1324 int r; 1325 1326 down_read(&pmd->root_lock); 1327 r = dm_sm_get_nr_blocks(pmd->data_sm, result); 1328 up_read(&pmd->root_lock); 1329 1330 return r; 1331 } 1332 1333 int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result) 1334 { 1335 struct dm_pool_metadata *pmd = td->pmd; 1336 1337 down_read(&pmd->root_lock); 1338 *result = td->mapped_blocks; 1339 up_read(&pmd->root_lock); 1340 1341 return 0; 1342 } 1343 1344 static int __highest_block(struct dm_thin_device *td, dm_block_t *result) 1345 { 1346 int r; 1347 __le64 value_le; 1348 dm_block_t thin_root; 1349 struct dm_pool_metadata *pmd = td->pmd; 1350 1351 r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le); 1352 if (r) 1353 return r; 1354 1355 thin_root = le64_to_cpu(value_le); 1356 1357 return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result); 1358 } 1359 1360 int dm_thin_get_highest_mapped_block(struct dm_thin_device *td, 1361 dm_block_t *result) 1362 { 1363 int r; 1364 struct dm_pool_metadata *pmd = td->pmd; 1365 1366 down_read(&pmd->root_lock); 1367 r = __highest_block(td, result); 1368 up_read(&pmd->root_lock); 1369 1370 return r; 1371 } 1372 1373 static int __resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) 1374 { 1375 int r; 1376 dm_block_t old_count; 1377 1378 r = dm_sm_get_nr_blocks(pmd->data_sm, &old_count); 1379 if (r) 1380 return r; 1381 1382 if (new_count == old_count) 1383 return 0; 1384 1385 if (new_count < old_count) { 1386 DMERR("cannot reduce size of data device"); 1387 return -EINVAL; 1388 } 1389 1390 r = dm_sm_extend(pmd->data_sm, new_count - old_count); 1391 if (!r) 1392 pmd->need_commit = 1; 1393 1394 return r; 1395 } 1396 1397 int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count) 1398 { 1399 int r; 1400 1401 down_write(&pmd->root_lock); 1402 r = __resize_data_dev(pmd, new_count); 1403 up_write(&pmd->root_lock); 1404 1405 return r; 1406 } 1407