// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011-2012 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-thin-metadata.h"
#include "persistent-data/dm-btree.h"
#include "persistent-data/dm-space-map.h"
#include "persistent-data/dm-space-map-disk.h"
#include "persistent-data/dm-transaction-manager.h"

#include <linux/list.h>
#include <linux/device-mapper.h>
#include <linux/workqueue.h>

/*--------------------------------------------------------------------------
 * As far as the metadata goes, there is:
 *
 * - A superblock in block zero, taking up fewer than 512 bytes for
 *   atomic writes.
 *
 * - A space map managing the metadata blocks.
 *
 * - A space map managing the data blocks.
 *
 * - A btree mapping our internal thin dev ids onto struct disk_device_details.
 *
 * - A hierarchical btree, with 2 levels, which effectively maps (thin
 *   dev id, virtual block) -> block_time.  Block time is a 64-bit
 *   field holding the time in the low 24 bits, and the block in the top
 *   40 bits.
 *
 * Btrees consist solely of btree_nodes, each of which fills a block.  Some
 * are internal nodes, so their values are __le64s pointing to other
 * nodes.  Leaf nodes can store data of any reasonable size (i.e. much
 * smaller than the block size).  The nodes consist of the header,
 * followed by an array of keys, followed by an array of values.  We have
 * to binary search on the keys, so they're all held together to help the
 * cpu cache.
 *
 * Space maps have 2 btrees:
 *
 * - One maps a uint64_t onto a struct index_entry, which points to a
 *   bitmap block and has some details about how many free entries there
 *   are etc.
 *
 * - The bitmap blocks have a header (for the checksum).  Then the rest
 *   of the block is pairs of bits, with the meaning being:
 *
 *   0 - ref count is 0
 *   1 - ref count is 1
 *   2 - ref count is 2
 *   3 - ref count is higher than 2
 *
 * - If the count is higher than 2 then the ref count is entered in a
 *   second btree that directly maps the block_address to a uint32_t ref
 *   count.
 *
 * The space map metadata variant doesn't have a bitmaps btree.  Instead
 * it has a single block's worth of index_entries.  This avoids
 * recursive issues with the bitmap btree needing to allocate space in
 * order to insert.  With a small data block size such as 64k, the
 * metadata can support data devices that are hundreds of terabytes in
 * size.
 *
 * The space maps allocate space linearly from front to back.  Space that
 * is freed in a transaction is never recycled within that transaction.
 * To try and avoid fragmenting _free_ space the allocator always goes
 * back and fills in gaps.
 *
 * All metadata io is in THIN_METADATA_BLOCK_SIZE sized/aligned chunks
 * from the block manager.
 *--------------------------------------------------------------------------*/
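
/*
 * For example, a lookup of (thin dev id, virtual block) walks both
 * levels of the mapping btree at once and yields a packed block_time:
 *
 *	dm_block_t keys[2] = { dev_id, virtual_block };
 *	dm_btree_lookup(&pmd->info, pmd->root, keys, &value_le);
 *	unpack_block_time(le64_to_cpu(value_le), &data_block, &time);
 *
 * (This is exactly what __find_block() below does.)
 */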

#define DM_MSG_PREFIX   "thin metadata"

#define THIN_SUPERBLOCK_MAGIC 27022010
#define THIN_SUPERBLOCK_LOCATION 0
#define THIN_VERSION 2
#define SECTOR_TO_BLOCK_SHIFT 3

/*
 * For btree insert:
 *  3 for btree insert +
 *  2 for btree lookup used within space map
 * For btree remove:
 *  2 for shadow spine +
 *  4 for rebalance 3 child node
 */
#define THIN_MAX_CONCURRENT_LOCKS 6

/* This should be plenty */
#define SPACE_MAP_ROOT_SIZE 128

/*
 * Little endian on-disk superblock and device details.
 */
struct thin_disk_superblock {
	__le32 csum;	/* Checksum of superblock except for this field. */
	__le32 flags;
	__le64 blocknr;	/* This block number, dm_block_t. */

	__u8 uuid[16];
	__le64 magic;
	__le32 version;
	__le32 time;

	__le64 trans_id;

	/*
	 * Root held by userspace transactions.
	 */
	__le64 held_root;

	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];

	/*
	 * 2-level btree mapping (dev_id, (dev block, time)) -> data block
	 */
	__le64 data_mapping_root;

	/*
	 * Device detail root mapping dev_id -> device_details
	 */
	__le64 device_details_root;

	__le32 data_block_size;		/* In 512-byte sectors. */

	__le32 metadata_block_size;	/* In 512-byte sectors. */
	__le64 metadata_nr_blocks;

	__le32 compat_flags;
	__le32 compat_ro_flags;
	__le32 incompat_flags;
} __packed;

struct disk_device_details {
	__le64 mapped_blocks;
	__le64 transaction_id;		/* When created. */
	__le32 creation_time;
	__le32 snapshotted_time;
} __packed;
struct dm_pool_metadata {
	struct hlist_node hash;

	struct block_device *bdev;
	struct dm_block_manager *bm;
	struct dm_space_map *metadata_sm;
	struct dm_space_map *data_sm;
	struct dm_transaction_manager *tm;
	struct dm_transaction_manager *nb_tm;

	/*
	 * Two-level btree.
	 * First level holds thin_dev_t.
	 * Second level holds mappings.
	 */
	struct dm_btree_info info;

	/*
	 * Non-blocking version of the above.
	 */
	struct dm_btree_info nb_info;

	/*
	 * Just the top level for deleting whole devices.
	 */
	struct dm_btree_info tl_info;

	/*
	 * Just the bottom level for creating new devices.
	 */
	struct dm_btree_info bl_info;

	/*
	 * Describes the device details btree.
	 */
	struct dm_btree_info details_info;

	struct rw_semaphore root_lock;
	uint32_t time;
	dm_block_t root;
	dm_block_t details_root;
	struct list_head thin_devices;
	uint64_t trans_id;
	unsigned long flags;
	sector_t data_block_size;

	/*
	 * Pre-commit callback.
	 *
	 * This allows the thin provisioning target to run a callback before
	 * the metadata are committed.
	 */
	dm_pool_pre_commit_fn pre_commit_fn;
	void *pre_commit_context;

	/*
	 * We reserve a section of the metadata for commit overhead.
	 * All reported space does *not* include this.
	 */
	dm_block_t metadata_reserve;

	/*
	 * Set if a transaction has to be aborted but the attempt to roll back
	 * to the previous (good) transaction failed.  The only pool metadata
	 * operation possible in this state is the closing of the device.
	 */
	bool fail_io:1;

	/*
	 * Set once a thin-pool has been accessed through one of the interfaces
	 * that imply the pool is in-service (e.g. thin devices created/deleted,
	 * thin-pool message, metadata snapshots, etc).
	 */
	bool in_service:1;

	/*
	 * Reading the space map roots can fail, so we read them into these
	 * buffers before the superblock is locked and updated.
	 */
	__u8 data_space_map_root[SPACE_MAP_ROOT_SIZE];
	__u8 metadata_space_map_root[SPACE_MAP_ROOT_SIZE];
};

struct dm_thin_device {
	struct list_head list;
	struct dm_pool_metadata *pmd;
	dm_thin_id id;

	int open_count;
	bool changed:1;
	bool aborted_with_changes:1;
	uint64_t mapped_blocks;
	uint64_t transaction_id;
	uint32_t creation_time;
	uint32_t snapshotted_time;
};

/*----------------------------------------------------------------
 * superblock validator
 *--------------------------------------------------------------*/

#define SUPERBLOCK_CSUM_XOR 160774

static void sb_prepare_for_write(struct dm_block_validator *v,
				 struct dm_block *b,
				 size_t block_size)
{
	struct thin_disk_superblock *disk_super = dm_block_data(b);

	disk_super->blocknr = cpu_to_le64(dm_block_location(b));
	disk_super->csum = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
						      block_size - sizeof(__le32),
						      SUPERBLOCK_CSUM_XOR));
}

static int sb_check(struct dm_block_validator *v,
		    struct dm_block *b,
		    size_t block_size)
{
	struct thin_disk_superblock *disk_super = dm_block_data(b);
	__le32 csum_le;

	if (dm_block_location(b) != le64_to_cpu(disk_super->blocknr)) {
		DMERR("sb_check failed: blocknr %llu: wanted %llu",
		      le64_to_cpu(disk_super->blocknr),
		      (unsigned long long)dm_block_location(b));
		return -ENOTBLK;
	}

	if (le64_to_cpu(disk_super->magic) != THIN_SUPERBLOCK_MAGIC) {
		DMERR("sb_check failed: magic %llu: wanted %llu",
		      le64_to_cpu(disk_super->magic),
		      (unsigned long long)THIN_SUPERBLOCK_MAGIC);
		return -EILSEQ;
	}

	csum_le = cpu_to_le32(dm_bm_checksum(&disk_super->flags,
					     block_size - sizeof(__le32),
					     SUPERBLOCK_CSUM_XOR));
	if (csum_le != disk_super->csum) {
		DMERR("sb_check failed: csum %u: wanted %u",
		      le32_to_cpu(csum_le), le32_to_cpu(disk_super->csum));
		return -EILSEQ;
	}

	return 0;
}

static struct dm_block_validator sb_validator = {
	.name = "superblock",
	.prepare_for_write = sb_prepare_for_write,
	.check = sb_check
};

/*----------------------------------------------------------------
 * Methods for the btree value types
 *--------------------------------------------------------------*/

static uint64_t pack_block_time(dm_block_t b, uint32_t t)
{
	return (b << 24) | t;
}

static void unpack_block_time(uint64_t v, dm_block_t *b, uint32_t *t)
{
	*b = v >> 24;
	*t = v & ((1 << 24) - 1);
}
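
/*
 * A worked example: pack_block_time(1000, 5) puts 1000 in the top 40
 * bits and 5 in the low 24, giving 0x3E8000005; unpack_block_time()
 * recovers block 1000 and time 5.
 */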

/*
 * It's more efficient to call dm_sm_{inc,dec}_blocks as few times as
 * possible.  'with_runs' reads contiguous runs of blocks, and calls the
 * given sm function.
 */
typedef int (*run_fn)(struct dm_space_map *, dm_block_t, dm_block_t);

static void with_runs(struct dm_space_map *sm, const __le64 *value_le, unsigned count, run_fn fn)
{
	uint64_t b, begin, end;
	uint32_t t;
	bool in_run = false;
	unsigned i;

	for (i = 0; i < count; i++, value_le++) {
		/* We know value_le is 8 byte aligned */
		unpack_block_time(le64_to_cpu(*value_le), &b, &t);

		if (in_run) {
			if (b == end) {
				end++;
			} else {
				fn(sm, begin, end);
				begin = b;
				end = b + 1;
			}
		} else {
			in_run = true;
			begin = b;
			end = b + 1;
		}
	}

	if (in_run)
		fn(sm, begin, end);
}

static void data_block_inc(void *context, const void *value_le, unsigned count)
{
	with_runs((struct dm_space_map *) context,
		  (const __le64 *) value_le, count, dm_sm_inc_blocks);
}

static void data_block_dec(void *context, const void *value_le, unsigned count)
{
	with_runs((struct dm_space_map *) context,
		  (const __le64 *) value_le, count, dm_sm_dec_blocks);
}
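
/*
 * For example, four mappings whose data blocks are 7, 8, 9 and 12 result
 * in just two calls: fn(sm, 7, 10) and fn(sm, 12, 13) - the end of each
 * run is one past the last block.
 */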

static int data_block_equal(void *context, const void *value1_le, const void *value2_le)
{
	__le64 v1_le, v2_le;
	uint64_t b1, b2;
	uint32_t t;

	memcpy(&v1_le, value1_le, sizeof(v1_le));
	memcpy(&v2_le, value2_le, sizeof(v2_le));
	unpack_block_time(le64_to_cpu(v1_le), &b1, &t);
	unpack_block_time(le64_to_cpu(v2_le), &b2, &t);

	return b1 == b2;
}

static void subtree_inc(void *context, const void *value, unsigned count)
{
	struct dm_btree_info *info = context;
	const __le64 *root_le = value;
	unsigned i;

	for (i = 0; i < count; i++, root_le++)
		dm_tm_inc(info->tm, le64_to_cpu(*root_le));
}

static void subtree_dec(void *context, const void *value, unsigned count)
{
	struct dm_btree_info *info = context;
	const __le64 *root_le = value;
	unsigned i;

	for (i = 0; i < count; i++, root_le++)
		if (dm_btree_del(info, le64_to_cpu(*root_le)))
			DMERR("btree delete failed");
}

static int subtree_equal(void *context, const void *value1_le, const void *value2_le)
{
	__le64 v1_le, v2_le;

	memcpy(&v1_le, value1_le, sizeof(v1_le));
	memcpy(&v2_le, value2_le, sizeof(v2_le));

	return v1_le == v2_le;
}

/*----------------------------------------------------------------*/

/*
 * Variant that is used for in-core only changes or code that
 * shouldn't put the pool in service on its own (e.g. commit).
 */
static inline void pmd_write_lock_in_core(struct dm_pool_metadata *pmd)
	__acquires(pmd->root_lock)
{
	down_write(&pmd->root_lock);
}

static inline void pmd_write_lock(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	if (unlikely(!pmd->in_service))
		pmd->in_service = true;
}

static inline void pmd_write_unlock(struct dm_pool_metadata *pmd)
	__releases(pmd->root_lock)
{
	up_write(&pmd->root_lock);
}

/*----------------------------------------------------------------*/

static int superblock_lock_zero(struct dm_pool_metadata *pmd,
				struct dm_block **sblock)
{
	return dm_bm_write_lock_zero(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				     &sb_validator, sblock);
}

static int superblock_lock(struct dm_pool_metadata *pmd,
			   struct dm_block **sblock)
{
	return dm_bm_write_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				&sb_validator, sblock);
}

static int __superblock_all_zeroes(struct dm_block_manager *bm, int *result)
{
	int r;
	unsigned i;
	struct dm_block *b;
	__le64 *data_le, zero = cpu_to_le64(0);
	unsigned block_size = dm_bm_block_size(bm) / sizeof(__le64);

	/*
	 * We can't use a validator here - it may be all zeroes.
	 */
	r = dm_bm_read_lock(bm, THIN_SUPERBLOCK_LOCATION, NULL, &b);
	if (r)
		return r;

	data_le = dm_block_data(b);
	*result = 1;
	for (i = 0; i < block_size; i++) {
		if (data_le[i] != zero) {
			*result = 0;
			break;
		}
	}

	dm_bm_unlock(b);

	return 0;
}

static void __setup_btree_details(struct dm_pool_metadata *pmd)
{
	pmd->info.tm = pmd->tm;
	pmd->info.levels = 2;
	pmd->info.value_type.context = pmd->data_sm;
	pmd->info.value_type.size = sizeof(__le64);
	pmd->info.value_type.inc = data_block_inc;
	pmd->info.value_type.dec = data_block_dec;
	pmd->info.value_type.equal = data_block_equal;

	memcpy(&pmd->nb_info, &pmd->info, sizeof(pmd->nb_info));
	pmd->nb_info.tm = pmd->nb_tm;

	pmd->tl_info.tm = pmd->tm;
	pmd->tl_info.levels = 1;
	pmd->tl_info.value_type.context = &pmd->bl_info;
	pmd->tl_info.value_type.size = sizeof(__le64);
	pmd->tl_info.value_type.inc = subtree_inc;
	pmd->tl_info.value_type.dec = subtree_dec;
	pmd->tl_info.value_type.equal = subtree_equal;

	pmd->bl_info.tm = pmd->tm;
	pmd->bl_info.levels = 1;
	pmd->bl_info.value_type.context = pmd->data_sm;
	pmd->bl_info.value_type.size = sizeof(__le64);
	pmd->bl_info.value_type.inc = data_block_inc;
	pmd->bl_info.value_type.dec = data_block_dec;
	pmd->bl_info.value_type.equal = data_block_equal;

	pmd->details_info.tm = pmd->tm;
	pmd->details_info.levels = 1;
	pmd->details_info.value_type.context = NULL;
	pmd->details_info.value_type.size = sizeof(struct disk_device_details);
	pmd->details_info.value_type.inc = NULL;
	pmd->details_info.value_type.dec = NULL;
	pmd->details_info.value_type.equal = NULL;
}
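
/*
 * To summarise the trees configured above:
 *
 *	info		2 levels: (dev id, virtual block) -> block_time
 *	nb_info		as info, but on the non-blocking tm
 *	tl_info		top level only: dev id -> mapping tree root
 *	bl_info		bottom level only: virtual block -> block_time
 *	details_info	dev id -> struct disk_device_details
 */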

static int save_sm_roots(struct dm_pool_metadata *pmd)
{
	int r;
	size_t len;

	r = dm_sm_root_size(pmd->metadata_sm, &len);
	if (r < 0)
		return r;

	r = dm_sm_copy_root(pmd->metadata_sm, &pmd->metadata_space_map_root, len);
	if (r < 0)
		return r;

	r = dm_sm_root_size(pmd->data_sm, &len);
	if (r < 0)
		return r;

	return dm_sm_copy_root(pmd->data_sm, &pmd->data_space_map_root, len);
}

static void copy_sm_roots(struct dm_pool_metadata *pmd,
			  struct thin_disk_superblock *disk)
{
	memcpy(&disk->metadata_space_map_root,
	       &pmd->metadata_space_map_root,
	       sizeof(pmd->metadata_space_map_root));

	memcpy(&disk->data_space_map_root,
	       &pmd->data_space_map_root,
	       sizeof(pmd->data_space_map_root));
}

static int __write_initial_superblock(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;
	sector_t bdev_size = bdev_nr_sectors(pmd->bdev);

	if (bdev_size > THIN_METADATA_MAX_SECTORS)
		bdev_size = THIN_METADATA_MAX_SECTORS;

	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	r = superblock_lock_zero(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	disk_super->flags = 0;
	memset(disk_super->uuid, 0, sizeof(disk_super->uuid));
	disk_super->magic = cpu_to_le64(THIN_SUPERBLOCK_MAGIC);
	disk_super->version = cpu_to_le32(THIN_VERSION);
	disk_super->time = 0;
	disk_super->trans_id = 0;
	disk_super->held_root = 0;

	copy_sm_roots(pmd, disk_super);

	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->metadata_block_size = cpu_to_le32(THIN_METADATA_BLOCK_SIZE);
	disk_super->metadata_nr_blocks = cpu_to_le64(bdev_size >> SECTOR_TO_BLOCK_SHIFT);
	disk_super->data_block_size = cpu_to_le32(pmd->data_block_size);

	return dm_tm_commit(pmd->tm, sblock);
}

static int __format_metadata(struct dm_pool_metadata *pmd)
{
	int r;

	r = dm_tm_create_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
				 &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		DMERR("tm_create_with_sm failed");
		return r;
	}

	pmd->data_sm = dm_sm_disk_create(pmd->tm, 0);
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_create failed");
		r = PTR_ERR(pmd->data_sm);
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	__setup_btree_details(pmd);

	r = dm_btree_empty(&pmd->info, &pmd->root);
	if (r < 0)
		goto bad_cleanup_nb_tm;

	r = dm_btree_empty(&pmd->details_info, &pmd->details_root);
	if (r < 0) {
		DMERR("couldn't create devices root");
		goto bad_cleanup_nb_tm;
	}

	r = __write_initial_superblock(pmd);
	if (r)
		goto bad_cleanup_nb_tm;

	return 0;

bad_cleanup_nb_tm:
	dm_tm_destroy(pmd->nb_tm);
bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	dm_sm_destroy(pmd->metadata_sm);

	return r;
}

static int __check_incompat_features(struct thin_disk_superblock *disk_super,
				     struct dm_pool_metadata *pmd)
{
	uint32_t features;

	features = le32_to_cpu(disk_super->incompat_flags) & ~THIN_FEATURE_INCOMPAT_SUPP;
	if (features) {
		DMERR("could not access metadata due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	/*
	 * Check for read-only metadata to skip the following RDWR checks.
	 */
	if (bdev_read_only(pmd->bdev))
		return 0;

	features = le32_to_cpu(disk_super->compat_ro_flags) & ~THIN_FEATURE_COMPAT_RO_SUPP;
	if (features) {
		DMERR("could not access metadata RDWR due to unsupported optional features (%lx).",
		      (unsigned long)features);
		return -EINVAL;
	}

	return 0;
}

static int __open_metadata(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r < 0) {
		DMERR("couldn't read superblock");
		return r;
	}

	disk_super = dm_block_data(sblock);

	/* Verify the data block size hasn't changed */
	if (le32_to_cpu(disk_super->data_block_size) != pmd->data_block_size) {
		DMERR("changing the data block size (from %u to %llu) is not supported",
		      le32_to_cpu(disk_super->data_block_size),
		      (unsigned long long)pmd->data_block_size);
		r = -EINVAL;
		goto bad_unlock_sblock;
	}

	r = __check_incompat_features(disk_super, pmd);
	if (r < 0)
		goto bad_unlock_sblock;

	r = dm_tm_open_with_sm(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			       disk_super->metadata_space_map_root,
			       sizeof(disk_super->metadata_space_map_root),
			       &pmd->tm, &pmd->metadata_sm);
	if (r < 0) {
		DMERR("tm_open_with_sm failed");
		goto bad_unlock_sblock;
	}

	pmd->data_sm = dm_sm_disk_open(pmd->tm, disk_super->data_space_map_root,
				       sizeof(disk_super->data_space_map_root));
	if (IS_ERR(pmd->data_sm)) {
		DMERR("sm_disk_open failed");
		r = PTR_ERR(pmd->data_sm);
		goto bad_cleanup_tm;
	}

	pmd->nb_tm = dm_tm_create_non_blocking_clone(pmd->tm);
	if (!pmd->nb_tm) {
		DMERR("could not create non-blocking clone tm");
		r = -ENOMEM;
		goto bad_cleanup_data_sm;
	}

	/*
	 * When opening pool metadata, setting the roots here is redundant
	 * because they will be set again in __begin_transaction().  But the
	 * pool aborting process really needs the last transaction's roots
	 * to avoid accessing a broken btree.
	 */
	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
	pmd->details_root = le64_to_cpu(disk_super->device_details_root);

	__setup_btree_details(pmd);
	dm_bm_unlock(sblock);

	return 0;

bad_cleanup_data_sm:
	dm_sm_destroy(pmd->data_sm);
bad_cleanup_tm:
	dm_tm_destroy(pmd->tm);
	dm_sm_destroy(pmd->metadata_sm);
bad_unlock_sblock:
	dm_bm_unlock(sblock);

	return r;
}

static int __open_or_format_metadata(struct dm_pool_metadata *pmd, bool format_device)
{
	int r, unformatted;

	r = __superblock_all_zeroes(pmd->bm, &unformatted);
	if (r)
		return r;

	if (unformatted)
		return format_device ? __format_metadata(pmd) : -EPERM;

	return __open_metadata(pmd);
}

static int __create_persistent_data_objects(struct dm_pool_metadata *pmd, bool format_device)
{
	int r;

	pmd->bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
					  THIN_MAX_CONCURRENT_LOCKS);
	if (IS_ERR(pmd->bm)) {
		DMERR("could not create block manager");
		r = PTR_ERR(pmd->bm);
		pmd->bm = NULL;
		return r;
	}

	r = __open_or_format_metadata(pmd, format_device);
	if (r) {
		dm_block_manager_destroy(pmd->bm);
		pmd->bm = NULL;
	}

	return r;
}

static void __destroy_persistent_data_objects(struct dm_pool_metadata *pmd,
					      bool destroy_bm)
{
	dm_sm_destroy(pmd->data_sm);
	dm_sm_destroy(pmd->metadata_sm);
	dm_tm_destroy(pmd->nb_tm);
	dm_tm_destroy(pmd->tm);
	if (destroy_bm)
		dm_block_manager_destroy(pmd->bm);
}

static int __begin_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * We re-read the superblock every time.  Shouldn't need to do this
	 * really.
	 */
	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	pmd->time = le32_to_cpu(disk_super->time);
	pmd->root = le64_to_cpu(disk_super->data_mapping_root);
	pmd->details_root = le64_to_cpu(disk_super->device_details_root);
	pmd->trans_id = le64_to_cpu(disk_super->trans_id);
	pmd->flags = le32_to_cpu(disk_super->flags);
	pmd->data_block_size = le32_to_cpu(disk_super->data_block_size);

	dm_bm_unlock(sblock);
	return 0;
}

static int __write_changed_details(struct dm_pool_metadata *pmd)
{
	int r;
	struct dm_thin_device *td, *tmp;
	struct disk_device_details details;
	uint64_t key;

	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (!td->changed)
			continue;

		key = td->id;

		details.mapped_blocks = cpu_to_le64(td->mapped_blocks);
		details.transaction_id = cpu_to_le64(td->transaction_id);
		details.creation_time = cpu_to_le32(td->creation_time);
		details.snapshotted_time = cpu_to_le32(td->snapshotted_time);
		__dm_bless_for_disk(&details);

		r = dm_btree_insert(&pmd->details_info, pmd->details_root,
				    &key, &details, &pmd->details_root);
		if (r)
			return r;

		if (td->open_count)
			td->changed = false;
		else {
			list_del(&td->list);
			kfree(td);
		}
	}

	return 0;
}
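
/*
 * The ordering below is what makes the commit crash-safe: all space map
 * and btree changes are written out first, and only then does
 * dm_tm_commit() publish them by rewriting the superblock.  Until that
 * single block reaches the disk, a reader still sees the previous
 * transaction.
 */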
static int __commit_transaction(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	/*
	 * We need to know if the thin_disk_superblock exceeds a 512-byte sector.
	 */
	BUILD_BUG_ON(sizeof(struct thin_disk_superblock) > 512);
	BUG_ON(!rwsem_is_locked(&pmd->root_lock));

	if (unlikely(!pmd->in_service))
		return 0;

	if (pmd->pre_commit_fn) {
		r = pmd->pre_commit_fn(pmd->pre_commit_context);
		if (r < 0) {
			DMERR("pre-commit callback failed");
			return r;
		}
	}

	r = __write_changed_details(pmd);
	if (r < 0)
		return r;

	r = dm_sm_commit(pmd->data_sm);
	if (r < 0)
		return r;

	r = dm_tm_pre_commit(pmd->tm);
	if (r < 0)
		return r;

	r = save_sm_roots(pmd);
	if (r < 0)
		return r;

	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	disk_super->time = cpu_to_le32(pmd->time);
	disk_super->data_mapping_root = cpu_to_le64(pmd->root);
	disk_super->device_details_root = cpu_to_le64(pmd->details_root);
	disk_super->trans_id = cpu_to_le64(pmd->trans_id);
	disk_super->flags = cpu_to_le32(pmd->flags);

	copy_sm_roots(pmd, disk_super);

	return dm_tm_commit(pmd->tm, sblock);
}

static void __set_metadata_reserve(struct dm_pool_metadata *pmd)
{
	int r;
	dm_block_t total;
	dm_block_t max_blocks = 4096;	/* 16M */

	r = dm_sm_get_nr_blocks(pmd->metadata_sm, &total);
	if (r) {
		DMERR("could not get size of metadata device");
		pmd->metadata_reserve = max_blocks;
	} else
		pmd->metadata_reserve = min(max_blocks, div_u64(total, 10));
}

struct dm_pool_metadata *dm_pool_metadata_open(struct block_device *bdev,
					       sector_t data_block_size,
					       bool format_device)
{
	int r;
	struct dm_pool_metadata *pmd;

	pmd = kmalloc(sizeof(*pmd), GFP_KERNEL);
	if (!pmd) {
		DMERR("could not allocate metadata struct");
		return ERR_PTR(-ENOMEM);
	}

	init_rwsem(&pmd->root_lock);
	pmd->time = 0;
	INIT_LIST_HEAD(&pmd->thin_devices);
	pmd->fail_io = false;
	pmd->in_service = false;
	pmd->bdev = bdev;
	pmd->data_block_size = data_block_size;
	pmd->pre_commit_fn = NULL;
	pmd->pre_commit_context = NULL;

	r = __create_persistent_data_objects(pmd, format_device);
	if (r) {
		kfree(pmd);
		return ERR_PTR(r);
	}

	r = __begin_transaction(pmd);
	if (r < 0) {
		if (dm_pool_metadata_close(pmd) < 0)
			DMWARN("%s: dm_pool_metadata_close() failed.", __func__);
		return ERR_PTR(r);
	}

	__set_metadata_reserve(pmd);

	return pmd;
}

int dm_pool_metadata_close(struct dm_pool_metadata *pmd)
{
	int r;
	unsigned open_devices = 0;
	struct dm_thin_device *td, *tmp;

	down_read(&pmd->root_lock);
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (td->open_count)
			open_devices++;
		else {
			list_del(&td->list);
			kfree(td);
		}
	}
	up_read(&pmd->root_lock);

	if (open_devices) {
		DMERR("attempt to close pmd when %u device(s) are still open",
		      open_devices);
		return -EBUSY;
	}

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io && !dm_bm_is_read_only(pmd->bm)) {
		r = __commit_transaction(pmd);
		if (r < 0)
			DMWARN("%s: __commit_transaction() failed, error = %d",
			       __func__, r);
	}
	pmd_write_unlock(pmd);
	if (!pmd->fail_io)
		__destroy_persistent_data_objects(pmd, true);

	kfree(pmd);
	return 0;
}

/*
 * __open_device: Returns @td corresponding to device with id @dev,
 * creating it if @create is set and incrementing @td->open_count.
 * On failure, @td is undefined.
 */
static int __open_device(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, int create,
			 struct dm_thin_device **td)
{
	int r, changed = 0;
	struct dm_thin_device *td2;
	uint64_t key = dev;
	struct disk_device_details details_le;

	/*
	 * If the device is already open, return it.
	 */
	list_for_each_entry(td2, &pmd->thin_devices, list)
		if (td2->id == dev) {
			/*
			 * May not create an already-open device.
			 */
			if (create)
				return -EEXIST;

			td2->open_count++;
			*td = td2;
			return 0;
		}

	/*
	 * Check the device exists.
	 */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, &details_le);
	if (r) {
		if (r != -ENODATA || !create)
			return r;

		/*
		 * Create new device.
		 */
		changed = 1;
		details_le.mapped_blocks = 0;
		details_le.transaction_id = cpu_to_le64(pmd->trans_id);
		details_le.creation_time = cpu_to_le32(pmd->time);
		details_le.snapshotted_time = cpu_to_le32(pmd->time);
	}

	*td = kmalloc(sizeof(**td), GFP_NOIO);
	if (!*td)
		return -ENOMEM;

	(*td)->pmd = pmd;
	(*td)->id = dev;
	(*td)->open_count = 1;
	(*td)->changed = changed;
	(*td)->aborted_with_changes = false;
	(*td)->mapped_blocks = le64_to_cpu(details_le.mapped_blocks);
	(*td)->transaction_id = le64_to_cpu(details_le.transaction_id);
	(*td)->creation_time = le32_to_cpu(details_le.creation_time);
	(*td)->snapshotted_time = le32_to_cpu(details_le.snapshotted_time);

	list_add(&(*td)->list, &pmd->thin_devices);

	return 0;
}

static void __close_device(struct dm_thin_device *td)
{
	--td->open_count;
}

static int __create_thin(struct dm_pool_metadata *pmd,
			 dm_thin_id dev)
{
	int r;
	dm_block_t dev_root;
	uint64_t key = dev;
	struct dm_thin_device *td;
	__le64 value;

	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &key, NULL);
	if (!r)
		return -EEXIST;

	/*
	 * Create an empty btree for the mappings.
	 */
	r = dm_btree_empty(&pmd->bl_info, &dev_root);
	if (r)
		return r;

	/*
	 * Insert it into the main mapping tree.
	 */
	value = cpu_to_le64(dev_root);
	__dm_bless_for_disk(&value);
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}

	r = __open_device(pmd, dev, 1, &td);
	if (r) {
		dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
		dm_btree_del(&pmd->bl_info, dev_root);
		return r;
	}
	__close_device(td);

	return r;
}

int dm_pool_create_thin(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __create_thin(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}

static int __set_snapshot_details(struct dm_pool_metadata *pmd,
				  struct dm_thin_device *snap,
				  dm_thin_id origin, uint32_t time)
{
	int r;
	struct dm_thin_device *td;

	r = __open_device(pmd, origin, 0, &td);
	if (r)
		return r;

	td->changed = true;
	td->snapshotted_time = time;

	snap->mapped_blocks = td->mapped_blocks;
	snap->snapshotted_time = time;
	__close_device(td);

	return 0;
}

static int __create_snap(struct dm_pool_metadata *pmd,
			 dm_thin_id dev, dm_thin_id origin)
{
	int r;
	dm_block_t origin_root;
	uint64_t key = origin, dev_key = dev;
	struct dm_thin_device *td;
	__le64 value;

	/* check this device is unused */
	r = dm_btree_lookup(&pmd->details_info, pmd->details_root,
			    &dev_key, NULL);
	if (!r)
		return -EEXIST;

	/* find the mapping tree for the origin */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &key, &value);
	if (r)
		return r;
	origin_root = le64_to_cpu(value);

	/* clone the origin, an inc will do */
	dm_tm_inc(pmd->tm, origin_root);

	/* insert into the main mapping tree */
	value = cpu_to_le64(origin_root);
	__dm_bless_for_disk(&value);
	key = dev;
	r = dm_btree_insert(&pmd->tl_info, pmd->root, &key, &value, &pmd->root);
	if (r) {
		dm_tm_dec(pmd->tm, origin_root);
		return r;
	}

	pmd->time++;

	r = __open_device(pmd, dev, 1, &td);
	if (r)
		goto bad;

	r = __set_snapshot_details(pmd, td, origin, pmd->time);
	__close_device(td);

	if (r)
		goto bad;

	return 0;

bad:
	dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	dm_btree_remove(&pmd->details_info, pmd->details_root,
			&key, &pmd->details_root);
	return r;
}
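
/*
 * Note that __create_snap() bumps pmd->time before opening the new
 * device, so the recorded snapshotted_time is newer than the time held
 * in every mapping that existed when the snapshot was taken;
 * __snapshotted_since() below relies on this to detect shared blocks.
 */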

int dm_pool_create_snap(struct dm_pool_metadata *pmd,
			dm_thin_id dev,
			dm_thin_id origin)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __create_snap(pmd, dev, origin);
	pmd_write_unlock(pmd);

	return r;
}

static int __delete_device(struct dm_pool_metadata *pmd, dm_thin_id dev)
{
	int r;
	uint64_t key = dev;
	struct dm_thin_device *td;

	/* TODO: failure should mark the transaction invalid */
	r = __open_device(pmd, dev, 0, &td);
	if (r)
		return r;

	if (td->open_count > 1) {
		__close_device(td);
		return -EBUSY;
	}

	list_del(&td->list);
	kfree(td);
	r = dm_btree_remove(&pmd->details_info, pmd->details_root,
			    &key, &pmd->details_root);
	if (r)
		return r;

	r = dm_btree_remove(&pmd->tl_info, pmd->root, &key, &pmd->root);
	if (r)
		return r;

	return 0;
}

int dm_pool_delete_thin_device(struct dm_pool_metadata *pmd,
			       dm_thin_id dev)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __delete_device(pmd, dev);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_set_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t current_id,
					uint64_t new_id)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);

	if (pmd->fail_io)
		goto out;

	if (pmd->trans_id != current_id) {
		DMERR("mismatched transaction id");
		goto out;
	}

	pmd->trans_id = new_id;
	r = 0;

out:
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_get_metadata_transaction_id(struct dm_pool_metadata *pmd,
					uint64_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		*result = pmd->trans_id;
		r = 0;
	}
	up_read(&pmd->root_lock);

	return r;
}

static int __reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r, inc;
	struct thin_disk_superblock *disk_super;
	struct dm_block *copy, *sblock;
	dm_block_t held_root;

	/*
	 * We commit to ensure the btree roots which we increment in a
	 * moment are up to date.
	 */
	r = __commit_transaction(pmd);
	if (r < 0) {
		DMWARN("%s: __commit_transaction() failed, error = %d",
		       __func__, r);
		return r;
	}

	/*
	 * Copy the superblock.
	 */
	dm_sm_inc_block(pmd->metadata_sm, THIN_SUPERBLOCK_LOCATION);
	r = dm_tm_shadow_block(pmd->tm, THIN_SUPERBLOCK_LOCATION,
			       &sb_validator, &copy, &inc);
	if (r)
		return r;

	BUG_ON(!inc);

	held_root = dm_block_location(copy);
	disk_super = dm_block_data(copy);

	if (le64_to_cpu(disk_super->held_root)) {
		DMWARN("Pool metadata snapshot already exists: release this before taking another.");

		dm_tm_dec(pmd->tm, held_root);
		dm_tm_unlock(pmd->tm, copy);
		return -EBUSY;
	}

	/*
	 * Wipe the spacemap since we're not publishing this.
	 */
	memset(&disk_super->data_space_map_root, 0,
	       sizeof(disk_super->data_space_map_root));
	memset(&disk_super->metadata_space_map_root, 0,
	       sizeof(disk_super->metadata_space_map_root));

	/*
	 * Increment the data structures that need to be preserved.
	 */
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->data_mapping_root));
	dm_tm_inc(pmd->tm, le64_to_cpu(disk_super->device_details_root));
	dm_tm_unlock(pmd->tm, copy);

	/*
	 * Write the held root into the superblock.
	 */
	r = superblock_lock(pmd, &sblock);
	if (r) {
		dm_tm_dec(pmd->tm, held_root);
		return r;
	}

	disk_super = dm_block_data(sblock);
	disk_super->held_root = cpu_to_le64(held_root);
	dm_bm_unlock(sblock);
	return 0;
}

int dm_pool_reserve_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __reserve_metadata_snap(pmd);
	pmd_write_unlock(pmd);

	return r;
}

static int __release_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock, *copy;
	dm_block_t held_root;

	r = superblock_lock(pmd, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	held_root = le64_to_cpu(disk_super->held_root);
	disk_super->held_root = cpu_to_le64(0);

	dm_bm_unlock(sblock);

	if (!held_root) {
		DMWARN("No pool metadata snapshot found: nothing to release.");
		return -EINVAL;
	}

	r = dm_tm_read_lock(pmd->tm, held_root, &sb_validator, &copy);
	if (r)
		return r;

	disk_super = dm_block_data(copy);
	dm_btree_del(&pmd->info, le64_to_cpu(disk_super->data_mapping_root));
	dm_btree_del(&pmd->details_info, le64_to_cpu(disk_super->device_details_root));
	dm_sm_dec_block(pmd->metadata_sm, held_root);

	dm_tm_unlock(pmd->tm, copy);

	return 0;
}

int dm_pool_release_metadata_snap(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __release_metadata_snap(pmd);
	pmd_write_unlock(pmd);

	return r;
}

static int __get_metadata_snap(struct dm_pool_metadata *pmd,
			       dm_block_t *result)
{
	int r;
	struct thin_disk_superblock *disk_super;
	struct dm_block *sblock;

	r = dm_bm_read_lock(pmd->bm, THIN_SUPERBLOCK_LOCATION,
			    &sb_validator, &sblock);
	if (r)
		return r;

	disk_super = dm_block_data(sblock);
	*result = le64_to_cpu(disk_super->held_root);

	dm_bm_unlock(sblock);

	return 0;
}

int dm_pool_get_metadata_snap(struct dm_pool_metadata *pmd,
			      dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __get_metadata_snap(pmd, result);
	up_read(&pmd->root_lock);

	return r;
}
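
/*
 * Userspace typically drives the calls above via the thin-pool target's
 * "reserve_metadata_snap" and "release_metadata_snap" messages and the
 * pool status line, allowing tools such as thin_dump to read a
 * consistent copy of the metadata while the pool remains live.
 */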

int dm_pool_open_thin_device(struct dm_pool_metadata *pmd, dm_thin_id dev,
			     struct dm_thin_device **td)
{
	int r = -EINVAL;

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io)
		r = __open_device(pmd, dev, 0, td);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_close_thin_device(struct dm_thin_device *td)
{
	pmd_write_lock_in_core(td->pmd);
	__close_device(td);
	pmd_write_unlock(td->pmd);

	return 0;
}

dm_thin_id dm_thin_dev_id(struct dm_thin_device *td)
{
	return td->id;
}

/*
 * Check whether @time (of block creation) is older than @td's last snapshot.
 * If so then the associated block is shared with the last snapshot device.
 * Any block on a device created *after* the device last got snapshotted is
 * necessarily not shared.
 */
static bool __snapshotted_since(struct dm_thin_device *td, uint32_t time)
{
	return td->snapshotted_time > time;
}

static void unpack_lookup_result(struct dm_thin_device *td, __le64 value,
				 struct dm_thin_lookup_result *result)
{
	uint64_t block_time = 0;
	dm_block_t exception_block;
	uint32_t exception_time;

	block_time = le64_to_cpu(value);
	unpack_block_time(block_time, &exception_block, &exception_time);
	result->block = exception_block;
	result->shared = __snapshotted_since(td, exception_time);
}

static int __find_block(struct dm_thin_device *td, dm_block_t block,
			int can_issue_io, struct dm_thin_lookup_result *result)
{
	int r;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };
	struct dm_btree_info *info;

	if (can_issue_io)
		info = &pmd->info;
	else
		info = &pmd->nb_info;

	r = dm_btree_lookup(info, pmd->root, keys, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}

int dm_thin_find_block(struct dm_thin_device *td, dm_block_t block,
		       int can_issue_io, struct dm_thin_lookup_result *result)
{
	int r;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (pmd->fail_io) {
		up_read(&pmd->root_lock);
		return -EINVAL;
	}

	r = __find_block(td, block, can_issue_io, result);

	up_read(&pmd->root_lock);
	return r;
}

static int __find_next_mapped_block(struct dm_thin_device *td, dm_block_t block,
				    dm_block_t *vblock,
				    struct dm_thin_lookup_result *result)
{
	int r;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	r = dm_btree_lookup_next(&pmd->info, pmd->root, keys, vblock, &value);
	if (!r)
		unpack_lookup_result(td, value, result);

	return r;
}

static int __find_mapped_range(struct dm_thin_device *td,
			       dm_block_t begin, dm_block_t end,
			       dm_block_t *thin_begin, dm_block_t *thin_end,
			       dm_block_t *pool_begin, bool *maybe_shared)
{
	int r;
	dm_block_t pool_end;
	struct dm_thin_lookup_result lookup;

	if (end < begin)
		return -ENODATA;

	r = __find_next_mapped_block(td, begin, &begin, &lookup);
	if (r)
		return r;

	if (begin >= end)
		return -ENODATA;

	*thin_begin = begin;
	*pool_begin = lookup.block;
	*maybe_shared = lookup.shared;

	begin++;
	pool_end = *pool_begin + 1;
	while (begin != end) {
		r = __find_block(td, begin, true, &lookup);
		if (r) {
			if (r == -ENODATA)
				break;

			return r;
		}

		if ((lookup.block != pool_end) ||
		    (lookup.shared != *maybe_shared))
			break;

		pool_end++;
		begin++;
	}

	*thin_end = begin;
	return 0;
}

int dm_thin_find_mapped_range(struct dm_thin_device *td,
			      dm_block_t begin, dm_block_t end,
			      dm_block_t *thin_begin, dm_block_t *thin_end,
			      dm_block_t *pool_begin, bool *maybe_shared)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		r = __find_mapped_range(td, begin, end, thin_begin, thin_end,
					pool_begin, maybe_shared);
	}
	up_read(&pmd->root_lock);

	return r;
}
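
/*
 * For example, if virtual blocks 5..7 map to pool blocks 100..102 and
 * block 8 maps to pool block 200 (all with the same shared flag), then
 * dm_thin_find_mapped_range(td, 0, 100, ...) returns the thin range
 * [5, 8) with pool_begin 100; calling again with begin = 8 returns the
 * single-block range at pool block 200.
 */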

static int __insert(struct dm_thin_device *td, dm_block_t block,
		    dm_block_t data_block)
{
	int r, inserted;
	__le64 value;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[2] = { td->id, block };

	value = cpu_to_le64(pack_block_time(data_block, pmd->time));
	__dm_bless_for_disk(&value);

	r = dm_btree_insert_notify(&pmd->info, pmd->root, keys, &value,
				   &pmd->root, &inserted);
	if (r)
		return r;

	td->changed = true;
	if (inserted)
		td->mapped_blocks++;

	return 0;
}

int dm_thin_insert_block(struct dm_thin_device *td, dm_block_t block,
			 dm_block_t data_block)
{
	int r = -EINVAL;

	pmd_write_lock(td->pmd);
	if (!td->pmd->fail_io)
		r = __insert(td, block, data_block);
	pmd_write_unlock(td->pmd);

	return r;
}

static int __remove_range(struct dm_thin_device *td, dm_block_t begin, dm_block_t end)
{
	int r;
	unsigned count, total_count = 0;
	struct dm_pool_metadata *pmd = td->pmd;
	dm_block_t keys[1] = { td->id };
	__le64 value;
	dm_block_t mapping_root;

	/*
	 * Find the mapping tree.
	 */
	r = dm_btree_lookup(&pmd->tl_info, pmd->root, keys, &value);
	if (r)
		return r;

	/*
	 * Remove from the mapping tree, taking care to inc the
	 * ref count so it doesn't get deleted.
	 */
	mapping_root = le64_to_cpu(value);
	dm_tm_inc(pmd->tm, mapping_root);
	r = dm_btree_remove(&pmd->tl_info, pmd->root, keys, &pmd->root);
	if (r)
		return r;

	/*
	 * Remove leaves stops at the first unmapped entry, so we have to
	 * loop round finding mapped ranges.
	 */
	while (begin < end) {
		r = dm_btree_lookup_next(&pmd->bl_info, mapping_root, &begin, &begin, &value);
		if (r == -ENODATA)
			break;

		if (r)
			return r;

		if (begin >= end)
			break;

		r = dm_btree_remove_leaves(&pmd->bl_info, mapping_root, &begin, end, &mapping_root, &count);
		if (r)
			return r;

		total_count += count;
	}

	td->mapped_blocks -= total_count;
	td->changed = true;

	/*
	 * Reinsert the mapping tree.
	 */
	value = cpu_to_le64(mapping_root);
	__dm_bless_for_disk(&value);
	return dm_btree_insert(&pmd->tl_info, pmd->root, keys, &value, &pmd->root);
}

int dm_thin_remove_range(struct dm_thin_device *td,
			 dm_block_t begin, dm_block_t end)
{
	int r = -EINVAL;

	pmd_write_lock(td->pmd);
	if (!td->pmd->fail_io)
		r = __remove_range(td, begin, end);
	pmd_write_unlock(td->pmd);

	return r;
}

int dm_pool_block_is_shared(struct dm_pool_metadata *pmd, dm_block_t b, bool *result)
{
	int r;
	uint32_t ref_count;

	down_read(&pmd->root_lock);
	r = dm_sm_get_count(pmd->data_sm, b, &ref_count);
	if (!r)
		*result = (ref_count > 1);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_inc_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = 0;

	pmd_write_lock(pmd);
	r = dm_sm_inc_blocks(pmd->data_sm, b, e);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_dec_data_range(struct dm_pool_metadata *pmd, dm_block_t b, dm_block_t e)
{
	int r = 0;

	pmd_write_lock(pmd);
	r = dm_sm_dec_blocks(pmd->data_sm, b, e);
	pmd_write_unlock(pmd);

	return r;
}

bool dm_thin_changed_this_transaction(struct dm_thin_device *td)
{
	bool r;

	down_read(&td->pmd->root_lock);
	r = td->changed;
	up_read(&td->pmd->root_lock);

	return r;
}

bool dm_pool_changed_this_transaction(struct dm_pool_metadata *pmd)
{
	bool r = false;
	struct dm_thin_device *td, *tmp;

	down_read(&pmd->root_lock);
	list_for_each_entry_safe(td, tmp, &pmd->thin_devices, list) {
		if (td->changed) {
			r = td->changed;
			break;
		}
	}
	up_read(&pmd->root_lock);

	return r;
}

bool dm_thin_aborted_changes(struct dm_thin_device *td)
{
	bool r;

	down_read(&td->pmd->root_lock);
	r = td->aborted_with_changes;
	up_read(&td->pmd->root_lock);

	return r;
}

int dm_pool_alloc_data_block(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = dm_sm_new_block(pmd->data_sm, result);
	pmd_write_unlock(pmd);

	return r;
}
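
/*
 * Callers batch up changes (inserts, removes, allocations) and then
 * commit; nothing is durable until dm_pool_commit_metadata() returns,
 * and a crash beforehand simply rolls back to the previous transaction.
 */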
int dm_pool_commit_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;

	/*
	 * Care is taken to not have commit be what
	 * triggers putting the thin-pool in-service.
	 */
	pmd_write_lock_in_core(pmd);
	if (pmd->fail_io)
		goto out;

	r = __commit_transaction(pmd);
	if (r < 0)
		goto out;

	/*
	 * Open the next transaction.
	 */
	r = __begin_transaction(pmd);
out:
	pmd_write_unlock(pmd);
	return r;
}

static void __set_abort_with_changes_flags(struct dm_pool_metadata *pmd)
{
	struct dm_thin_device *td;

	list_for_each_entry(td, &pmd->thin_devices, list)
		td->aborted_with_changes = td->changed;
}

int dm_pool_abort_metadata(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;
	struct dm_block_manager *old_bm = NULL, *new_bm = NULL;

	/* fail_io is double-checked with pmd->root_lock held below */
	if (unlikely(pmd->fail_io))
		return r;

	/*
	 * The replacement block manager (new_bm) is created, and the old_bm
	 * destroyed, outside of pmd root_lock to avoid the ABBA deadlock that
	 * would otherwise result (due to the life-cycle of the shrinker
	 * associated with the block manager's bufio client vs pmd root_lock).
	 * - must take shrinker_rwsem without holding pmd->root_lock
	 */
	new_bm = dm_block_manager_create(pmd->bdev, THIN_METADATA_BLOCK_SIZE << SECTOR_SHIFT,
					 THIN_MAX_CONCURRENT_LOCKS);

	pmd_write_lock(pmd);
	if (pmd->fail_io) {
		pmd_write_unlock(pmd);
		goto out;
	}

	__set_abort_with_changes_flags(pmd);
	__destroy_persistent_data_objects(pmd, false);
	old_bm = pmd->bm;
	if (IS_ERR(new_bm)) {
		DMERR("could not create block manager during abort");
		pmd->bm = NULL;
		r = PTR_ERR(new_bm);
		goto out_unlock;
	}

	pmd->bm = new_bm;
	r = __open_or_format_metadata(pmd, false);
	if (r) {
		pmd->bm = NULL;
		goto out_unlock;
	}
	new_bm = NULL;
out_unlock:
	if (r)
		pmd->fail_io = true;
	pmd_write_unlock(pmd);
	dm_block_manager_destroy(old_bm);
out:
	if (new_bm && !IS_ERR(new_bm))
		dm_block_manager_destroy(new_bm);

	return r;
}

int dm_pool_get_free_block_count(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_free_metadata_block_count(struct dm_pool_metadata *pmd,
					  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_free(pmd->metadata_sm, result);

	if (!r) {
		if (*result < pmd->metadata_reserve)
			*result = 0;
		else
			*result -= pmd->metadata_reserve;
	}
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_metadata_dev_size(struct dm_pool_metadata *pmd,
				  dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->metadata_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_pool_get_data_dev_size(struct dm_pool_metadata *pmd, dm_block_t *result)
{
	int r = -EINVAL;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = dm_sm_get_nr_blocks(pmd->data_sm, result);
	up_read(&pmd->root_lock);

	return r;
}

int dm_thin_get_mapped_count(struct dm_thin_device *td, dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io) {
		*result = td->mapped_blocks;
		r = 0;
	}
	up_read(&pmd->root_lock);

	return r;
}

static int __highest_block(struct dm_thin_device *td, dm_block_t *result)
{
	int r;
	__le64 value_le;
	dm_block_t thin_root;
	struct dm_pool_metadata *pmd = td->pmd;

	r = dm_btree_lookup(&pmd->tl_info, pmd->root, &td->id, &value_le);
	if (r)
		return r;

	thin_root = le64_to_cpu(value_le);

	return dm_btree_find_highest_key(&pmd->bl_info, thin_root, result);
}

int dm_thin_get_highest_mapped_block(struct dm_thin_device *td,
				     dm_block_t *result)
{
	int r = -EINVAL;
	struct dm_pool_metadata *pmd = td->pmd;

	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		r = __highest_block(td, result);
	up_read(&pmd->root_lock);

	return r;
}

static int __resize_space_map(struct dm_space_map *sm, dm_block_t new_count)
{
	int r;
	dm_block_t old_count;

	r = dm_sm_get_nr_blocks(sm, &old_count);
	if (r)
		return r;

	if (new_count == old_count)
		return 0;

	if (new_count < old_count) {
		DMERR("cannot reduce size of space map");
		return -EINVAL;
	}

	return dm_sm_extend(sm, new_count - old_count);
}

int dm_pool_resize_data_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io)
		r = __resize_space_map(pmd->data_sm, new_count);
	pmd_write_unlock(pmd);

	return r;
}

int dm_pool_resize_metadata_dev(struct dm_pool_metadata *pmd, dm_block_t new_count)
{
	int r = -EINVAL;

	pmd_write_lock(pmd);
	if (!pmd->fail_io) {
		r = __resize_space_map(pmd->metadata_sm, new_count);
		if (!r)
			__set_metadata_reserve(pmd);
	}
	pmd_write_unlock(pmd);

	return r;
}

void dm_pool_metadata_read_only(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_only(pmd->bm);
	pmd_write_unlock(pmd);
}

void dm_pool_metadata_read_write(struct dm_pool_metadata *pmd)
{
	pmd_write_lock_in_core(pmd);
	dm_bm_set_read_write(pmd->bm);
	pmd_write_unlock(pmd);
}

int dm_pool_register_metadata_threshold(struct dm_pool_metadata *pmd,
					dm_block_t threshold,
					dm_sm_threshold_fn fn,
					void *context)
{
	int r = -EINVAL;

	pmd_write_lock_in_core(pmd);
	if (!pmd->fail_io) {
		r = dm_sm_register_threshold_callback(pmd->metadata_sm,
						      threshold, fn, context);
	}
	pmd_write_unlock(pmd);

	return r;
}

void dm_pool_register_pre_commit_callback(struct dm_pool_metadata *pmd,
					  dm_pool_pre_commit_fn fn,
					  void *context)
{
	pmd_write_lock_in_core(pmd);
	pmd->pre_commit_fn = fn;
	pmd->pre_commit_context = context;
	pmd_write_unlock(pmd);
}

int dm_pool_metadata_set_needs_check(struct dm_pool_metadata *pmd)
{
	int r = -EINVAL;
	struct dm_block *sblock;
	struct thin_disk_superblock *disk_super;

	pmd_write_lock(pmd);
	if (pmd->fail_io)
		goto out;

	pmd->flags |= THIN_METADATA_NEEDS_CHECK_FLAG;

	r = superblock_lock(pmd, &sblock);
	if (r) {
		DMERR("couldn't lock superblock");
		goto out;
	}

	disk_super = dm_block_data(sblock);
	disk_super->flags = cpu_to_le32(pmd->flags);

	dm_bm_unlock(sblock);
out:
	pmd_write_unlock(pmd);
	return r;
}

bool dm_pool_metadata_needs_check(struct dm_pool_metadata *pmd)
{
	bool needs_check;

	down_read(&pmd->root_lock);
	needs_check = pmd->flags & THIN_METADATA_NEEDS_CHECK_FLAG;
	up_read(&pmd->root_lock);

	return needs_check;
}

void dm_pool_issue_prefetches(struct dm_pool_metadata *pmd)
{
	down_read(&pmd->root_lock);
	if (!pmd->fail_io)
		dm_tm_issue_prefetches(pmd->tm);
	up_read(&pmd->root_lock);
}