/*
 * Copyright (C) 2012 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-array.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "array"

/*----------------------------------------------------------------*/

/*
 * The array is implemented as a fully populated btree, which points to
 * blocks that contain the packed values.  This is more space-efficient
 * than a plain btree, since we don't store one key per value.
 */
struct array_block {
	__le32 csum;
	__le32 max_entries;
	__le32 nr_entries;
	__le32 value_size;
	__le64 blocknr; /* Block this node is supposed to live in. */
} __packed;

/*----------------------------------------------------------------*/

/*
 * Validator methods.  As usual we calculate a checksum, and also write the
 * block location into the header (paranoia about SSDs remapping areas by
 * mistake).
 */
#define CSUM_XOR 595846735

static void array_block_prepare_for_write(struct dm_block_validator *v,
					   struct dm_block *b,
					   size_t size_of_block)
{
	struct array_block *bh_le = dm_block_data(b);

	bh_le->blocknr = cpu_to_le64(dm_block_location(b));
	bh_le->csum = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
						 size_of_block - sizeof(__le32),
						 CSUM_XOR));
}

static int array_block_check(struct dm_block_validator *v,
			     struct dm_block *b,
			     size_t size_of_block)
{
	struct array_block *bh_le = dm_block_data(b);
	__le32 csum_disk;

	if (dm_block_location(b) != le64_to_cpu(bh_le->blocknr)) {
		DMERR_LIMIT("array_block_check failed: blocknr %llu != wanted %llu",
			    (unsigned long long) le64_to_cpu(bh_le->blocknr),
			    (unsigned long long) dm_block_location(b));
		return -ENOTBLK;
	}

	csum_disk = cpu_to_le32(dm_bm_checksum(&bh_le->max_entries,
					       size_of_block - sizeof(__le32),
					       CSUM_XOR));
	if (csum_disk != bh_le->csum) {
		DMERR_LIMIT("array_block_check failed: csum %u != wanted %u",
			    (unsigned) le32_to_cpu(csum_disk),
			    (unsigned) le32_to_cpu(bh_le->csum));
		return -EILSEQ;
	}

	return 0;
}

static struct dm_block_validator array_validator = {
	.name = "array",
	.prepare_for_write = array_block_prepare_for_write,
	.check = array_block_check
};

/*----------------------------------------------------------------*/

/*
 * Functions for manipulating the array blocks.
 */

/*
 * Returns a pointer to a value within an array block.
 *
 * index - The index into _this_ specific block.
 */
static void *element_at(struct dm_array_info *info, struct array_block *ab,
			unsigned index)
{
	unsigned char *entry = (unsigned char *) (ab + 1);

	entry += index * info->value_type.size;

	return entry;
}

/*
 * Utility function that calls one of the value_type methods on every value
 * in an array block.
 */
static void on_entries(struct dm_array_info *info, struct array_block *ab,
		       void (*fn)(void *, const void *))
{
	unsigned i, nr_entries = le32_to_cpu(ab->nr_entries);

	for (i = 0; i < nr_entries; i++)
		fn(info->value_type.context, element_at(info, ab, i));
}
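/*
 * Illustration (not used by the code): since struct array_block is
 * __packed, the header occupies exactly 4 + 4 + 4 + 4 + 8 = 24 bytes and
 * the packed values start immediately after it.  So for a hypothetical
 * 8 byte value type, element_at(info, ab, i) resolves to
 * (char *) ab + 24 + 8 * i.
 */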
/*
 * Increment every value in an array block.
 */
static void inc_ablock_entries(struct dm_array_info *info, struct array_block *ab)
{
	struct dm_btree_value_type *vt = &info->value_type;

	if (vt->inc)
		on_entries(info, ab, vt->inc);
}

/*
 * Decrement every value in an array block.
 */
static void dec_ablock_entries(struct dm_array_info *info, struct array_block *ab)
{
	struct dm_btree_value_type *vt = &info->value_type;

	if (vt->dec)
		on_entries(info, ab, vt->dec);
}

/*
 * Each array block can hold this many values.
 */
static uint32_t calc_max_entries(size_t value_size, size_t size_of_block)
{
	return (size_of_block - sizeof(struct array_block)) / value_size;
}
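/*
 * Worked example (illustrative only): with a 4096 byte metadata block
 * and an 8 byte value type, calc_max_entries() yields
 * (4096 - 24) / 8 = 509 entries per array block.
 */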
/*
 * Allocate a new array block.  The caller will need to unlock block.
 */
static int alloc_ablock(struct dm_array_info *info, size_t size_of_block,
			uint32_t max_entries,
			struct dm_block **block, struct array_block **ab)
{
	int r;

	r = dm_tm_new_block(info->btree_info.tm, &array_validator, block);
	if (r)
		return r;

	(*ab) = dm_block_data(*block);
	(*ab)->max_entries = cpu_to_le32(max_entries);
	(*ab)->nr_entries = cpu_to_le32(0);
	(*ab)->value_size = cpu_to_le32(info->value_type.size);

	return 0;
}

/*
 * Pad an array block out with a particular value.  Every instance added
 * is incremented via the value_type's inc method.  new_nr must always be
 * more than the current number of entries.
 */
static void fill_ablock(struct dm_array_info *info, struct array_block *ab,
			const void *value, unsigned new_nr)
{
	unsigned i;
	uint32_t nr_entries;
	struct dm_btree_value_type *vt = &info->value_type;

	BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
	BUG_ON(new_nr < le32_to_cpu(ab->nr_entries));

	nr_entries = le32_to_cpu(ab->nr_entries);
	for (i = nr_entries; i < new_nr; i++) {
		if (vt->inc)
			vt->inc(vt->context, value);
		memcpy(element_at(info, ab, i), value, vt->size);
	}
	ab->nr_entries = cpu_to_le32(new_nr);
}

/*
 * Remove some entries from the back of an array block.  Every value
 * removed will be decremented.  new_nr must be <= the current number of
 * entries.
 */
static void trim_ablock(struct dm_array_info *info, struct array_block *ab,
			unsigned new_nr)
{
	unsigned i;
	uint32_t nr_entries;
	struct dm_btree_value_type *vt = &info->value_type;

	BUG_ON(new_nr > le32_to_cpu(ab->max_entries));
	BUG_ON(new_nr > le32_to_cpu(ab->nr_entries));

	nr_entries = le32_to_cpu(ab->nr_entries);
	for (i = nr_entries; i > new_nr; i--)
		if (vt->dec)
			vt->dec(vt->context, element_at(info, ab, i - 1));
	ab->nr_entries = cpu_to_le32(new_nr);
}

/*
 * Read locks a block, and coerces it to an array block.  The caller must
 * unlock 'block' when finished.
 */
static int get_ablock(struct dm_array_info *info, dm_block_t b,
		      struct dm_block **block, struct array_block **ab)
{
	int r;

	r = dm_tm_read_lock(info->btree_info.tm, b, &array_validator, block);
	if (r)
		return r;

	*ab = dm_block_data(*block);
	return 0;
}

/*
 * Unlocks an array block.
 */
static int unlock_ablock(struct dm_array_info *info, struct dm_block *block)
{
	return dm_tm_unlock(info->btree_info.tm, block);
}

/*----------------------------------------------------------------*/

/*
 * Btree manipulation.
 */

/*
 * Looks up an array block in the btree, and then read locks it.
 *
 * index is the index of the array_block (ie. the array index /
 * max_entries).
 */
static int lookup_ablock(struct dm_array_info *info, dm_block_t root,
			 unsigned index, struct dm_block **block,
			 struct array_block **ab)
{
	int r;
	uint64_t key = index;
	__le64 block_le;

	r = dm_btree_lookup(&info->btree_info, root, &key, &block_le);
	if (r)
		return r;

	return get_ablock(info, le64_to_cpu(block_le), block, ab);
}

/*
 * Insert an array block into the btree.  The block is _not_ unlocked.
 */
static int insert_ablock(struct dm_array_info *info, uint64_t index,
			 struct dm_block *block, dm_block_t *root)
{
	__le64 block_le = cpu_to_le64(dm_block_location(block));

	__dm_bless_for_disk(block_le);
	return dm_btree_insert(&info->btree_info, *root, &index, &block_le, root);
}

/*
 * Looks up an array block in the btree.  Then shadows it, and updates the
 * btree to point to this new shadow.  'root' is an input/output parameter
 * for both the current root block, and the new one.
 */
static int shadow_ablock(struct dm_array_info *info, dm_block_t *root,
			 unsigned index, struct dm_block **block,
			 struct array_block **ab)
{
	int r, inc;
	uint64_t key = index;
	dm_block_t b;
	__le64 block_le;

	/*
	 * lookup
	 */
	r = dm_btree_lookup(&info->btree_info, *root, &key, &block_le);
	if (r)
		return r;
	b = le64_to_cpu(block_le);

	/*
	 * shadow
	 */
	r = dm_tm_shadow_block(info->btree_info.tm, b,
			       &array_validator, block, &inc);
	if (r)
		return r;

	*ab = dm_block_data(*block);
	if (inc)
		inc_ablock_entries(info, *ab);

	/*
	 * Reinsert.
	 *
	 * The shadow op will often be a noop.  Only insert if it really
	 * copied data.
	 */
	if (dm_block_location(*block) != b)
		r = insert_ablock(info, index, *block, root);

	return r;
}
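/*
 * Note (illustrative sketch of the copy-on-write behaviour): broadly,
 * the first time a shared block is shadowed within a transaction it is
 * copied to a new location and 'inc' is set, so the entry reference
 * counts must be bumped; shadowing a block that has already been
 * shadowed in this transaction hands back the same location with 'inc'
 * clear, and the reinsert above becomes a noop.
 */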
/*
 * Allocate a new array block, and fill it with some values.
 */
static int insert_new_ablock(struct dm_array_info *info, size_t size_of_block,
			     uint32_t max_entries,
			     unsigned block_index, uint32_t nr,
			     const void *value, dm_block_t *root)
{
	int r;
	struct dm_block *block;
	struct array_block *ab;

	r = alloc_ablock(info, size_of_block, max_entries, &block, &ab);
	if (r)
		return r;

	fill_ablock(info, ab, value, nr);
	r = insert_ablock(info, block_index, block, root);
	unlock_ablock(info, block);

	return r;
}

static int insert_full_ablocks(struct dm_array_info *info, size_t size_of_block,
			       unsigned begin_block, unsigned end_block,
			       unsigned max_entries, const void *value,
			       dm_block_t *root)
{
	int r = 0;

	for (; !r && begin_block != end_block; begin_block++)
		r = insert_new_ablock(info, size_of_block, max_entries,
				      begin_block, max_entries, value, root);

	return r;
}

/*
 * There are a bunch of functions involved with resizing an array.  This
 * structure holds information that is commonly needed by them.  Purely
 * here to reduce parameter count.
 */
struct resize {
	/*
	 * Describes the array.
	 */
	struct dm_array_info *info;

	/*
	 * The current root of the array.  This gets updated.
	 */
	dm_block_t root;

	/*
	 * Metadata block size.  Used to calculate the nr entries in an
	 * array block.
	 */
	size_t size_of_block;

	/*
	 * Maximum nr entries in an array block.
	 */
	unsigned max_entries;

	/*
	 * nr of completely full blocks in the array.
	 *
	 * 'old' refers to before the resize, 'new' after.
	 */
	unsigned old_nr_full_blocks, new_nr_full_blocks;

	/*
	 * Number of entries in the final block.  0 iff the array consists
	 * solely of full blocks.
	 */
	unsigned old_nr_entries_in_last_block, new_nr_entries_in_last_block;

	/*
	 * The default value used when growing the array.
	 */
	const void *value;
};

/*
 * Removes a consecutive set of array blocks from the btree.  The values
 * in each block are decremented as a side effect of the btree remove.
 *
 * begin_index - the index of the first array block to remove.
 * end_index - the one-past-the-end value.  ie. this block is not removed.
 */
static int drop_blocks(struct resize *resize, unsigned begin_index,
		       unsigned end_index)
{
	int r;

	while (begin_index != end_index) {
		uint64_t key = begin_index++;

		r = dm_btree_remove(&resize->info->btree_info, resize->root,
				    &key, &resize->root);
		if (r)
			return r;
	}

	return 0;
}

/*
 * Calculates how many blocks are needed for the array.
 */
static unsigned total_nr_blocks_needed(unsigned nr_full_blocks,
				       unsigned nr_entries_in_last_block)
{
	return nr_full_blocks + (nr_entries_in_last_block ? 1 : 0);
}

/*
 * Shrink an array.
 */
static int shrink(struct resize *resize)
{
	int r;
	unsigned begin, end;
	struct dm_block *block;
	struct array_block *ab;

	/*
	 * Lose some blocks from the back?
	 */
	if (resize->new_nr_full_blocks < resize->old_nr_full_blocks) {
		begin = total_nr_blocks_needed(resize->new_nr_full_blocks,
					       resize->new_nr_entries_in_last_block);
		end = total_nr_blocks_needed(resize->old_nr_full_blocks,
					     resize->old_nr_entries_in_last_block);

		r = drop_blocks(resize, begin, end);
		if (r)
			return r;
	}

	/*
	 * Trim the new tail block
	 */
	if (resize->new_nr_entries_in_last_block) {
		r = shadow_ablock(resize->info, &resize->root,
				  resize->new_nr_full_blocks, &block, &ab);
		if (r)
			return r;

		trim_ablock(resize->info, ab, resize->new_nr_entries_in_last_block);
		unlock_ablock(resize->info, block);
	}

	return 0;
}
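/*
 * Worked example (illustrative, using the 509 entries-per-block figure
 * from above): resizing from old_size = 1000 to new_size = 1200 gives
 * old_nr_full_blocks = 1, old_nr_entries_in_last_block = 491,
 * new_nr_full_blocks = 2 and new_nr_entries_in_last_block = 182.
 */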
/*
 * Grow an array.
 */
static int grow_extend_tail_block(struct resize *resize, uint32_t new_nr_entries)
{
	int r;
	struct dm_block *block;
	struct array_block *ab;

	r = shadow_ablock(resize->info, &resize->root,
			  resize->old_nr_full_blocks, &block, &ab);
	if (r)
		return r;

	fill_ablock(resize->info, ab, resize->value, new_nr_entries);
	unlock_ablock(resize->info, block);

	return r;
}

static int grow_add_tail_block(struct resize *resize)
{
	return insert_new_ablock(resize->info, resize->size_of_block,
				 resize->max_entries,
				 resize->new_nr_full_blocks,
				 resize->new_nr_entries_in_last_block,
				 resize->value, &resize->root);
}

static int grow_needs_more_blocks(struct resize *resize)
{
	int r;
	unsigned old_nr_blocks = resize->old_nr_full_blocks;

	if (resize->old_nr_entries_in_last_block > 0) {
		old_nr_blocks++;

		r = grow_extend_tail_block(resize, resize->max_entries);
		if (r)
			return r;
	}

	r = insert_full_ablocks(resize->info, resize->size_of_block,
				old_nr_blocks,
				resize->new_nr_full_blocks,
				resize->max_entries, resize->value,
				&resize->root);
	if (r)
		return r;

	if (resize->new_nr_entries_in_last_block)
		r = grow_add_tail_block(resize);

	return r;
}

static int grow(struct resize *resize)
{
	if (resize->new_nr_full_blocks > resize->old_nr_full_blocks)
		return grow_needs_more_blocks(resize);

	else if (resize->old_nr_entries_in_last_block)
		return grow_extend_tail_block(resize, resize->new_nr_entries_in_last_block);

	else
		return grow_add_tail_block(resize);
}
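/*
 * Continuing the example above: growing from 1000 to 1200 entries takes
 * the grow_needs_more_blocks() path.  The partial tail block is first
 * extended from 491 to 509 entries, no additional full blocks need
 * inserting (old_nr_blocks == new_nr_full_blocks == 2), and a new tail
 * block holding the remaining 182 entries is appended.
 */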
/*----------------------------------------------------------------*/

/*
 * These are the value_type functions for the btree elements, which point
 * to array blocks.
 */
static void block_inc(void *context, const void *value)
{
	__le64 block_le;
	struct dm_array_info *info = context;

	memcpy(&block_le, value, sizeof(block_le));
	dm_tm_inc(info->btree_info.tm, le64_to_cpu(block_le));
}

static void block_dec(void *context, const void *value)
{
	int r;
	uint64_t b;
	__le64 block_le;
	uint32_t ref_count;
	struct dm_block *block;
	struct array_block *ab;
	struct dm_array_info *info = context;

	memcpy(&block_le, value, sizeof(block_le));
	b = le64_to_cpu(block_le);

	r = dm_tm_ref(info->btree_info.tm, b, &ref_count);
	if (r) {
		DMERR_LIMIT("couldn't get reference count for block %llu",
			    (unsigned long long) b);
		return;
	}

	if (ref_count == 1) {
		/*
		 * We're about to drop the last reference to this ablock.
		 * So we need to decrement the ref count of the contents.
		 */
		r = get_ablock(info, b, &block, &ab);
		if (r) {
			DMERR_LIMIT("couldn't get array block %llu",
				    (unsigned long long) b);
			return;
		}

		dec_ablock_entries(info, ab);
		unlock_ablock(info, block);
	}

	dm_tm_dec(info->btree_info.tm, b);
}

static int block_equal(void *context, const void *value1, const void *value2)
{
	return !memcmp(value1, value2, sizeof(__le64));
}

/*----------------------------------------------------------------*/

void dm_array_info_init(struct dm_array_info *info,
			struct dm_transaction_manager *tm,
			struct dm_btree_value_type *vt)
{
	struct dm_btree_value_type *bvt = &info->btree_info.value_type;

	memcpy(&info->value_type, vt, sizeof(info->value_type));
	info->btree_info.tm = tm;
	info->btree_info.levels = 1;

	bvt->context = info;
	bvt->size = sizeof(__le64);
	bvt->inc = block_inc;
	bvt->dec = block_dec;
	bvt->equal = block_equal;
}
EXPORT_SYMBOL_GPL(dm_array_info_init);

int dm_array_empty(struct dm_array_info *info, dm_block_t *root)
{
	return dm_btree_empty(&info->btree_info, root);
}
EXPORT_SYMBOL_GPL(dm_array_empty);
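/*
 * Typical lifecycle (hypothetical caller; 'tm' and the element
 * value_type 'vt' are assumed to come from the client's metadata setup):
 *
 *	struct dm_array_info info;
 *	dm_block_t root;
 *	__le64 default_le = cpu_to_le64(0);
 *	int r;
 *
 *	dm_array_info_init(&info, tm, &vt);
 *	r = dm_array_empty(&info, &root);
 *	if (!r) {
 *		__dm_bless_for_disk(&default_le);
 *		r = dm_array_resize(&info, root, 0, 1200, &default_le, &root);
 *	}
 *	...
 *	r = dm_array_del(&info, root);
 */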
static int array_resize(struct dm_array_info *info, dm_block_t root,
			uint32_t old_size, uint32_t new_size,
			const void *value, dm_block_t *new_root)
{
	int r;
	struct resize resize;

	if (old_size == new_size)
		return 0;

	resize.info = info;
	resize.root = root;
	resize.size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
	resize.max_entries = calc_max_entries(info->value_type.size,
					      resize.size_of_block);

	resize.old_nr_full_blocks = old_size / resize.max_entries;
	resize.old_nr_entries_in_last_block = old_size % resize.max_entries;
	resize.new_nr_full_blocks = new_size / resize.max_entries;
	resize.new_nr_entries_in_last_block = new_size % resize.max_entries;
	resize.value = value;

	r = ((new_size > old_size) ? grow : shrink)(&resize);
	if (r)
		return r;

	*new_root = resize.root;
	return 0;
}

int dm_array_resize(struct dm_array_info *info, dm_block_t root,
		    uint32_t old_size, uint32_t new_size,
		    const void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	int r = array_resize(info, root, old_size, new_size, value, new_root);

	__dm_unbless_for_disk(value);
	return r;
}
EXPORT_SYMBOL_GPL(dm_array_resize);

int dm_array_del(struct dm_array_info *info, dm_block_t root)
{
	return dm_btree_del(&info->btree_info, root);
}
EXPORT_SYMBOL_GPL(dm_array_del);

int dm_array_get_value(struct dm_array_info *info, dm_block_t root,
		       uint32_t index, void *value_le)
{
	int r;
	struct dm_block *block;
	struct array_block *ab;
	size_t size_of_block;
	unsigned entry, max_entries;

	size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
	max_entries = calc_max_entries(info->value_type.size, size_of_block);

	r = lookup_ablock(info, root, index / max_entries, &block, &ab);
	if (r)
		return r;

	entry = index % max_entries;
	if (entry >= le32_to_cpu(ab->nr_entries))
		r = -ENODATA;
	else
		memcpy(value_le, element_at(info, ab, entry),
		       info->value_type.size);

	unlock_ablock(info, block);
	return r;
}
EXPORT_SYMBOL_GPL(dm_array_get_value);

static int array_set_value(struct dm_array_info *info, dm_block_t root,
			   uint32_t index, const void *value, dm_block_t *new_root)
{
	int r;
	struct dm_block *block;
	struct array_block *ab;
	size_t size_of_block;
	unsigned max_entries;
	unsigned entry;
	void *old_value;
	struct dm_btree_value_type *vt = &info->value_type;

	size_of_block = dm_bm_block_size(dm_tm_get_bm(info->btree_info.tm));
	max_entries = calc_max_entries(info->value_type.size, size_of_block);

	r = shadow_ablock(info, &root, index / max_entries, &block, &ab);
	if (r)
		return r;
	*new_root = root;

	entry = index % max_entries;
	if (entry >= le32_to_cpu(ab->nr_entries)) {
		r = -ENODATA;
		goto out;
	}

	old_value = element_at(info, ab, entry);
	if (vt->dec &&
	    (!vt->equal || !vt->equal(vt->context, old_value, value))) {
		vt->dec(vt->context, old_value);
		if (vt->inc)
			vt->inc(vt->context, value);
	}

	memcpy(old_value, value, info->value_type.size);

out:
	unlock_ablock(info, block);
	return r;
}

int dm_array_set_value(struct dm_array_info *info, dm_block_t root,
		       uint32_t index, const void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	int r;

	r = array_set_value(info, root, index, value, new_root);
	__dm_unbless_for_disk(value);
	return r;
}
EXPORT_SYMBOL_GPL(dm_array_set_value);
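/*
 * Example get/set round trip (hypothetical caller; error handling
 * elided):
 *
 *	__le64 v_le = cpu_to_le64(42);
 *
 *	__dm_bless_for_disk(&v_le);
 *	r = dm_array_set_value(&info, root, 100, &v_le, &root);
 *	...
 *	r = dm_array_get_value(&info, root, 100, &v_le);
 *
 * Note that a set may shadow an array block and hence the btree above
 * it, so the returned root must replace the caller's copy.
 */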
struct walk_info {
	struct dm_array_info *info;
	int (*fn)(void *context, uint64_t key, void *leaf);
	void *context;
};

static int walk_ablock(void *context, uint64_t *keys, void *leaf)
{
	struct walk_info *wi = context;

	int r;
	unsigned i;
	__le64 block_le;
	unsigned nr_entries, max_entries;
	struct dm_block *block;
	struct array_block *ab;

	memcpy(&block_le, leaf, sizeof(block_le));
	r = get_ablock(wi->info, le64_to_cpu(block_le), &block, &ab);
	if (r)
		return r;

	max_entries = le32_to_cpu(ab->max_entries);
	nr_entries = le32_to_cpu(ab->nr_entries);
	for (i = 0; i < nr_entries; i++) {
		r = wi->fn(wi->context, keys[0] * max_entries + i,
			   element_at(wi->info, ab, i));

		if (r)
			break;
	}

	unlock_ablock(wi->info, block);
	return r;
}

int dm_array_walk(struct dm_array_info *info, dm_block_t root,
		  int (*fn)(void *, uint64_t key, void *leaf),
		  void *context)
{
	struct walk_info wi;

	wi.info = info;
	wi.fn = fn;
	wi.context = context;

	return dm_btree_walk(&info->btree_info, root, walk_ablock, &wi);
}
EXPORT_SYMBOL_GPL(dm_array_walk);

/*----------------------------------------------------------------*/
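/*
 * Example walk (hypothetical callback; 'count_set' and its context are
 * illustrative only).  The key passed to the callback is the absolute
 * array index:
 *
 *	static int count_set(void *context, uint64_t key, void *leaf)
 *	{
 *		unsigned *count = context;
 *		__le64 v_le;
 *
 *		memcpy(&v_le, leaf, sizeof(v_le));
 *		if (le64_to_cpu(v_le))
 *			(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned count = 0;
 *	r = dm_array_walk(&info, root, count_set, &count);
 */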