/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree.h"
#include "dm-btree-internal.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*
 * Removing an entry from a btree
 * ==============================
 *
 * A very important constraint for our btree is that no node, except the
 * root, may have fewer than a certain number of entries.
 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 *
 * Ensuring this is complicated by the way we want to only ever hold the
 * locks on 2 nodes concurrently, and only change nodes in a top-to-bottom
 * fashion.
 *
 * Each node may have a left or right sibling.  When descending the spine,
 * if a node contains only MIN_ENTRIES then we try to increase this to at
 * least MIN_ENTRIES + 1.  We do this in the following ways:
 *
 * [A] No siblings => this can only happen if the node is the root, in which
 *     case we copy the child's contents over the root.
 *
 * [B] No left sibling
 *     ==> rebalance(node, right sibling)
 *
 * [C] No right sibling
 *     ==> rebalance(left sibling, node)
 *
 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 *     ==> delete node, adding its contents to left and right
 *
 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 *     ==> rebalance(left, node, right)
 *
 * After these operations it's possible that our original node no longer
 * contains the desired subtree.  For this reason this rebalancing is
 * performed on the children of the current node.  This also avoids having
 * a special case for the root.
 *
 * Once this rebalancing has occurred we can then step into the child node
 * (for internal nodes), or delete the entry (for leaf nodes).
 */
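
/*
 * Roughly, case [A] above corresponds to the single-child collapse at the
 * top of rebalance_children(), cases [B] and [C] to rebalance2(), and
 * cases [D] and [E] to rebalance3(), which ends up in delete_center_node()
 * or redistribute3() respectively.
 */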

/*
 * Some little utilities for moving node data around.
 */
static void node_shift(struct btree_node *n, int shift)
{
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
	uint32_t value_size = le32_to_cpu(n->header.value_size);

	if (shift < 0) {
		shift = -shift;
		BUG_ON(shift > nr_entries);
		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
		memmove(key_ptr(n, 0),
			key_ptr(n, shift),
			(nr_entries - shift) * sizeof(__le64));
		memmove(value_ptr(n, 0),
			value_ptr(n, shift),
			(nr_entries - shift) * value_size);
	} else {
		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
		memmove(key_ptr(n, shift),
			key_ptr(n, 0),
			nr_entries * sizeof(__le64));
		memmove(value_ptr(n, shift),
			value_ptr(n, 0),
			nr_entries * value_size);
	}
}

static int node_copy(struct btree_node *left, struct btree_node *right, int shift)
{
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t value_size = le32_to_cpu(left->header.value_size);

	if (value_size != le32_to_cpu(right->header.value_size)) {
		DMERR("mismatched value size");
		return -EILSEQ;
	}

	if (shift < 0) {
		shift = -shift;

		if (nr_left + shift > le32_to_cpu(left->header.max_entries)) {
			DMERR("bad shift");
			return -EINVAL;
		}

		memcpy(key_ptr(left, nr_left),
		       key_ptr(right, 0),
		       shift * sizeof(__le64));
		memcpy(value_ptr(left, nr_left),
		       value_ptr(right, 0),
		       shift * value_size);
	} else {
		if (shift > le32_to_cpu(right->header.max_entries)) {
			DMERR("bad shift");
			return -EINVAL;
		}

		memcpy(key_ptr(right, 0),
		       key_ptr(left, nr_left - shift),
		       shift * sizeof(__le64));
		memcpy(value_ptr(right, 0),
		       value_ptr(left, nr_left - shift),
		       shift * value_size);
	}

	return 0;
}
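
/*
 * Sign conventions for the two helpers above: node_shift() with a negative
 * shift drops the first |shift| entries by moving the rest towards the
 * front of the node, while a positive shift opens up that many slots at
 * the front.  node_copy() with a negative shift appends the first |shift|
 * entries of right to left, while a positive shift copies the last shift
 * entries of left to the front of right.  Neither helper updates
 * header.nr_entries; callers fix up the counts themselves.
 */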

/*
 * Delete a specific entry from a leaf node.
 */
static void delete_at(struct btree_node *n, unsigned index)
{
	unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
	unsigned nr_to_copy = nr_entries - (index + 1);
	uint32_t value_size = le32_to_cpu(n->header.value_size);

	BUG_ON(index >= nr_entries);

	if (nr_to_copy) {
		memmove(key_ptr(n, index),
			key_ptr(n, index + 1),
			nr_to_copy * sizeof(__le64));

		memmove(value_ptr(n, index),
			value_ptr(n, index + 1),
			nr_to_copy * value_size);
	}

	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}

static unsigned merge_threshold(struct btree_node *n)
{
	return le32_to_cpu(n->header.max_entries) / 3;
}

struct child {
	unsigned index;
	struct dm_block *block;
	struct btree_node *n;
};

static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
		      struct btree_node *parent,
		      unsigned index, struct child *result)
{
	int r, inc;
	dm_block_t root;

	result->index = index;
	root = value64(parent, index);

	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
			       &result->block, &inc);
	if (r)
		return r;

	result->n = dm_block_data(result->block);

	if (inc)
		inc_children(info->tm, result->n, vt);

	*((__le64 *) value_ptr(parent, index)) =
		cpu_to_le64(dm_block_location(result->block));

	return 0;
}

static void exit_child(struct dm_btree_info *info, struct child *c)
{
	dm_tm_unlock(info->tm, c->block);
}

static int shift(struct btree_node *left, struct btree_node *right, int count)
{
	int r;
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);

	if (max_entries != r_max_entries) {
		DMERR("node max_entries mismatch");
		return -EILSEQ;
	}

	if (nr_left - count > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	if (nr_right + count > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	if (!count)
		return 0;

	if (count > 0) {
		node_shift(right, count);
		r = node_copy(left, right, count);
		if (r)
			return r;
	} else {
		r = node_copy(left, right, count);
		if (r)
			return r;
		node_shift(right, count);
	}

	left->header.nr_entries = cpu_to_le32(nr_left - count);
	right->header.nr_entries = cpu_to_le32(nr_right + count);

	return 0;
}
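
/*
 * __rebalance2() below either merges two adjacent children into the left
 * one or shares their entries out evenly.  For instance, if max_entries
 * were 128 then merge_threshold() would return 42, so two children holding
 * fewer than 2 * (42 + 1) = 86 entries between them get merged.
 */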
static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
			struct child *l, struct child *r)
{
	int ret;
	struct btree_node *left = l->n;
	struct btree_node *right = r->n;
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);

	/*
	 * Ensure the number of entries in each child will be greater
	 * than or equal to (max_entries / 3 + 1), so no matter which
	 * child is used for removal, the number will still be no
	 * less than (max_entries / 3).
	 */
	unsigned int threshold = 2 * (merge_threshold(left) + 1);

	if (nr_left + nr_right < threshold) {
		/*
		 * Merge
		 */
		node_copy(left, right, -nr_right);
		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
		delete_at(parent, r->index);

		/*
		 * We need to decrement the right block, but not its
		 * children, since they're still referenced by left.
		 */
		dm_tm_dec(info->tm, dm_block_location(r->block));
	} else {
		/*
		 * Rebalance.
		 */
		unsigned target_left = (nr_left + nr_right) / 2;

		ret = shift(left, right, nr_left - target_left);
		if (ret)
			return ret;
		*key_ptr(parent, r->index) = right->keys[0];
	}

	return 0;
}

static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, unsigned left_index)
{
	int r;
	struct btree_node *parent;
	struct child left, right;

	parent = dm_block_data(shadow_current(s));

	r = init_child(info, vt, parent, left_index, &left);
	if (r)
		return r;

	r = init_child(info, vt, parent, left_index + 1, &right);
	if (r) {
		exit_child(info, &left);
		return r;
	}

	r = __rebalance2(info, parent, &left, &right);

	exit_child(info, &left);
	exit_child(info, &right);

	return r;
}

/*
 * We dump as many entries from center as possible into left, then the rest
 * in right, then rebalance2.  This wastes some cpu, but I want something
 * simple atm.
 */
static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
			      struct child *l, struct child *c, struct child *r,
			      struct btree_node *left, struct btree_node *center, struct btree_node *right,
			      uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	unsigned shift = min(max_entries - nr_left, nr_center);

	if (nr_left + shift > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	node_copy(left, center, -shift);
	left->header.nr_entries = cpu_to_le32(nr_left + shift);

	if (shift != nr_center) {
		shift = nr_center - shift;

		if ((nr_right + shift) > max_entries) {
			DMERR("node shift out of bounds");
			return -EINVAL;
		}

		node_shift(right, shift);
		node_copy(center, right, shift);
		right->header.nr_entries = cpu_to_le32(nr_right + shift);
	}
	*key_ptr(parent, r->index) = right->keys[0];

	delete_at(parent, c->index);
	r->index--;

	dm_tm_dec(info->tm, dm_block_location(c->block));
	return __rebalance2(info, parent, l, r);
}
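
/*
 * redistribute3() below aims for an even split: e.g. 14 entries across the
 * three nodes end up 5/5/4 in left/center/right, with any remainder going
 * first to the left node and then to the center.
 */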

/*
 * Redistributes entries among 3 sibling nodes.
 */
static int redistribute3(struct dm_btree_info *info, struct btree_node *parent,
			 struct child *l, struct child *c, struct child *r,
			 struct btree_node *left, struct btree_node *center, struct btree_node *right,
			 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
	int s, ret;
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	unsigned total = nr_left + nr_center + nr_right;
	unsigned target_right = total / 3;
	unsigned remainder = (target_right * 3) != total;
	unsigned target_left = target_right + remainder;

	BUG_ON(target_left > max_entries);
	BUG_ON(target_right > max_entries);

	if (nr_left < nr_right) {
		s = nr_left - target_left;

		if (s < 0 && nr_center < -s) {
			/* not enough in central node */
			ret = shift(left, center, -nr_center);
			if (ret)
				return ret;

			s += nr_center;
			ret = shift(left, right, s);
			if (ret)
				return ret;

			nr_right += s;
		} else {
			ret = shift(left, center, s);
			if (ret)
				return ret;
		}

		ret = shift(center, right, target_right - nr_right);
		if (ret)
			return ret;
	} else {
		s = target_right - nr_right;
		if (s > 0 && nr_center < s) {
			/* not enough in central node */
			ret = shift(center, right, nr_center);
			if (ret)
				return ret;
			s -= nr_center;
			ret = shift(left, right, s);
			if (ret)
				return ret;
			nr_left -= s;
		} else {
			ret = shift(center, right, s);
			if (ret)
				return ret;
		}

		ret = shift(left, center, nr_left - target_left);
		if (ret)
			return ret;
	}

	*key_ptr(parent, c->index) = center->keys[0];
	*key_ptr(parent, r->index) = right->keys[0];

	return 0;
}

static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
			struct child *l, struct child *c, struct child *r)
{
	struct btree_node *left = l->n;
	struct btree_node *center = c->n;
	struct btree_node *right = r->n;

	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);

	unsigned threshold = merge_threshold(left) * 4 + 1;

	if ((left->header.max_entries != center->header.max_entries) ||
	    (center->header.max_entries != right->header.max_entries)) {
		DMERR("bad btree metadata, max_entries differ");
		return -EILSEQ;
	}

	if ((nr_left + nr_center + nr_right) < threshold) {
		return delete_center_node(info, parent, l, c, r, left, center, right,
					  nr_left, nr_center, nr_right);
	}

	return redistribute3(info, parent, l, c, r, left, center, right,
			     nr_left, nr_center, nr_right);
}
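
/*
 * rebalance3() shadows three adjacent children of the spine's current node
 * (left_index .. left_index + 2).  __rebalance3() then either empties the
 * center node into its siblings (when the three hold fewer than
 * 4 * merge_threshold() + 1 entries in total) or redistributes the entries
 * evenly across all three.
 */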
static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, unsigned left_index)
{
	int r;
	struct btree_node *parent = dm_block_data(shadow_current(s));
	struct child left, center, right;

	/*
	 * FIXME: fill out an array?
	 */
	r = init_child(info, vt, parent, left_index, &left);
	if (r)
		return r;

	r = init_child(info, vt, parent, left_index + 1, &center);
	if (r) {
		exit_child(info, &left);
		return r;
	}

	r = init_child(info, vt, parent, left_index + 2, &right);
	if (r) {
		exit_child(info, &left);
		exit_child(info, &center);
		return r;
	}

	r = __rebalance3(info, parent, &left, &center, &right);

	exit_child(info, &left);
	exit_child(info, &center);
	exit_child(info, &right);

	return r;
}

static int rebalance_children(struct shadow_spine *s,
			      struct dm_btree_info *info,
			      struct dm_btree_value_type *vt, uint64_t key)
{
	int i, r, has_left_sibling, has_right_sibling;
	struct btree_node *n;

	n = dm_block_data(shadow_current(s));

	if (le32_to_cpu(n->header.nr_entries) == 1) {
		struct dm_block *child;
		dm_block_t b = value64(n, 0);

		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
		if (r)
			return r;

		memcpy(n, dm_block_data(child),
		       dm_bm_block_size(dm_tm_get_bm(info->tm)));

		dm_tm_dec(info->tm, dm_block_location(child));
		dm_tm_unlock(info->tm, child);
		return 0;
	}

	i = lower_bound(n, key);
	if (i < 0)
		return -ENODATA;

	has_left_sibling = i > 0;
	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);

	if (!has_left_sibling)
		r = rebalance2(s, info, vt, i);

	else if (!has_right_sibling)
		r = rebalance2(s, info, vt, i - 1);

	else
		r = rebalance3(s, info, vt, i - 1);

	return r;
}
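
/*
 * Find the index of key in a leaf node; fails with -ENODATA unless the key
 * is an exact match.
 */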
static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
{
	int i = lower_bound(n, key);

	if ((i < 0) ||
	    (i >= le32_to_cpu(n->header.nr_entries)) ||
	    (le64_to_cpu(n->keys[i]) != key))
		return -ENODATA;

	*index = i;

	return 0;
}

/*
 * Prepares for removal from one level of the hierarchy.  The caller must
 * call delete_at() to remove the entry at index.
 */
static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, dm_block_t root,
		      uint64_t key, unsigned *index)
{
	int i = *index, r;
	struct btree_node *n;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			break;

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s)) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
			       &location, sizeof(__le64));
		}

		n = dm_block_data(shadow_current(s));

		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
			return do_leaf(n, key, index);

		r = rebalance_children(s, info, vt, key);
		if (r)
			break;

		n = dm_block_data(shadow_current(s));
		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
			return do_leaf(n, key, index);

		i = lower_bound(n, key);

		/*
		 * We know the key is present, or else
		 * rebalance_children would have returned
		 * -ENODATA.
		 */
		root = value64(n, i);
	}

	return r;
}

int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, dm_block_t *new_root)
{
	unsigned level, last_level = info->levels - 1;
	int index = 0, r = 0;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_vt;

	init_le64_type(info->tm, &le64_vt);
	init_shadow_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = remove_raw(&spine, info,
			       (level == last_level ?
				&info->value_type : &le64_vt),
			       root, keys[level], (unsigned *) &index);
		if (r < 0)
			break;

		n = dm_block_data(shadow_current(&spine));
		if (level != last_level) {
			root = value64(n, index);
			continue;
		}

		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));

		if (info->value_type.dec)
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);

		delete_at(n, index);
	}

	if (!r)
		*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove);

/*----------------------------------------------------------------*/
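
/*
 * remove_nearest() mirrors remove_raw(), but the leaf lookup doesn't need
 * an exact match: on success *index is set to lower_bound(key), which may
 * be negative or refer to a neighbouring key; remove_one() below handles
 * both cases.
 */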
static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
			  struct dm_btree_value_type *vt, dm_block_t root,
			  uint64_t key, int *index)
{
	int i = *index, r;
	struct btree_node *n;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			break;

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s)) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
			       &location, sizeof(__le64));
		}

		n = dm_block_data(shadow_current(s));

		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
			*index = lower_bound(n, key);
			return 0;
		}

		r = rebalance_children(s, info, vt, key);
		if (r)
			break;

		n = dm_block_data(shadow_current(s));
		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
			*index = lower_bound(n, key);
			return 0;
		}

		i = lower_bound(n, key);

		/*
		 * We know the key is present, or else
		 * rebalance_children would have returned
		 * -ENODATA.
		 */
		root = value64(n, i);
	}

	return r;
}

static int remove_one(struct dm_btree_info *info, dm_block_t root,
		      uint64_t *keys, uint64_t end_key,
		      dm_block_t *new_root, unsigned *nr_removed)
{
	unsigned level, last_level = info->levels - 1;
	int index = 0, r = 0;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_vt;
	uint64_t k;

	init_le64_type(info->tm, &le64_vt);
	init_shadow_spine(&spine, info);
	for (level = 0; level < last_level; level++) {
		r = remove_raw(&spine, info, &le64_vt,
			       root, keys[level], (unsigned *) &index);
		if (r < 0)
			goto out;

		n = dm_block_data(shadow_current(&spine));
		root = value64(n, index);
	}

	r = remove_nearest(&spine, info, &info->value_type,
			   root, keys[last_level], &index);
	if (r < 0)
		goto out;

	n = dm_block_data(shadow_current(&spine));

	if (index < 0)
		index = 0;

	if (index >= le32_to_cpu(n->header.nr_entries)) {
		r = -ENODATA;
		goto out;
	}

	k = le64_to_cpu(n->keys[index]);
	if (k >= keys[last_level] && k < end_key) {
		if (info->value_type.dec)
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);

		delete_at(n, index);
		keys[last_level] = k + 1ull;

	} else
		r = -ENODATA;

out:
	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return r;
}

int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *first_key, uint64_t end_key,
			   dm_block_t *new_root, unsigned *nr_removed)
{
	int r;

	*nr_removed = 0;
	do {
		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
		if (!r)
			(*nr_removed)++;
	} while (!r);

	*new_root = root;
	return r == -ENODATA ? 0 : r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);