// SPDX-License-Identifier: GPL-2.0-only
/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree.h"
#include "dm-btree-internal.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*
 * Removing an entry from a btree
 * ==============================
 *
 * A very important constraint for our btree is that no node, except the
 * root, may have fewer than a certain number of entries.
 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 *
 * Ensuring this is complicated by the way we want to only ever hold the
 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 * fashion.
 *
 * Each node may have a left or right sibling.  When descending the spine,
 * if a node contains only MIN_ENTRIES then we try to increase this to at
 * least MIN_ENTRIES + 1.  We do this in the following ways:
 *
 * [A] No siblings => this can only happen if the node is the root, in which
 *     case we copy the child's contents over the root.
 *
 * [B] No left sibling
 *     ==> rebalance(node, right sibling)
 *
 * [C] No right sibling
 *     ==> rebalance(left sibling, node)
 *
 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 *     ==> delete node, adding its contents to left and right
 *
 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 *     ==> rebalance(left, node, right)
 *
 * After these operations it's possible that our original node no longer
 * contains the desired sub tree.  For this reason this rebalancing is
 * performed on the children of the current node.  This also avoids having
 * a special case for the root.
 *
 * Once this rebalancing has occurred we can then step into the child node
 * for internal nodes.  Or delete the entry for leaf nodes.
 */

/*
 * Some little utilities for moving node data around.
 */
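/*
 * Worked illustration for node_shift() below (example keys only):
 * given a node holding [10, 20, 30], node_shift(n, -1) drops the first
 * entry, leaving [20, 30]; node_shift(n, 1) opens a hole at index 0,
 * leaving [_, 10, 20, 30] ready to receive entries copied in from a
 * sibling.  node_shift() deliberately leaves header.nr_entries
 * untouched; callers adjust the count once the matching node_copy()
 * has been done.
 */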
static void node_shift(struct btree_node *n, int shift)
{
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
	uint32_t value_size = le32_to_cpu(n->header.value_size);

	if (shift < 0) {
		shift = -shift;
		BUG_ON(shift > nr_entries);
		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
		memmove(key_ptr(n, 0),
			key_ptr(n, shift),
			(nr_entries - shift) * sizeof(__le64));
		memmove(value_ptr(n, 0),
			value_ptr(n, shift),
			(nr_entries - shift) * value_size);
	} else {
		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
		memmove(key_ptr(n, shift),
			key_ptr(n, 0),
			nr_entries * sizeof(__le64));
		memmove(value_ptr(n, shift),
			value_ptr(n, 0),
			nr_entries * value_size);
	}
}

static int node_copy(struct btree_node *left, struct btree_node *right, int shift)
{
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t value_size = le32_to_cpu(left->header.value_size);

	if (value_size != le32_to_cpu(right->header.value_size)) {
		DMERR("mismatched value size");
		return -EILSEQ;
	}

	if (shift < 0) {
		shift = -shift;

		if (nr_left + shift > le32_to_cpu(left->header.max_entries)) {
			DMERR("bad shift");
			return -EINVAL;
		}

		memcpy(key_ptr(left, nr_left),
		       key_ptr(right, 0),
		       shift * sizeof(__le64));
		memcpy(value_ptr(left, nr_left),
		       value_ptr(right, 0),
		       shift * value_size);
	} else {
		if (shift > le32_to_cpu(right->header.max_entries)) {
			DMERR("bad shift");
			return -EINVAL;
		}

		memcpy(key_ptr(right, 0),
		       key_ptr(left, nr_left - shift),
		       shift * sizeof(__le64));
		memcpy(value_ptr(right, 0),
		       value_ptr(left, nr_left - shift),
		       shift * value_size);
	}
	return 0;
}
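/*
 * Worked example for node_copy() above, with made-up contents: given
 * left = [1, 2, 3] and a right node whose front has been opened with
 * node_shift(right, 2), node_copy(left, right, 2) copies the last two
 * entries of left into the front of right.  A negative shift copies
 * from the front of right onto the end of left instead; again the
 * entry counts are left for the caller to fix up, as shift() does.
 */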
/*
 * Delete a specific entry from a leaf node.
 */
static void delete_at(struct btree_node *n, unsigned int index)
{
	unsigned int nr_entries = le32_to_cpu(n->header.nr_entries);
	unsigned int nr_to_copy = nr_entries - (index + 1);
	uint32_t value_size = le32_to_cpu(n->header.value_size);

	BUG_ON(index >= nr_entries);

	if (nr_to_copy) {
		memmove(key_ptr(n, index),
			key_ptr(n, index + 1),
			nr_to_copy * sizeof(__le64));

		memmove(value_ptr(n, index),
			value_ptr(n, index + 1),
			nr_to_copy * value_size);
	}

	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}

static unsigned int merge_threshold(struct btree_node *n)
{
	return le32_to_cpu(n->header.max_entries) / 3;
}

struct child {
	unsigned int index;
	struct dm_block *block;
	struct btree_node *n;
};

static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
		      struct btree_node *parent,
		      unsigned int index, struct child *result)
{
	int r, inc;
	dm_block_t root;

	result->index = index;
	root = value64(parent, index);

	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
			       &result->block, &inc);
	if (r)
		return r;

	result->n = dm_block_data(result->block);

	if (inc)
		inc_children(info->tm, result->n, vt);

	*((__le64 *) value_ptr(parent, index)) =
		cpu_to_le64(dm_block_location(result->block));

	return 0;
}

static void exit_child(struct dm_btree_info *info, struct child *c)
{
	dm_tm_unlock(info->tm, c->block);
}

static int shift(struct btree_node *left, struct btree_node *right, int count)
{
	int r;
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);

	if (max_entries != r_max_entries) {
		DMERR("node max_entries mismatch");
		return -EILSEQ;
	}

	if (nr_left - count > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	if (nr_right + count > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	if (!count)
		return 0;

	if (count > 0) {
		node_shift(right, count);
		r = node_copy(left, right, count);
		if (r)
			return r;
	} else {
		r = node_copy(left, right, count);
		if (r)
			return r;
		node_shift(right, count);
	}

	left->header.nr_entries = cpu_to_le32(nr_left - count);
	right->header.nr_entries = cpu_to_le32(nr_right + count);

	return 0;
}
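/*
 * Numeric sketch for __rebalance2() below, assuming max_entries == 126
 * and so merge_threshold() == 42: siblings holding 30 + 40 = 70
 * entries fall below the merge threshold of 2 * (42 + 1) == 86 and are
 * merged into a single node of 70 entries; siblings holding 50 + 60
 * entries are instead rebalanced to 55 + 55, and the parent's key for
 * the right child is updated to right->keys[0].
 */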
static int __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
			struct child *l, struct child *r)
{
	int ret;
	struct btree_node *left = l->n;
	struct btree_node *right = r->n;
	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
	/*
	 * Ensure the number of entries in each child will be greater
	 * than or equal to (max_entries / 3 + 1), so no matter which
	 * child is used for removal, the number will still be not
	 * less than (max_entries / 3).
	 */
	unsigned int threshold = 2 * (merge_threshold(left) + 1);

	if (nr_left + nr_right < threshold) {
		/*
		 * Merge
		 */
		node_copy(left, right, -nr_right);
		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
		delete_at(parent, r->index);

		/*
		 * We need to decrement the right block, but not its
		 * children, since they're still referenced by left.
		 */
		dm_tm_dec(info->tm, dm_block_location(r->block));
	} else {
		/*
		 * Rebalance.
		 */
		unsigned int target_left = (nr_left + nr_right) / 2;

		ret = shift(left, right, nr_left - target_left);
		if (ret)
			return ret;
		*key_ptr(parent, r->index) = right->keys[0];
	}
	return 0;
}

static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, unsigned int left_index)
{
	int r;
	struct btree_node *parent;
	struct child left, right;

	parent = dm_block_data(shadow_current(s));

	r = init_child(info, vt, parent, left_index, &left);
	if (r)
		return r;

	r = init_child(info, vt, parent, left_index + 1, &right);
	if (r) {
		exit_child(info, &left);
		return r;
	}

	r = __rebalance2(info, parent, &left, &right);

	exit_child(info, &left);
	exit_child(info, &right);

	return r;
}

/*
 * We dump as many entries from center as possible into left, then the rest
 * in right, then rebalance2.  This wastes some cpu, but I want something
 * simple atm.
 */
static int delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
			      struct child *l, struct child *c, struct child *r,
			      struct btree_node *left, struct btree_node *center, struct btree_node *right,
			      uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	unsigned int shift = min(max_entries - nr_left, nr_center);

	if (nr_left + shift > max_entries) {
		DMERR("node shift out of bounds");
		return -EINVAL;
	}

	node_copy(left, center, -shift);
	left->header.nr_entries = cpu_to_le32(nr_left + shift);

	if (shift != nr_center) {
		shift = nr_center - shift;

		if ((nr_right + shift) > max_entries) {
			DMERR("node shift out of bounds");
			return -EINVAL;
		}

		node_shift(right, shift);
		node_copy(center, right, shift);
		right->header.nr_entries = cpu_to_le32(nr_right + shift);
	}
	*key_ptr(parent, r->index) = right->keys[0];

	delete_at(parent, c->index);
	r->index--;

	dm_tm_dec(info->tm, dm_block_location(c->block));
	return __rebalance2(info, parent, l, r);
}

/*
 * Redistributes entries among 3 sibling nodes.
 */
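/*
 * Numeric sketch: a total of 100 entries spread across the three
 * siblings is redistributed by redistribute3() as 34 + 33 + 33; the
 * remainder of the division by three always goes to the left node.
 */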
static int redistribute3(struct dm_btree_info *info, struct btree_node *parent,
			 struct child *l, struct child *c, struct child *r,
			 struct btree_node *left, struct btree_node *center, struct btree_node *right,
			 uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
	int s, ret;
	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
	unsigned int total = nr_left + nr_center + nr_right;
	unsigned int target_right = total / 3;
	unsigned int remainder = (target_right * 3) != total;
	unsigned int target_left = target_right + remainder;

	BUG_ON(target_left > max_entries);
	BUG_ON(target_right > max_entries);

	if (nr_left < nr_right) {
		s = nr_left - target_left;

		if (s < 0 && nr_center < -s) {
			/* not enough in central node */
			ret = shift(left, center, -nr_center);
			if (ret)
				return ret;

			s += nr_center;
			ret = shift(left, right, s);
			if (ret)
				return ret;

			nr_right += s;
		} else {
			ret = shift(left, center, s);
			if (ret)
				return ret;
		}

		ret = shift(center, right, target_right - nr_right);
		if (ret)
			return ret;
	} else {
		s = target_right - nr_right;
		if (s > 0 && nr_center < s) {
			/* not enough in central node */
			ret = shift(center, right, nr_center);
			if (ret)
				return ret;
			s -= nr_center;
			ret = shift(left, right, s);
			if (ret)
				return ret;
			nr_left -= s;
		} else {
			ret = shift(center, right, s);
			if (ret)
				return ret;
		}

		ret = shift(left, center, nr_left - target_left);
		if (ret)
			return ret;
	}

	*key_ptr(parent, c->index) = center->keys[0];
	*key_ptr(parent, r->index) = right->keys[0];
	return 0;
}

static int __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
			struct child *l, struct child *c, struct child *r)
{
	struct btree_node *left = l->n;
	struct btree_node *center = c->n;
	struct btree_node *right = r->n;

	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);

	unsigned int threshold = merge_threshold(left) * 4 + 1;

	if ((left->header.max_entries != center->header.max_entries) ||
	    (center->header.max_entries != right->header.max_entries)) {
		DMERR("bad btree metadata, max_entries differ");
		return -EILSEQ;
	}

	if ((nr_left + nr_center + nr_right) < threshold) {
		return delete_center_node(info, parent, l, c, r, left, center, right,
					  nr_left, nr_center, nr_right);
	}

	return redistribute3(info, parent, l, c, r, left, center, right,
			     nr_left, nr_center, nr_right);
}
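/*
 * As with rebalance2(), the children below are shadowed via
 * init_child() before being modified, and exit_child() must be called
 * on every child that was successfully initialised, whichever error
 * path is taken.
 */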
static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, unsigned int left_index)
{
	int r;
	struct btree_node *parent = dm_block_data(shadow_current(s));
	struct child left, center, right;

	/*
	 * FIXME: fill out an array?
	 */
	r = init_child(info, vt, parent, left_index, &left);
	if (r)
		return r;

	r = init_child(info, vt, parent, left_index + 1, &center);
	if (r) {
		exit_child(info, &left);
		return r;
	}

	r = init_child(info, vt, parent, left_index + 2, &right);
	if (r) {
		exit_child(info, &left);
		exit_child(info, &center);
		return r;
	}

	r = __rebalance3(info, parent, &left, &center, &right);

	exit_child(info, &left);
	exit_child(info, &center);
	exit_child(info, &right);

	return r;
}

static int rebalance_children(struct shadow_spine *s,
			      struct dm_btree_info *info,
			      struct dm_btree_value_type *vt, uint64_t key)
{
	int i, r, has_left_sibling, has_right_sibling;
	struct btree_node *n;

	n = dm_block_data(shadow_current(s));

	if (le32_to_cpu(n->header.nr_entries) == 1) {
		struct dm_block *child;
		dm_block_t b = value64(n, 0);

		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
		if (r)
			return r;

		memcpy(n, dm_block_data(child),
		       dm_bm_block_size(dm_tm_get_bm(info->tm)));

		dm_tm_dec(info->tm, dm_block_location(child));
		dm_tm_unlock(info->tm, child);
		return 0;
	}

	i = lower_bound(n, key);
	if (i < 0)
		return -ENODATA;

	has_left_sibling = i > 0;
	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);

	if (!has_left_sibling)
		r = rebalance2(s, info, vt, i);

	else if (!has_right_sibling)
		r = rebalance2(s, info, vt, i - 1);

	else
		r = rebalance3(s, info, vt, i - 1);

	return r;
}

static int do_leaf(struct btree_node *n, uint64_t key, unsigned int *index)
{
	int i = lower_bound(n, key);

	if ((i < 0) ||
	    (i >= le32_to_cpu(n->header.nr_entries)) ||
	    (le64_to_cpu(n->keys[i]) != key))
		return -ENODATA;

	*index = i;

	return 0;
}
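/*
 * Example for do_leaf() above (made-up keys): in a leaf holding
 * [40, 42, 57], looking up 42 succeeds with index 1, whereas looking
 * up 41 returns -ENODATA because lower_bound() lands on the entry for
 * 40, whose key does not match.
 */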
/*
 * Prepares for removal from one level of the hierarchy.  The caller must
 * call delete_at() to remove the entry at index.
 */
static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
		      struct dm_btree_value_type *vt, dm_block_t root,
		      uint64_t key, unsigned int *index)
{
	int i = *index, r;
	struct btree_node *n;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			break;

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s)) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
			       &location, sizeof(__le64));
		}

		n = dm_block_data(shadow_current(s));

		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
			return do_leaf(n, key, index);

		r = rebalance_children(s, info, vt, key);
		if (r)
			break;

		n = dm_block_data(shadow_current(s));
		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
			return do_leaf(n, key, index);

		i = lower_bound(n, key);

		/*
		 * We know the key is present, or else
		 * rebalance_children would have returned
		 * -ENODATA.
		 */
		root = value64(n, i);
	}

	return r;
}

int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, dm_block_t *new_root)
{
	unsigned int level, last_level = info->levels - 1;
	int index = 0, r = 0;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_vt;

	init_le64_type(info->tm, &le64_vt);
	init_shadow_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = remove_raw(&spine, info,
			       (level == last_level ?
				&info->value_type : &le64_vt),
			       root, keys[level], (unsigned int *) &index);
		if (r < 0)
			break;

		n = dm_block_data(shadow_current(&spine));
		if (level != last_level) {
			root = value64(n, index);
			continue;
		}

		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));

		if (info->value_type.dec)
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);

		delete_at(n, index);
	}

	if (!r)
		*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove);
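/*
 * Sketch of a caller of dm_btree_remove() (hypothetical variable
 * names, one key per btree level):
 *
 *	uint64_t keys[1] = { block_nr };
 *
 *	r = dm_btree_remove(&info, root, keys, &root);
 *
 * Because the spine is shadowed on the way down, the new root handed
 * back must replace the old one even though only a leaf entry was
 * deleted.
 */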
/*----------------------------------------------------------------*/

static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
			  struct dm_btree_value_type *vt, dm_block_t root,
			  uint64_t key, int *index)
{
	int i = *index, r;
	struct btree_node *n;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			break;

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s)) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
			       &location, sizeof(__le64));
		}

		n = dm_block_data(shadow_current(s));

		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
			*index = lower_bound(n, key);
			return 0;
		}

		r = rebalance_children(s, info, vt, key);
		if (r)
			break;

		n = dm_block_data(shadow_current(s));
		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
			*index = lower_bound(n, key);
			return 0;
		}

		i = lower_bound(n, key);

		/*
		 * We know the key is present, or else
		 * rebalance_children would have returned
		 * -ENODATA.
		 */
		root = value64(n, i);
	}

	return r;
}

static int remove_one(struct dm_btree_info *info, dm_block_t root,
		      uint64_t *keys, uint64_t end_key,
		      dm_block_t *new_root, unsigned int *nr_removed)
{
	unsigned int level, last_level = info->levels - 1;
	int index = 0, r = 0;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_vt;
	uint64_t k;

	init_le64_type(info->tm, &le64_vt);
	init_shadow_spine(&spine, info);
	for (level = 0; level < last_level; level++) {
		r = remove_raw(&spine, info, &le64_vt,
			       root, keys[level], (unsigned int *) &index);
		if (r < 0)
			goto out;

		n = dm_block_data(shadow_current(&spine));
		root = value64(n, index);
	}

	r = remove_nearest(&spine, info, &info->value_type,
			   root, keys[last_level], &index);
	if (r < 0)
		goto out;

	n = dm_block_data(shadow_current(&spine));

	if (index < 0)
		index = 0;

	if (index >= le32_to_cpu(n->header.nr_entries)) {
		r = -ENODATA;
		goto out;
	}

	k = le64_to_cpu(n->keys[index]);
	if (k >= keys[last_level] && k < end_key) {
		if (info->value_type.dec)
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);

		delete_at(n, index);
		keys[last_level] = k + 1ull;

	} else
		r = -ENODATA;

out:
	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return r;
}

int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *first_key, uint64_t end_key,
			   dm_block_t *new_root, unsigned int *nr_removed)
{
	int r;

	*nr_removed = 0;
	do {
		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
		if (!r)
			(*nr_removed)++;
	} while (!r);

	*new_root = root;
	return r == -ENODATA ? 0 : r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);