/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree.h"
#include "dm-btree-internal.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>

/*
 * Removing an entry from a btree
 * ==============================
 *
 * A very important constraint for our btree is that no node, except the
 * root, may have fewer than a certain number of entries.
 * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
 *
 * Ensuring this is complicated by the way we want to only ever hold the
 * locks on 2 nodes concurrently, and only change nodes in a top to bottom
 * fashion.
 *
 * Each node may have a left or right sibling.  When descending the spine,
 * if a node contains only MIN_ENTRIES then we try to increase this to at
 * least MIN_ENTRIES + 1.  We do this in the following ways:
 *
 * [A] No siblings => this can only happen if the node is the root, in which
 *     case we copy the child's contents over the root.
 *
 * [B] No left sibling
 *     ==> rebalance(node, right sibling)
 *
 * [C] No right sibling
 *     ==> rebalance(left sibling, node)
 *
 * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
 *     ==> delete node, adding its contents to left and right
 *
 * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
 *     ==> rebalance(left, node, right)
 *
 * After these operations it's possible that our original node no longer
 * contains the desired sub tree.  For this reason this rebalancing is
 * performed on the children of the current node.  This also avoids having
 * a special case for the root.
 *
 * Once this rebalancing has occurred we can then step into the child node
 * for internal nodes, or delete the entry for leaf nodes.
 */

/*
 * Some little utilities for moving node data around.
 */
static void node_shift(struct node *n, int shift)
{
        uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
        uint32_t value_size = le32_to_cpu(n->header.value_size);

        if (shift < 0) {
                shift = -shift;
                BUG_ON(shift > nr_entries);
                BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
                memmove(key_ptr(n, 0),
                        key_ptr(n, shift),
                        (nr_entries - shift) * sizeof(__le64));
                memmove(value_ptr(n, 0),
                        value_ptr(n, shift),
                        (nr_entries - shift) * value_size);
        } else {
                BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
                memmove(key_ptr(n, shift),
                        key_ptr(n, 0),
                        nr_entries * sizeof(__le64));
                memmove(value_ptr(n, shift),
                        value_ptr(n, 0),
                        nr_entries * value_size);
        }
}

static void node_copy(struct node *left, struct node *right, int shift)
{
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t value_size = le32_to_cpu(left->header.value_size);

        BUG_ON(value_size != le32_to_cpu(right->header.value_size));

        if (shift < 0) {
                shift = -shift;
                BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
                memcpy(key_ptr(left, nr_left),
                       key_ptr(right, 0),
                       shift * sizeof(__le64));
                memcpy(value_ptr(left, nr_left),
                       value_ptr(right, 0),
                       shift * value_size);
        } else {
                BUG_ON(shift > le32_to_cpu(right->header.max_entries));
                memcpy(key_ptr(right, 0),
                       key_ptr(left, nr_left - shift),
                       shift * sizeof(__le64));
                memcpy(value_ptr(right, 0),
                       value_ptr(left, nr_left - shift),
                       shift * value_size);
        }
}

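/*
 * A worked example of the sign conventions above, with hypothetical key
 * values (not taken from any real tree): suppose left holds the keys
 * {1, 2, 3, 4} and right holds {10, 11}.
 *
 *   node_copy(left, right, -2) appends right's first two entries to
 *   left, giving left = {1, 2, 3, 4, 10, 11}; node_shift(right, -2)
 *   would then close the gap left behind in right.
 *
 *   node_shift(right, 2) followed by node_copy(left, right, 2) moves
 *   left's two highest entries to the front of right, giving
 *   left = {1, 2} and right = {3, 4, 10, 11}.
 *
 * Neither helper updates nr_entries; shift() below combines the two and
 * fixes up the counts on both nodes.
 */
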
/*
 * Delete a specific entry from a leaf node.
 */
static void delete_at(struct node *n, unsigned index)
{
        unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
        unsigned nr_to_copy = nr_entries - (index + 1);
        uint32_t value_size = le32_to_cpu(n->header.value_size);

        BUG_ON(index >= nr_entries);

        if (nr_to_copy) {
                memmove(key_ptr(n, index),
                        key_ptr(n, index + 1),
                        nr_to_copy * sizeof(__le64));

                memmove(value_ptr(n, index),
                        value_ptr(n, index + 1),
                        nr_to_copy * value_size);
        }

        n->header.nr_entries = cpu_to_le32(nr_entries - 1);
}

static unsigned merge_threshold(struct node *n)
{
        return le32_to_cpu(n->header.max_entries) / 3;
}

struct child {
        unsigned index;
        struct dm_block *block;
        struct node *n;
};

static struct dm_btree_value_type le64_type = {
        .context = NULL,
        .size = sizeof(__le64),
        .inc = NULL,
        .dec = NULL,
        .equal = NULL
};

static int init_child(struct dm_btree_info *info, struct node *parent,
                      unsigned index, struct child *result)
{
        int r, inc;
        dm_block_t root;

        result->index = index;
        root = value64(parent, index);

        r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
                               &result->block, &inc);
        if (r)
                return r;

        result->n = dm_block_data(result->block);

        if (inc)
                inc_children(info->tm, result->n, &le64_type);

        *((__le64 *) value_ptr(parent, index)) =
                cpu_to_le64(dm_block_location(result->block));

        return 0;
}

static int exit_child(struct dm_btree_info *info, struct child *c)
{
        return dm_tm_unlock(info->tm, c->block);
}

static void shift(struct node *left, struct node *right, int count)
{
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
        uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);

        BUG_ON(max_entries != r_max_entries);
        BUG_ON(nr_left - count > max_entries);
        BUG_ON(nr_right + count > max_entries);

        if (!count)
                return;

        if (count > 0) {
                node_shift(right, count);
                node_copy(left, right, count);
        } else {
                node_copy(left, right, count);
                node_shift(right, count);
        }

        left->header.nr_entries = cpu_to_le32(nr_left - count);
        right->header.nr_entries = cpu_to_le32(nr_right + count);
}

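/*
 * A worked example of the merge-or-rebalance decision in __rebalance2()
 * below, assuming a hypothetical max_entries of 126, so
 * merge_threshold() = 126 / 3 = 42 and threshold = 2 * 42 + 1 = 85:
 *
 *   nr_left = 30, nr_right = 40: 70 < 85, so right is merged into left,
 *   giving one node of 70 entries, comfortably under max_entries;
 *
 *   nr_left = 100, nr_right = 20: 120 >= 85, so shift() moves 40
 *   entries across, leaving both nodes with 60 entries, well above the
 *   merge threshold.
 */
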
static void __rebalance2(struct dm_btree_info *info, struct node *parent,
                         struct child *l, struct child *r)
{
        struct node *left = l->n;
        struct node *right = r->n;
        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
        unsigned threshold = 2 * merge_threshold(left) + 1;

        if (nr_left + nr_right < threshold) {
                /*
                 * Merge
                 */
                node_copy(left, right, -nr_right);
                left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
                delete_at(parent, r->index);

                /*
                 * We need to decrement the right block, but not its
                 * children, since they're still referenced by left.
                 */
                dm_tm_dec(info->tm, dm_block_location(r->block));
        } else {
                /*
                 * Rebalance.
                 */
                unsigned target_left = (nr_left + nr_right) / 2;

                shift(left, right, nr_left - target_left);
                *key_ptr(parent, r->index) = right->keys[0];
        }
}

static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
                      unsigned left_index)
{
        int r;
        struct node *parent;
        struct child left, right;

        parent = dm_block_data(shadow_current(s));

        r = init_child(info, parent, left_index, &left);
        if (r)
                return r;

        r = init_child(info, parent, left_index + 1, &right);
        if (r) {
                exit_child(info, &left);
                return r;
        }

        __rebalance2(info, parent, &left, &right);

        r = exit_child(info, &left);
        if (r) {
                exit_child(info, &right);
                return r;
        }

        return exit_child(info, &right);
}

/*
 * We dump as many entries from center as possible into left, then the rest
 * into right, then rebalance2.  This wastes some cpu, but I want something
 * simple atm.
 */
static void delete_center_node(struct dm_btree_info *info, struct node *parent,
                               struct child *l, struct child *c, struct child *r,
                               struct node *left, struct node *center, struct node *right,
                               uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
        unsigned shift = min(max_entries - nr_left, nr_center);

        BUG_ON(nr_left + shift > max_entries);
        node_copy(left, center, -shift);
        left->header.nr_entries = cpu_to_le32(nr_left + shift);

        if (shift != nr_center) {
                shift = nr_center - shift;
                BUG_ON((nr_right + shift) > max_entries);
                node_shift(right, shift);
                node_copy(center, right, shift);
                right->header.nr_entries = cpu_to_le32(nr_right + shift);
        }
        *key_ptr(parent, r->index) = right->keys[0];

        delete_at(parent, c->index);
        r->index--;

        dm_tm_dec(info->tm, dm_block_location(c->block));
        __rebalance2(info, parent, l, r);
}

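/*
 * A worked example of redistribute3() below, with hypothetical entry
 * counts.  With nr_left = 10, nr_center = 50, nr_right = 30 we get
 * target = 90 / 3 = 30.  Since nr_left < nr_right, s = 10 - 30 = -20
 * and the center can cover the deficit, so shift(left, center, -20)
 * moves 20 entries from center to left (left = 30, center = 30), and
 * shift(center, right, 30 - 30) is a no-op, leaving 30/30/30.
 *
 * When the center cannot cover the deficit, all of its entries are
 * moved out first and the remainder comes from the far sibling.  E.g.
 * nr_left = 2, nr_center = 3, nr_right = 25 gives target = 10:
 * shift(left, center, -3) empties center into left (left = 5), the
 * remaining deficit of 5 is pulled from right (left = 10, right = 20),
 * and shift(center, right, 10 - 20) moves 10 entries back into the now
 * empty center, leaving 10/10/10.
 */
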
/*
 * Redistributes entries among 3 sibling nodes.
 */
static void redistribute3(struct dm_btree_info *info, struct node *parent,
                          struct child *l, struct child *c, struct child *r,
                          struct node *left, struct node *center, struct node *right,
                          uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
{
        int s;
        uint32_t max_entries = le32_to_cpu(left->header.max_entries);
        unsigned target = (nr_left + nr_center + nr_right) / 3;

        BUG_ON(target > max_entries);

        if (nr_left < nr_right) {
                s = nr_left - target;

                if (s < 0 && nr_center < -s) {
                        /*
                         * Not enough in the central node: empty it into
                         * left, then top left up from right.
                         */
                        shift(left, center, -nr_center);
                        s += nr_center;
                        shift(left, right, s);
                        nr_right += s;
                } else
                        shift(left, center, s);

                shift(center, right, target - nr_right);
        } else {
                s = target - nr_right;
                if (s > 0 && nr_center < s) {
                        /*
                         * Not enough in the central node: empty it into
                         * right, then top right up from left.
                         */
                        shift(center, right, nr_center);
                        s -= nr_center;
                        shift(left, right, s);
                        nr_left -= s;
                } else
                        shift(center, right, s);

                shift(left, center, nr_left - target);
        }

        *key_ptr(parent, c->index) = center->keys[0];
        *key_ptr(parent, r->index) = right->keys[0];
}

static void __rebalance3(struct dm_btree_info *info, struct node *parent,
                         struct child *l, struct child *c, struct child *r)
{
        struct node *left = l->n;
        struct node *center = c->n;
        struct node *right = r->n;

        uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
        uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
        uint32_t nr_right = le32_to_cpu(right->header.nr_entries);

        unsigned threshold = merge_threshold(left) * 4 + 1;

        BUG_ON(left->header.max_entries != center->header.max_entries);
        BUG_ON(center->header.max_entries != right->header.max_entries);

        if ((nr_left + nr_center + nr_right) < threshold)
                delete_center_node(info, parent, l, c, r, left, center, right,
                                   nr_left, nr_center, nr_right);
        else
                redistribute3(info, parent, l, c, r, left, center, right,
                              nr_left, nr_center, nr_right);
}

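/*
 * A worked example of the merge-or-redistribute decision in
 * __rebalance3() above, again assuming a hypothetical max_entries of
 * 126 (merge_threshold() = 42, so threshold = 4 * 42 + 1 = 169):
 *
 *   a total of 168 entries or fewer fits in two nodes (at most 84
 *   entries each after the final rebalance), so delete_center_node()
 *   frees the center node;
 *
 *   a total of 169 or more leaves each node with at least 169 / 3 = 56
 *   entries after redistribute3(), safely above the merge threshold.
 */
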
static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
                      unsigned left_index)
{
        int r;
        struct node *parent = dm_block_data(shadow_current(s));
        struct child left, center, right;

        /*
         * FIXME: fill out an array?
         */
        r = init_child(info, parent, left_index, &left);
        if (r)
                return r;

        r = init_child(info, parent, left_index + 1, &center);
        if (r) {
                exit_child(info, &left);
                return r;
        }

        r = init_child(info, parent, left_index + 2, &right);
        if (r) {
                exit_child(info, &left);
                exit_child(info, &center);
                return r;
        }

        __rebalance3(info, parent, &left, &center, &right);

        r = exit_child(info, &left);
        if (r) {
                exit_child(info, &center);
                exit_child(info, &right);
                return r;
        }

        r = exit_child(info, &center);
        if (r) {
                exit_child(info, &right);
                return r;
        }

        r = exit_child(info, &right);
        if (r)
                return r;

        return 0;
}

static int get_nr_entries(struct dm_transaction_manager *tm,
                          dm_block_t b, uint32_t *result)
{
        int r;
        struct dm_block *block;
        struct node *n;

        r = dm_tm_read_lock(tm, b, &btree_node_validator, &block);
        if (r)
                return r;

        n = dm_block_data(block);
        *result = le32_to_cpu(n->header.nr_entries);

        return dm_tm_unlock(tm, block);
}

static int rebalance_children(struct shadow_spine *s,
                              struct dm_btree_info *info, uint64_t key)
{
        int i, r, has_left_sibling, has_right_sibling;
        uint32_t child_entries;
        struct node *n;

        n = dm_block_data(shadow_current(s));

        if (le32_to_cpu(n->header.nr_entries) == 1) {
                struct dm_block *child;
                dm_block_t b = value64(n, 0);

                r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
                if (r)
                        return r;

                memcpy(n, dm_block_data(child),
                       dm_bm_block_size(dm_tm_get_bm(info->tm)));
                r = dm_tm_unlock(info->tm, child);
                if (r)
                        return r;

                /*
                 * 'child' must not be dereferenced after the unlock
                 * above; b still holds its location.
                 */
                dm_tm_dec(info->tm, b);
                return 0;
        }

        i = lower_bound(n, key);
        if (i < 0)
                return -ENODATA;

        r = get_nr_entries(info->tm, value64(n, i), &child_entries);
        if (r)
                return r;

        has_left_sibling = i > 0;
        has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);

        if (!has_left_sibling)
                r = rebalance2(s, info, i);

        else if (!has_right_sibling)
                r = rebalance2(s, info, i - 1);

        else
                r = rebalance3(s, info, i - 1);

        return r;
}

static int do_leaf(struct node *n, uint64_t key, unsigned *index)
{
        int i = lower_bound(n, key);

        if ((i < 0) ||
            (i >= le32_to_cpu(n->header.nr_entries)) ||
            (le64_to_cpu(n->keys[i]) != key))
                return -ENODATA;

        *index = i;

        return 0;
}

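/*
 * lower_bound() (defined elsewhere in the btree code) is assumed to
 * return the index of the largest key <= the search key, or a negative
 * value if every key is greater.  For a hypothetical leaf with keys
 * {5, 8, 12}: searching for 8 yields index 1 and do_leaf() above
 * succeeds; searching for 9 also yields index 1, but keys[1] != 9, so
 * do_leaf() returns -ENODATA; searching for 3 yields a negative index.
 */
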
/*
 * Prepares for removal from one level of the hierarchy.  The caller must
 * call delete_at() to remove the entry at index.
 */
static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
                      struct dm_btree_value_type *vt, dm_block_t root,
                      uint64_t key, unsigned *index)
{
        int i = *index, r;
        struct node *n;

        for (;;) {
                r = shadow_step(s, root, vt);
                if (r < 0)
                        break;

                /*
                 * We have to patch up the parent node, ugly, but I don't
                 * see a way to do this automatically as part of the spine
                 * op.
                 */
                if (shadow_has_parent(s)) {
                        __le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
                        memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
                               &location, sizeof(__le64));
                }

                n = dm_block_data(shadow_current(s));

                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
                        return do_leaf(n, key, index);

                r = rebalance_children(s, info, key);
                if (r)
                        break;

                n = dm_block_data(shadow_current(s));
                if (le32_to_cpu(n->header.flags) & LEAF_NODE)
                        return do_leaf(n, key, index);

                i = lower_bound(n, key);

                /*
                 * We know the key is present, or else
                 * rebalance_children would have returned
                 * -ENODATA.
                 */
                root = value64(n, i);
        }

        return r;
}

int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
                    uint64_t *keys, dm_block_t *new_root)
{
        unsigned level, last_level = info->levels - 1;
        int index = 0, r = 0;
        struct shadow_spine spine;
        struct node *n;

        init_shadow_spine(&spine, info);
        for (level = 0; level < info->levels; level++) {
                r = remove_raw(&spine, info,
                               (level == last_level ?
                                &info->value_type : &le64_type),
                               root, keys[level], (unsigned *) &index);
                if (r < 0)
                        break;

                n = dm_block_data(shadow_current(&spine));
                if (level != last_level) {
                        root = value64(n, index);
                        continue;
                }

                BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));

                if (info->value_type.dec)
                        info->value_type.dec(info->value_type.context,
                                             value_ptr(n, index));

                delete_at(n, index);
        }

        *new_root = shadow_root(&spine);
        exit_shadow_spine(&spine);

        return r;
}
EXPORT_SYMBOL_GPL(dm_btree_remove);
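
/*
 * Example usage (not compiled): removing one mapping from a btree.  This
 * is a minimal sketch; example_remove() is hypothetical, and it assumes
 * the caller has already built a tree (e.g. with dm_btree_empty() and
 * dm_btree_insert()) and will commit the returned root through its
 * transaction manager.
 */
#if 0
static int example_remove(struct dm_btree_info *info, dm_block_t root,
                          uint64_t key, dm_block_t *new_root)
{
        /* One key per btree level; this assumes a single-level tree. */
        uint64_t keys[1] = { key };

        return dm_btree_remove(info, root, keys, new_root);
}
#endif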