/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range);

	else if (vt->inc)
		vt->inc(vt->context, value_ptr(n, 0), nr_entries);
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	uint32_t max_entries = le32_to_cpu(node->header.max_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= max_entries ||
	    nr_entries >= max_entries) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/
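/*
 * On-disk layout of a btree node: a struct node_header followed by an
 * array of max_entries 64-bit keys and then an array of max_entries
 * values.  calc_max_entries() below works out how many key/value pairs
 * fit in one metadata block.  Purely illustrative arithmetic, assuming
 * a 4KiB block, 8-byte values and a 32-byte node header:
 * (4096 - 32) / (8 + 8) = 254, which rounds down to 252 entries.
 */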
/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	/*
	 * dm_btree_del() is called via an ioctl and, as such, should be
	 * considered an FS op.  We can't recurse back into the FS, so we
	 * allocate GFP_NOFS.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec)
				info->value_type.dec(info->value_type.context,
						     value_ptr(f->n, 0), f->nr_children);
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	if (v)
		memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*----------------------------------------------------------------*/

/*
 * Copies entries from one region of a btree node to another.  The regions
 * must not overlap.
 */
static void copy_entries(struct btree_node *dest, unsigned dest_offset,
			 struct btree_node *src, unsigned src_offset,
			 unsigned count)
{
	size_t value_size = le32_to_cpu(dest->header.value_size);
	memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
	memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Moves entries from one region of a btree node to another.  The regions
 * may overlap.
 */
static void move_entries(struct btree_node *dest, unsigned dest_offset,
			 struct btree_node *src, unsigned src_offset,
			 unsigned count)
{
	size_t value_size = le32_to_cpu(dest->header.value_size);
	memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
	memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Erases the first 'count' entries of a btree node, shifting following
 * entries down into their place.
 */
static void shift_down(struct btree_node *n, unsigned count)
{
	move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count);
}

/*
 * Moves entries in a btree node up 'count' places, making space for
 * new entries at the start of the node.
 */
static void shift_up(struct btree_node *n, unsigned count)
{
	move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries));
}

/*
 * Redistributes entries between two btree nodes to make them
 * have similar numbers of entries.
 */
static void redistribute2(struct btree_node *left, struct btree_node *right)
{
	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned total = nr_left + nr_right;
	unsigned target_left = total / 2;
	unsigned target_right = total - target_left;

	if (nr_left < target_left) {
		unsigned delta = target_left - nr_left;
		copy_entries(left, nr_left, right, 0, delta);
		shift_down(right, delta);
	} else if (nr_left > target_left) {
		unsigned delta = nr_left - target_left;
		if (nr_right)
			shift_up(right, delta);
		copy_entries(right, 0, left, target_left, delta);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	right->header.nr_entries = cpu_to_le32(target_right);
}

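/*
 * A quick example of redistribute2() above: with nr_left = 7 and
 * nr_right = 1 the targets become 4 and 4, so the right node is first
 * shifted up by 3 and then receives the last 3 entries of the left node.
 */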
/*
 * Redistribute entries between three nodes.  Assumes the central
 * node is empty.
 */
static void redistribute3(struct btree_node *left, struct btree_node *center,
			  struct btree_node *right)
{
	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned nr_center = le32_to_cpu(center->header.nr_entries);
	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned total, target_left, target_center, target_right;

	BUG_ON(nr_center);

	total = nr_left + nr_right;
	target_left = total / 3;
	target_center = (total - target_left) / 2;
	target_right = (total - target_left - target_center);

	if (nr_left < target_left) {
		unsigned left_short = target_left - nr_left;
		copy_entries(left, nr_left, right, 0, left_short);
		copy_entries(center, 0, right, left_short, target_center);
		shift_down(right, nr_right - target_right);

	} else if (nr_left < (target_left + target_center)) {
		unsigned left_to_center = nr_left - target_left;
		copy_entries(center, 0, left, target_left, left_to_center);
		copy_entries(center, left_to_center, right, 0, target_center - left_to_center);
		shift_down(right, nr_right - target_right);

	} else {
		unsigned right_short = target_right - nr_right;
		shift_up(right, right_short);
		copy_entries(right, 0, left, nr_left - right_short, right_short);
		copy_entries(center, 0, left, target_left, nr_left - target_left);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	center->header.nr_entries = cpu_to_le32(target_center);
	right->header.nr_entries = cpu_to_le32(target_right);
}

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *            |
 *            v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *            +--------+
 *            | Parent |
 *            +--------+
 *              |    |
 *              v    +------+
 *      +---------+         |
 *      | A* +++  |         v
 *      +---------+    +-------+
 *                     | B +++ |
 *                     +-------+
 *
 * Where A* is a shadow of A.
 */
static int split_one_into_two(struct shadow_spine *s, unsigned parent_index,
			      struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(0);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	redistribute2(ln, rn);

	/* patch up the parent */
	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	/* patch up the spine */
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

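/*
 * Note that, on success, split_one_into_two() leaves exactly one of the
 * two siblings locked and installed as s->nodes[1]: whichever node the
 * key being inserted belongs in.  The other sibling is unlocked at once.
 */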
/*
 * We often need to modify a sibling node.  This function shadows a particular
 * child of the given parent node, making sure to update the parent to point
 * to the new shadow.
 */
static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
			struct btree_node *parent, unsigned index,
			struct dm_block **result)
{
	int r, inc;
	dm_block_t root;
	struct btree_node *node;

	root = value64(parent, index);

	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
			       result, &inc);
	if (r)
		return r;

	node = dm_block_data(*result);

	if (inc)
		inc_children(info->tm, node, vt);

	*((__le64 *) value_ptr(parent, index)) =
		cpu_to_le64(dm_block_location(*result));

	return 0;
}

/*
 * Splits two nodes into three.  This is more work, but results in fuller
 * nodes, so saves metadata space.
 */
static int split_two_into_three(struct shadow_spine *s, unsigned parent_index,
				struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	unsigned middle_index;
	struct dm_block *left, *middle, *right, *parent;
	struct btree_node *ln, *rn, *mn, *pn;
	__le64 location;

	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	if (parent_index == 0) {
		middle_index = 1;
		left = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index + 1, &right);
		if (r)
			return r;
	} else {
		middle_index = parent_index;
		right = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index - 1, &left);
		if (r)
			return r;
	}

	r = new_block(s->info, &middle);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	mn = dm_block_data(middle);
	rn = dm_block_data(right);

	mn->header.nr_entries = cpu_to_le32(0);
	mn->header.flags = ln->header.flags;
	mn->header.max_entries = ln->header.max_entries;
	mn->header.value_size = ln->header.value_size;

	redistribute3(ln, mn, rn);

	/* patch up the parent */
	pn->keys[middle_index] = rn->keys[0];
	location = cpu_to_le64(dm_block_location(middle));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, middle_index,
		      le64_to_cpu(mn->keys[0]), &location);
	if (r) {
		if (shadow_current(s) != left)
			unlock_block(s->info, left);

		unlock_block(s->info, middle);

		if (shadow_current(s) != right)
			unlock_block(s->info, right);

		return r;
	}

	/* patch up the spine */
	if (key < le64_to_cpu(mn->keys[0])) {
		unlock_block(s->info, middle);
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, left);
		unlock_block(s->info, right);
		s->nodes[1] = middle;
	} else {
		unlock_block(s->info, left);
		unlock_block(s->info, middle);
		s->nodes[1] = right;
	}

	return 0;
}

/*----------------------------------------------------------------*/

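/*
 * btree_split_beneath() handles the case where the node at the top of the
 * spine is full: its contents are copied into two newly allocated children
 * and the node itself becomes an internal node pointing at them, so the
 * tree gains a level without the root block being replaced.
 */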
/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |     |
 *   +------+     +----+
 *   |                 |
 *   v                 v
 * +-------+       +-------+
 * | B +++ |       | C +++ |
 * +-------+       +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	pn = dm_block_data(new_parent);
	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;

	/* create & init the left block */
	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;
	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);

	/* create & init the right block */
	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	rn = dm_block_data(right);
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	unlock_block(s->info, left);
	unlock_block(s->info, right);
	return 0;
}

/*----------------------------------------------------------------*/

/*
 * Redistributes a node's entries with its left sibling.
 */
static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt,
			  unsigned parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index - 1, &sib);
	if (r)
		return r;

	left = dm_block_data(sib);
	right = dm_block_data(shadow_current(s));
	redistribute2(left, right);
	*key_ptr(parent, parent_index) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	} else {
		unlock_block(s->info, sib);
	}

	return 0;
}

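/*
 * Whenever entries move between siblings the key that separates them in
 * the parent has to be refreshed as well, hence the *key_ptr(parent, ...)
 * assignments in rebalance_left() above and rebalance_right() below.
 */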
/*
 * Redistributes a node's entries with its right sibling.
 */
static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt,
			   unsigned parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index + 1, &sib);
	if (r)
		return r;

	left = dm_block_data(shadow_current(s));
	right = dm_block_data(sib);
	redistribute2(left, right);
	*key_ptr(parent, parent_index + 1) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, sib);
	} else {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	}

	return 0;
}

/*
 * Returns the number of spare entries in a node.
 */
static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space)
{
	int r;
	unsigned nr_entries;
	struct dm_block *block;
	struct btree_node *node;

	r = bn_read_lock(info, b, &block);
	if (r)
		return r;

	node = dm_block_data(block);
	nr_entries = le32_to_cpu(node->header.nr_entries);
	*space = le32_to_cpu(node->header.max_entries) - nr_entries;

	unlock_block(info, block);
	return 0;
}

/*
 * Make space in a node, either by moving some entries to a sibling,
 * or creating a new sibling node.  SPACE_THRESHOLD defines the minimum
 * number of free entries that must be in the sibling to make the move
 * worthwhile.  If the siblings are shared (eg, part of a snapshot),
 * then they are not touched, since this would break sharing and so
 * consume more space than we save.
 */
#define SPACE_THRESHOLD 8
static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt,
			      unsigned parent_index, uint64_t key)
{
	int r;
	struct btree_node *parent = dm_block_data(shadow_parent(s));
	unsigned nr_parent = le32_to_cpu(parent->header.nr_entries);
	unsigned free_space;
	int left_shared = 0, right_shared = 0;

	/* Should we move entries to the left sibling? */
	if (parent_index > 0) {
		dm_block_t left_b = value64(parent, parent_index - 1);
		r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared);
		if (r)
			return r;

		if (!left_shared) {
			r = get_node_free_space(s->info, left_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_left(s, vt, parent_index, key);
		}
	}

	/* Should we move entries to the right sibling? */
	if (parent_index < (nr_parent - 1)) {
		dm_block_t right_b = value64(parent, parent_index + 1);
		r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared);
		if (r)
			return r;

		if (!right_shared) {
			r = get_node_free_space(s->info, right_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_right(s, vt, parent_index, key);
		}
	}

	/*
	 * We need to split the node; normally we split two nodes
	 * into three.  But when inserting a sequence that is either
	 * monotonically increasing or decreasing it's better to split
	 * a single node into two.
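	 * Note that splitting two nodes into three also requires shadowing
	 * a sibling, so it is never done when either sibling is shared.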
	 */
	if (left_shared || right_shared || (nr_parent <= 2) ||
	    (parent_index == 0) || (parent_index + 1 == nr_parent)) {
		return split_one_into_two(s, parent_index, vt, key);
	} else {
		return split_two_into_three(s, parent_index, vt, key);
	}
}

/*
 * Does the node contain a particular key?
 */
static bool contains_key(struct btree_node *node, uint64_t key)
{
	int i = lower_bound(node, key);

	if (i >= 0 && le64_to_cpu(node->keys[i]) == key)
		return true;

	return false;
}

/*
 * In general we preemptively make sure there's a free entry in every
 * node on the spine when doing an insert.  But we can avoid that with
 * leaf nodes if we know it's an overwrite.
 */
static bool has_space_for_insert(struct btree_node *node, uint64_t key)
{
	if (node->header.nr_entries == node->header.max_entries) {
		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
			/* we don't need space if it's an overwrite */
			return contains_key(node, key);
		}

		return false;
	}

	return true;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (!has_space_for_insert(node, key)) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = rebalance_or_split(s, vt, i, key);

			if (r < 0)
				return r;

			/* making space can cause the current node to change */
			node = dm_block_data(shadow_current(s));
		}

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root,
				      uint64_t key, int *index)
{
	int r, i = -1;
	struct btree_node *node;

	*index = 0;
	for (;;) {
		r = shadow_step(s, root, &s->info->value_type);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));
		i = lower_bound(node, key);

		BUG_ON(i < 0);
		BUG_ON(i >= le32_to_cpu(node->header.nr_entries));

		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
			if (key != le64_to_cpu(node->keys[i]))
				return -EINVAL;
			break;
		}

		root = value64(node, i);
	}

	*index = i;
	return 0;
}

int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
			     uint64_t key, int *index,
			     dm_block_t *new_root, struct dm_block **leaf)
{
	int r;
	struct shadow_spine spine;

	BUG_ON(info->levels > 1);
	init_shadow_spine(&spine, info);
	r = __btree_get_overwrite_leaf(&spine, root, key, index);
	if (!r) {
		*new_root = shadow_root(&spine);
		*leaf = shadow_current(&spine);

		/*
		 * Decrement the count so exit_shadow_spine() doesn't
		 * unlock the leaf.
		 */
		spine.count--;
	}
	exit_shadow_spine(&spine);

	return r;
}

static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned level, unsigned index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

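/*
 * dm_btree_walk() below visits every leaf entry in key order, calling
 * @fn once per entry; a non-zero return from @fn aborts the walk.
 * Purely illustrative, hypothetical callback:
 *
 *	static int count_entries(void *context, uint64_t *keys, void *leaf)
 *	{
 *		(*(uint64_t *) context)++;
 *		return 0;
 *	}
 */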
/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);

/*----------------------------------------------------------------*/

static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}

static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}

static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);
	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = dm_btree_cursor_next(c);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
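/*
 * Typical cursor usage (an illustrative sketch only; error handling is
 * abbreviated and the value buffer must be sized for info->value_type.size,
 * here assumed to be 8 bytes):
 *
 *	struct dm_btree_cursor c;
 *	uint64_t key;
 *	__le64 value_le;
 *	int r;
 *
 *	r = dm_btree_cursor_begin(info, root, false, &c);
 *	for (; !r; r = dm_btree_cursor_next(&c)) {
 *		r = dm_btree_cursor_get_value(&c, &key, &value_le);
 *		if (r)
 *			break;
 *		... use key and value_le ...
 *	}
 *	dm_btree_cursor_end(&c);
 */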