/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range);

	else if (vt->inc)
		vt->inc(vt->context, value_ptr(n, 0), nr_entries);
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}
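
/*
 * For illustration, how the search and insert helpers above cooperate to
 * keep the parallel keys[]/values[] arrays sorted (concrete numbers are
 * only an example):
 *
 *	keys = { 10, 20, 40 }, nr_entries = 3
 *
 *	lower_bound(n, 20) == 1     an exact match returns its index
 *	lower_bound(n, 30) == 1     index of the largest key below 30
 *	lower_bound(n, 5)  == -1    below every key in the node
 *	upper_bound(n, 30) == 2     index of the smallest key above 30
 *	upper_bound(n, 50) == 3     == nr_entries, i.e. past the end
 *
 * A new key 30 therefore goes in at index lower_bound() + 1 == 2, and
 * insert_at() shifts entries 2..nr_entries-1 up by one in both the key
 * and value arrays before writing the new pair.  The caller must have
 * blessed @value with __dm_bless_for_disk(); insert_at() unblesses it on
 * the error path.
 */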

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);
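
/*
 * For illustration, a rough sketch of how a client creates an empty tree.
 * The field names follow dm-btree.h; the value type shown is a
 * hypothetical 8-byte little-endian value with no reference counting:
 *
 *	struct dm_btree_info info = {
 *		.tm = tm,
 *		.levels = 1,
 *		.value_type = {
 *			.context = NULL,
 *			.size = sizeof(__le64),
 *			.inc = NULL,
 *			.dec = NULL,
 *			.equal = NULL,
 *		},
 *	};
 *	dm_block_t root;
 *
 *	r = dm_btree_empty(&info, &root);
 *
 * The returned root is the block number of a single empty leaf; it only
 * becomes persistent once the transaction manager commits.
 */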

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm, but since we have limited stack
 * space we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	/*
	 * dm_btree_del() is called via an ioctl, so it should be
	 * considered an FS op.  We can't recurse back into the FS, so we
	 * allocate GFP_NOFS.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec)
				info->value_type.dec(info->value_type.context,
						     value_ptr(f->n, 0), f->nr_children);
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	if (v)
		memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);
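
/*
 * For illustration, a rough sketch of a lookup in a two-level tree (the
 * key layout is only an example; dm-thin, for instance, indexes by device
 * and then by block):
 *
 *	uint64_t keys[2] = { dev_id, block };
 *	__le64 value_le;
 *
 *	r = dm_btree_lookup(&info, root, keys, &value_le);
 *	if (r == -ENODATA)
 *		... no mapping exists for this {dev_id, block} pair ...
 *
 * Each level consumes one entry of keys[]: the value found at level 0 is
 * interpreted as the root of the level 1 tree, and only the final level
 * copies out a caller-visible value.
 */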

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
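
/*
 * For illustration, what the pair of functions above returns for a single
 * level tree whose leaf keys are { 5, 9, 12 } (example values only):
 *
 *	key = 9  -> rkey = 9   (an exact match is returned as-is, since
 *	            bsearch() returns the matching index for upper_bound too)
 *	key = 10 -> rkey = 12  (first key above 10)
 *	key = 13 -> -ENODATA, unless a sibling leaf reached through the
 *	            internal-node fallback holds a larger key
 *
 * In other words the search finds the first entry whose key is >= @key,
 * with internal nodes retrying the next child when the first choice comes
 * back empty.
 */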

/*----------------------------------------------------------------*/

/*
 * Copies entries from one region of a btree node to another.  The regions
 * must not overlap.
 */
static void copy_entries(struct btree_node *dest, unsigned dest_offset,
			 struct btree_node *src, unsigned src_offset,
			 unsigned count)
{
	size_t value_size = le32_to_cpu(dest->header.value_size);
	memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
	memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Moves entries from one region of a btree node to another.  The regions
 * may overlap.
 */
static void move_entries(struct btree_node *dest, unsigned dest_offset,
			 struct btree_node *src, unsigned src_offset,
			 unsigned count)
{
	size_t value_size = le32_to_cpu(dest->header.value_size);
	memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
	memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
}

/*
 * Erases the first 'count' entries of a btree node, shifting following
 * entries down into their place.
 */
static void shift_down(struct btree_node *n, unsigned count)
{
	move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count);
}

/*
 * Moves entries in a btree node up 'count' places, making space for
 * new entries at the start of the node.
 */
static void shift_up(struct btree_node *n, unsigned count)
{
	move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries));
}

/*
 * Redistributes entries between two btree nodes to make them
 * have similar numbers of entries.
 */
static void redistribute2(struct btree_node *left, struct btree_node *right)
{
	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned total = nr_left + nr_right;
	unsigned target_left = total / 2;
	unsigned target_right = total - target_left;

	if (nr_left < target_left) {
		unsigned delta = target_left - nr_left;
		copy_entries(left, nr_left, right, 0, delta);
		shift_down(right, delta);
	} else if (nr_left > target_left) {
		unsigned delta = nr_left - target_left;
		if (nr_right)
			shift_up(right, delta);
		copy_entries(right, 0, left, target_left, delta);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	right->header.nr_entries = cpu_to_le32(target_right);
}
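
/*
 * For illustration (example numbers only): with nr_left = 2 and
 * nr_right = 6, total = 8 so target_left = target_right = 4.  The left
 * node is short by two, so two entries are copied from the front of the
 * right node onto the end of the left one and the right node is shifted
 * down by two.  With nr_left = 7 and nr_right = 1 the targets are again
 * 4/4: the right node is first shifted up by three to make room, then
 * entries 4..6 of the left node are copied into the gap.
 */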

/*
 * Redistribute entries between three nodes.  Assumes the central
 * node is empty.
 */
static void redistribute3(struct btree_node *left, struct btree_node *center,
			  struct btree_node *right)
{
	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
	unsigned nr_center = le32_to_cpu(center->header.nr_entries);
	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
	unsigned total, target_left, target_center, target_right;

	BUG_ON(nr_center);

	total = nr_left + nr_right;
	target_left = total / 3;
	target_center = (total - target_left) / 2;
	target_right = (total - target_left - target_center);

	if (nr_left < target_left) {
		unsigned left_short = target_left - nr_left;
		copy_entries(left, nr_left, right, 0, left_short);
		copy_entries(center, 0, right, left_short, target_center);
		shift_down(right, nr_right - target_right);

	} else if (nr_left < (target_left + target_center)) {
		unsigned left_to_center = nr_left - target_left;
		copy_entries(center, 0, left, target_left, left_to_center);
		copy_entries(center, left_to_center, right, 0, target_center - left_to_center);
		shift_down(right, nr_right - target_right);

	} else {
		unsigned right_short = target_right - nr_right;
		shift_up(right, right_short);
		copy_entries(right, 0, left, nr_left - right_short, right_short);
		copy_entries(center, 0, left, target_left, nr_left - target_left);
	}

	left->header.nr_entries = cpu_to_le32(target_left);
	center->header.nr_entries = cpu_to_le32(target_center);
	right->header.nr_entries = cpu_to_le32(target_right);
}
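
/*
 * For illustration (example numbers only): splitting two nodes holding
 * 4 and 14 entries gives total = 18, so the targets come out as 6/6/6.
 * The left node is two short, so right[0..1] are appended to it,
 * right[2..7] are copied into the empty centre node, and the right node
 * is shifted down by eight, leaving its last six entries in place.
 */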

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *            |
 *            v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *            +--------+
 *            | Parent |
 *            +--------+
 *              |    |
 *              v    +------+
 *      +---------+         |
 *      | A* +++  |         v
 *      +---------+    +-------+
 *                     | B +++ |
 *                     +-------+
 *
 * Where A* is a shadow of A.
 */
static int split_one_into_two(struct shadow_spine *s, unsigned parent_index,
			      struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(0);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	redistribute2(ln, rn);

	/* patch up the parent */
	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	/* patch up the spine */
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * We often need to modify a sibling node.  This function shadows a
 * particular child of the given parent node, making sure to update the
 * parent to point to the new shadow.
 */
static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
			struct btree_node *parent, unsigned index,
			struct dm_block **result)
{
	int r, inc;
	dm_block_t root;
	struct btree_node *node;

	root = value64(parent, index);

	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
			       result, &inc);
	if (r)
		return r;

	node = dm_block_data(*result);

	if (inc)
		inc_children(info->tm, node, vt);

	*((__le64 *) value_ptr(parent, index)) =
		cpu_to_le64(dm_block_location(*result));

	return 0;
}

/*
 * Splits two nodes into three.  This is more work, but results in fuller
 * nodes, so saves metadata space.
 */
static int split_two_into_three(struct shadow_spine *s, unsigned parent_index,
				struct dm_btree_value_type *vt, uint64_t key)
{
	int r;
	unsigned middle_index;
	struct dm_block *left, *middle, *right, *parent;
	struct btree_node *ln, *rn, *mn, *pn;
	__le64 location;

	parent = shadow_parent(s);
	pn = dm_block_data(parent);

	if (parent_index == 0) {
		middle_index = 1;
		left = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index + 1, &right);
		if (r)
			return r;
	} else {
		middle_index = parent_index;
		right = shadow_current(s);
		r = shadow_child(s->info, vt, pn, parent_index - 1, &left);
		if (r)
			return r;
	}

	r = new_block(s->info, &middle);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	mn = dm_block_data(middle);
	rn = dm_block_data(right);

	mn->header.nr_entries = cpu_to_le32(0);
	mn->header.flags = ln->header.flags;
	mn->header.max_entries = ln->header.max_entries;
	mn->header.value_size = ln->header.value_size;

	redistribute3(ln, mn, rn);

	/* patch up the parent */
	pn->keys[middle_index] = rn->keys[0];
	location = cpu_to_le64(dm_block_location(middle));
	__dm_bless_for_disk(&location);
	r = insert_at(sizeof(__le64), pn, middle_index,
		      le64_to_cpu(mn->keys[0]), &location);
	if (r) {
		if (shadow_current(s) != left)
			unlock_block(s->info, left);

		unlock_block(s->info, middle);

		if (shadow_current(s) != right)
			unlock_block(s->info, right);

		return r;
	}

	/* patch up the spine */
	if (key < le64_to_cpu(mn->keys[0])) {
		unlock_block(s->info, middle);
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, left);
		unlock_block(s->info, right);
		s->nodes[1] = middle;
	} else {
		unlock_block(s->info, left);
		unlock_block(s->info, middle);
		s->nodes[1] = right;
	}

	return 0;
}
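
/*
 * For illustration (assuming the usual 3n sizing from calc_max_entries()):
 * splitting one full node of 3n entries leaves two nodes that are only
 * half full, whereas redistributing two full neighbours across three
 * nodes leaves each about two-thirds full (2 * 3n entries over three
 * nodes is 2n each), so repeated inserts waste less metadata space.
 */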

/*----------------------------------------------------------------*/

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |      |
 *   +------+      +----+
 *   |                  |
 *   v                  v
 * +-------+        +-------+
 * | B +++ |        | C +++ |
 * +-------+        +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	pn = dm_block_data(new_parent);
	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;

	/* create & init the left block */
	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;
	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);

	/* create & init the right block */
	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	rn = dm_block_data(right);
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	unlock_block(s->info, left);
	unlock_block(s->info, right);
	return 0;
}
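
/*
 * A note on the approach above (inferred from the code rather than stated
 * in it): the node being split is the top of the spine, so there is no
 * parent available to take a second child pointer.  Instead the shadowed
 * node is reused as a new internal node and the two freshly allocated
 * blocks become its children, letting the tree grow in height without
 * allocating a new block above it.
 */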

/*----------------------------------------------------------------*/

/*
 * Redistributes a node's entries with its left sibling.
 */
static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt,
			  unsigned parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index - 1, &sib);
	if (r)
		return r;

	left = dm_block_data(sib);
	right = dm_block_data(shadow_current(s));
	redistribute2(left, right);
	*key_ptr(parent, parent_index) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	} else {
		unlock_block(s->info, sib);
	}

	return 0;
}

/*
 * Redistributes a node's entries with its right sibling.
 */
static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt,
			   unsigned parent_index, uint64_t key)
{
	int r;
	struct dm_block *sib;
	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));

	r = shadow_child(s->info, vt, parent, parent_index + 1, &sib);
	if (r)
		return r;

	left = dm_block_data(shadow_current(s));
	right = dm_block_data(sib);
	redistribute2(left, right);
	*key_ptr(parent, parent_index + 1) = right->keys[0];

	if (key < le64_to_cpu(right->keys[0])) {
		unlock_block(s->info, sib);
	} else {
		unlock_block(s->info, s->nodes[1]);
		s->nodes[1] = sib;
	}

	return 0;
}

/*
 * Returns the number of spare entries in a node.
 */
static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space)
{
	int r;
	unsigned nr_entries;
	struct dm_block *block;
	struct btree_node *node;

	r = bn_read_lock(info, b, &block);
	if (r)
		return r;

	node = dm_block_data(block);
	nr_entries = le32_to_cpu(node->header.nr_entries);
	*space = le32_to_cpu(node->header.max_entries) - nr_entries;

	unlock_block(info, block);
	return 0;
}

/*
 * Make space in a node, either by moving some entries to a sibling,
 * or creating a new sibling node.  SPACE_THRESHOLD defines the minimum
 * number of free entries that must be in the sibling to make the move
 * worthwhile.  If the siblings are shared (e.g., part of a snapshot),
 * then they are not touched, since this would break sharing and so
 * consume more space than we save.
 */
#define SPACE_THRESHOLD 8
static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt,
			      unsigned parent_index, uint64_t key)
{
	int r;
	struct btree_node *parent = dm_block_data(shadow_parent(s));
	unsigned nr_parent = le32_to_cpu(parent->header.nr_entries);
	unsigned free_space;
	int left_shared = 0, right_shared = 0;

	/* Should we move entries to the left sibling? */
	if (parent_index > 0) {
		dm_block_t left_b = value64(parent, parent_index - 1);
		r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared);
		if (r)
			return r;

		if (!left_shared) {
			r = get_node_free_space(s->info, left_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_left(s, vt, parent_index, key);
		}
	}

	/* Should we move entries to the right sibling? */
	if (parent_index < (nr_parent - 1)) {
		dm_block_t right_b = value64(parent, parent_index + 1);
		r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared);
		if (r)
			return r;

		if (!right_shared) {
			r = get_node_free_space(s->info, right_b, &free_space);
			if (r)
				return r;

			if (free_space >= SPACE_THRESHOLD)
				return rebalance_right(s, vt, parent_index, key);
		}
	}

	/*
	 * We need to split the node, normally we split two nodes
	 * into three.  But when inserting a sequence that is either
	 * monotonically increasing or decreasing it's better to split
	 * a single node into two.
	 */
	if (left_shared || right_shared || (nr_parent <= 2) ||
	    (parent_index == 0) || (parent_index + 1 == nr_parent)) {
		return split_one_into_two(s, parent_index, vt, key);
	} else {
		return split_two_into_three(s, parent_index, vt, key);
	}
}
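
/*
 * For illustration (example numbers only): suppose the full child being
 * inserted into sits at parent_index 3 of a parent holding 10 children.
 * If the child at index 2 is unshared and has at least SPACE_THRESHOLD
 * (8) free slots, entries are rebalanced into it; failing that the child
 * at index 4 is tried.  Only when neither rebalance fires do we split:
 * with both neighbours unshared and the child neither first nor last of
 * an adequately sized parent, the two-into-three variant is chosen,
 * otherwise the cheaper one-into-two split is used.
 */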

/*
 * Does the node contain a particular key?
 */
static bool contains_key(struct btree_node *node, uint64_t key)
{
	int i = lower_bound(node, key);

	if (i >= 0 && le64_to_cpu(node->keys[i]) == key)
		return true;

	return false;
}

/*
 * In general we preemptively make sure there's a free entry in every
 * node on the spine when doing an insert.  But we can avoid that with
 * leaf nodes if we know it's an overwrite.
 */
static bool has_space_for_insert(struct btree_node *node, uint64_t key)
{
	if (node->header.nr_entries == node->header.max_entries) {
		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
			/* we don't need space if it's an overwrite */
			return contains_key(node, key);
		}

		return false;
	}

	return true;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (!has_space_for_insert(node, key)) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = rebalance_or_split(s, vt, i, key);

			if (r < 0)
				return r;

			/* making space can cause the current node to change */
			node = dm_block_data(shadow_current(s));
		}

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root,
				      uint64_t key, int *index)
{
	int r, i = -1;
	struct btree_node *node;

	*index = 0;
	for (;;) {
		r = shadow_step(s, root, &s->info->value_type);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) {
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));
		i = lower_bound(node, key);

		BUG_ON(i < 0);
		BUG_ON(i >= le32_to_cpu(node->header.nr_entries));

		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
			if (key != le64_to_cpu(node->keys[i]))
				return -EINVAL;
			break;
		}

		root = value64(node, i);
	}

	*index = i;
	return 0;
}

int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
			     uint64_t key, int *index,
			     dm_block_t *new_root, struct dm_block **leaf)
{
	int r;
	struct shadow_spine spine;

	BUG_ON(info->levels > 1);
	init_shadow_spine(&spine, info);
	r = __btree_get_overwrite_leaf(&spine, root, key, index);
	if (!r) {
		*new_root = shadow_root(&spine);
		*leaf = shadow_current(&spine);

		/*
		 * Decrement the count so exit_shadow_spine() doesn't
		 * unlock the leaf.
		 */
		spine.count--;
	}
	exit_shadow_spine(&spine);

	return r;
}

static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned level, unsigned index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index), 1);
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
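
/*
 * For illustration, a rough sketch of the copy-on-write calling pattern
 * (the value shown is a hypothetical little-endian 64-bit payload):
 *
 *	uint64_t key = 42;
 *	__le64 value_le = cpu_to_le64(1234);
 *	dm_block_t new_root;
 *
 *	__dm_bless_for_disk(&value_le);
 *	r = dm_btree_insert(&info, root, &key, &value_le, &new_root);
 *	if (!r)
 *		root = new_root;	// use new_root from now on
 *
 * Because nodes are shadowed on the way down, an insert always yields a
 * (possibly unchanged) new root; the caller is responsible for recording
 * it and committing the transaction.
 */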

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE) {
			if (find_highest)
				block = value64(ro_node(s), i);
			else
				block = value64(ro_node(s), 0);
		}

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);
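
/*
 * For illustration, a rough sketch of a walk callback (count_entry and
 * its counter context are hypothetical, not part of this interface):
 *
 *	static int count_entry(void *context, uint64_t *keys, void *leaf)
 *	{
 *		unsigned *count = context;
 *
 *		(*count)++;		// keys[0] and leaf are valid here only
 *		return 0;		// a non-zero return aborts the walk
 *	}
 *
 *	unsigned count = 0;
 *	r = dm_btree_walk(&info, root, count_entry, &count);
 *
 * The leaf pointer refers into a read-locked block that is unlocked once
 * the walk moves on, so the callback must not stash it for later use.
 */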

/*----------------------------------------------------------------*/

static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}

static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}

static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);
	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = dm_btree_cursor_next(c);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
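
/*
 * For illustration, a rough sketch of iterating a whole single-level tree
 * with the cursor API above.  The 8-byte value type is only an example,
 * and leaf-value prefetching is enabled:
 *
 *	struct dm_btree_cursor c;
 *	uint64_t key;
 *	__le64 value_le;
 *
 *	for (r = dm_btree_cursor_begin(&info, root, true, &c);
 *	     !r;
 *	     r = dm_btree_cursor_next(&c)) {
 *		r = dm_btree_cursor_get_value(&c, &key, &value_le);
 *		if (r)
 *			break;
 *		// use key / value_le
 *	}
 *	dm_btree_cursor_end(&c);
 *
 *	// r == -ENODATA signals normal exhaustion rather than an error
 */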