/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n). This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
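/*
 * A worked example of the calculation below (illustrative figures only,
 * assuming 4096-byte metadata blocks, an 8-byte value type and a 32-byte
 * struct node_header): elt_size = 8 + 8 = 16, so (4096 - 32) / 16 = 254
 * entries fit, and rounding down to a multiple of three gives
 * max_entries = 252.
 */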
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm.  Since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	s = kmalloc(sizeof(*s), GFP_NOIO);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *        +--------+
 *        | Parent |
 *        +--------+
 *            |
 *            v
 *      +----------+
 *      | A ++++++ |
 *      +----------+
 *
 *
 * After:
 *            +--------+
 *            | Parent |
 *            +--------+
 *              |    |
 *              v    +------+
 *      +---------+         |
 *      | A* +++  |         v
 *      +---------+    +-------+
 *                     | B +++ |
 *                     +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
			       uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *        +----------+
 *        | A ++++++ |
 *        +----------+
 *
 *
 * After:
 *      +------------+
 *      | A (shadow) |
 *      +------------+
 *          |     |
 *   +------+     +----+
 *   |                 |
 *   v                 v
 * +-------+       +-------+
 * | B +++ |       | C +++ |
 * +-------+       +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	/*
	 * Rejig the spine.  This is ugly, since it knows too
	 * much about the spine.
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node.  It's ugly, but I
		 * don't see a way to do this automatically as part of the
		 * spine op.
		 */
		if (shadow_has_parent(s) && i >= 0) {	/* FIXME: second clause unnecessary */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned level, unsigned index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single-level trees.
 */
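/*
 * Illustrative walk usage (a sketch, not part of the original file; the
 * callback name and context are hypothetical).  dm_btree_walk() below only
 * supports single-level trees (see the BUG_ON), and for each leaf entry the
 * callback receives a pointer to the key and a pointer to the little-endian
 * value as stored in the node:
 *
 *	static int count_entry(void *context, uint64_t *keys, void *leaf)
 *	{
 *		unsigned *count = context;
 *
 *		(*count)++;	// keys[0] is the entry's key,
 *				// leaf points at its little-endian value
 *		return 0;	// a non-zero return aborts the walk
 *	}
 *
 *	unsigned count = 0;
 *	int r = dm_btree_walk(info, root, count_entry, &count);
 */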
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);

/*----------------------------------------------------------------*/

static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}

static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}

static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);
	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
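/*
 * Illustrative cursor usage (a sketch, not part of the original file; the
 * surrounding caller and error handling are hypothetical, and the tree is
 * assumed to store __le64 values):
 *
 *	struct dm_btree_cursor c;
 *	uint64_t key;
 *	__le64 value_le;
 *	int r = dm_btree_cursor_begin(info, root, false, &c);
 *
 *	while (!r) {
 *		r = dm_btree_cursor_get_value(&c, &key, &value_le);
 *		if (r)
 *			break;
 *		// use key / le64_to_cpu(value_le) here
 *		r = dm_btree_cursor_next(&c);
 *	}
 *	dm_btree_cursor_end(&c);
 *
 * dm_btree_cursor_next() returns -ENODATA once the last leaf entry has been
 * visited, which is what terminates the loop above.
 */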