/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/

/* makes the assumption that no two keys are the same. */
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

static int upper_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 1);
}

void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		     uint64_t key, void *value)
	__dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert remove loops than (2n + 1).
 */
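/*
 * Worked example (sketch, assuming a 32-byte struct node_header and a
 * 4096-byte metadata block): with 8-byte values each entry needs
 * 8 (key) + 8 (value) = 16 bytes, so total = (4096 - 32) / 16 = 254
 * and we return 3 * (254 / 3) = 252 entries per node.
 */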
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack space
 * we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}
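/*
 * Push a frame for block @b onto the deletion stack.  If the block is
 * shared (reference count > 1) we just drop its reference and don't
 * descend into it; otherwise we read lock it and, for internal nodes,
 * prefetch its children.
 */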
static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

static void unlock_all_frames(struct del_stack *s)
{
	struct frame *f;

	while (unprocessed_frames(s)) {
		f = s->spine + s->top--;
		dm_tm_unlock(s->tm, f->b);
	}
}

int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	/*
	 * dm_btree_del() is called via an ioctl, so it should be
	 * considered an FS op.  We can't recurse back into the FS, so we
	 * allocate GFP_NOFS.
	 */
	s = kmalloc(sizeof(*s), GFP_NOFS);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			pop_frame(s);
		}
	}
out:
	if (r) {
		/* cleanup all frames of del_stack */
		unlock_all_frames(s);
	}
	kfree(s);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/
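/*
 * Walk from @block down to a leaf, using @search_fn to choose an entry
 * at each internal node.  On success the leaf's key is returned in
 * @result_key and its value is copied into @v.
 */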
static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
				       uint64_t key, uint64_t *rkey, void *value_le)
{
	int r, i;
	uint32_t flags, nr_entries;
	struct dm_block *node;
	struct btree_node *n;

	r = bn_read_lock(info, root, &node);
	if (r)
		return r;

	n = dm_block_data(node);
	flags = le32_to_cpu(n->header.flags);
	nr_entries = le32_to_cpu(n->header.nr_entries);

	if (flags & INTERNAL_NODE) {
		i = lower_bound(n, key);
		if (i < 0) {
			/*
			 * avoid early -ENODATA return when all entries are
			 * higher than the search @key.
			 */
			i = 0;
		}
		if (i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		if (r == -ENODATA && i < (nr_entries - 1)) {
			i++;
			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
		}

	} else {
		i = upper_bound(n, key);
		if (i < 0 || i >= nr_entries) {
			r = -ENODATA;
			goto out;
		}

		*rkey = le64_to_cpu(n->keys[i]);
		memcpy(value_le, value_ptr(n, i), info->value_type.size);
	}
out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
			 uint64_t *keys, uint64_t *rkey, void *value_le)
{
	unsigned level;
	int r = -ENODATA;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels - 1u; level++) {
		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, rkey,
				     &internal_value_le, sizeof(uint64_t));
		if (r)
			goto out;

		if (*rkey != keys[level]) {
			r = -ENODATA;
			goto out;
		}

		root = le64_to_cpu(internal_value_le);
	}

	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
out:
	exit_ro_spine(&spine);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup_next);

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *	  +--------+
 *	  | Parent |
 *	  +--------+
 *	      |
 *	      v
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 *
 * After:
 *	  +--------+
 *	  | Parent |
 *	  +--------+
 *	    |    |
 *	    v    +------+
 *	+---------+     |
 *	| A* +++  |     v
 *	+---------+  +-------+
 *	             | B +++ |
 *	             +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
			       uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r) {
		unlock_block(s->info, right);
		return r;
	}

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 *
 * After:
 *	+------------+
 *	| A (shadow) |
 *	+------------+
 *	    |     |
 *	+---+     +---+
 *	|             |
 *	v             v
 *   +-------+    +-------+
 *   | B +++ |    | C +++ |
 *   +-------+    +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to l and r now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	/*
	 * rejig the spine.  This is ugly, since it knows too
	 * much about the spine
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}

static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node, ugly, but I don't
		 * see a way to do this automatically as part of the spine
		 * op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unness. */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

static bool need_insert(struct btree_node *node, uint64_t *keys,
			unsigned level, unsigned index)
{
	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
		(le64_to_cpu(node->keys[index]) != keys[level]));
}

static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
	__dm_written_to_disk(value)
{
	int r;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));

		if (need_insert(n, keys, level, index)) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));

	if (need_insert(n, keys, level, index)) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);
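/*
 * Example call (sketch; assumes a single-level tree whose values are
 * little-endian 64-bit block numbers):
 *
 *	uint64_t key = ...;
 *	__le64 value_le = cpu_to_le64(data_block);
 *	dm_block_t new_root;
 *	int r;
 *
 *	__dm_bless_for_disk(&value_le);
 *	r = dm_btree_insert(info, root, &key, &value_le, &new_root);
 */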
int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
	__dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
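/*
 * Example (sketch): for a tree with info->levels == 2 this fills in one
 * key per level and returns the number of levels filled, or a negative
 * errno on error.
 *
 *	uint64_t keys[2];
 *	int r = dm_btree_find_highest_key(info, root, keys);
 */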
/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited stack
 * space.  Also this only works for single level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);

/*----------------------------------------------------------------*/

static void prefetch_values(struct dm_btree_cursor *c)
{
	unsigned i, nr;
	__le64 value_le;
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);
	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);

	BUG_ON(c->info->value_type.size != sizeof(value_le));

	nr = le32_to_cpu(bn->header.nr_entries);
	for (i = 0; i < nr; i++) {
		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
		dm_bm_prefetch(bm, le64_to_cpu(value_le));
	}
}

static bool leaf_node(struct dm_btree_cursor *c)
{
	struct cursor_node *n = c->nodes + c->depth - 1;
	struct btree_node *bn = dm_block_data(n->b);

	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
}

static int push_node(struct dm_btree_cursor *c, dm_block_t b)
{
	int r;
	struct cursor_node *n = c->nodes + c->depth;

	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
		DMERR("couldn't push cursor node, stack depth too high");
		return -EINVAL;
	}

	r = bn_read_lock(c->info, b, &n->b);
	if (r)
		return r;

	n->index = 0;
	c->depth++;

	if (c->prefetch_leaves || !leaf_node(c))
		prefetch_values(c);

	return 0;
}

static void pop_node(struct dm_btree_cursor *c)
{
	c->depth--;
	unlock_block(c->info, c->nodes[c->depth].b);
}

static int inc_or_backtrack(struct dm_btree_cursor *c)
{
	struct cursor_node *n;
	struct btree_node *bn;

	for (;;) {
		if (!c->depth)
			return -ENODATA;

		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		n->index++;
		if (n->index < le32_to_cpu(bn->header.nr_entries))
			break;

		pop_node(c);
	}

	return 0;
}
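/*
 * Descend from the node at the top of the cursor stack, following the
 * current index of each internal node, until a leaf is on top.
 */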
static int find_leaf(struct dm_btree_cursor *c)
{
	int r = 0;
	struct cursor_node *n;
	struct btree_node *bn;
	__le64 value_le;

	for (;;) {
		n = c->nodes + c->depth - 1;
		bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
			break;

		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
		r = push_node(c, le64_to_cpu(value_le));
		if (r) {
			DMERR("push_node failed");
			break;
		}
	}

	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
		return -ENODATA;

	return r;
}

int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
			  bool prefetch_leaves, struct dm_btree_cursor *c)
{
	int r;

	c->info = info;
	c->root = root;
	c->depth = 0;
	c->prefetch_leaves = prefetch_leaves;

	r = push_node(c, root);
	if (r)
		return r;

	return find_leaf(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);

void dm_btree_cursor_end(struct dm_btree_cursor *c)
{
	while (c->depth)
		pop_node(c);
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_end);

int dm_btree_cursor_next(struct dm_btree_cursor *c)
{
	int r = inc_or_backtrack(c);
	if (!r) {
		r = find_leaf(c);
		if (r)
			DMERR("find_leaf failed");
	}

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_next);

int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
{
	int r = 0;

	while (count-- && !r)
		r = dm_btree_cursor_next(c);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);

int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
{
	if (c->depth) {
		struct cursor_node *n = c->nodes + c->depth - 1;
		struct btree_node *bn = dm_block_data(n->b);

		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
			return -EINVAL;

		*key = le64_to_cpu(*key_ptr(bn, n->index));
		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
		return 0;

	} else
		return -ENODATA;
}
EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
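/*
 * Typical cursor usage (sketch, assuming 8-byte values):
 *
 *	struct dm_btree_cursor c;
 *	uint64_t key;
 *	__le64 value_le;
 *	int r;
 *
 *	r = dm_btree_cursor_begin(info, root, false, &c);
 *	while (!r) {
 *		r = dm_btree_cursor_get_value(&c, &key, &value_le);
 *		if (r)
 *			break;
 *		... consume key / value_le ...
 *		r = dm_btree_cursor_next(&c);
 *	}
 *	dm_btree_cursor_end(&c);
 */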