1 /*
2  * Copyright (C) 2011 Red Hat, Inc.
3  *
4  * This file is released under the GPL.
5  */
6 
7 #include "dm-btree-internal.h"
8 #include "dm-space-map.h"
9 #include "dm-transaction-manager.h"
10 
11 #include <linux/export.h>
12 #include <linux/device-mapper.h>
13 
14 #define DM_MSG_PREFIX "btree"
15 
16 /*----------------------------------------------------------------
17  * Array manipulation
18  *--------------------------------------------------------------*/
19 static void memcpy_disk(void *dest, const void *src, size_t len)
20 	__dm_written_to_disk(src)
21 {
22 	memcpy(dest, src, len);
23 	__dm_unbless_for_disk(src);
24 }
25 
26 static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
27 			 unsigned index, void *elt)
28 	__dm_written_to_disk(elt)
29 {
30 	if (index < nr_elts)
31 		memmove(base + (elt_size * (index + 1)),
32 			base + (elt_size * index),
33 			(nr_elts - index) * elt_size);
34 
35 	memcpy_disk(base + (elt_size * index), elt, elt_size);
36 }
37 
38 /*----------------------------------------------------------------*/
39 
40 /* Assumes that no two keys are the same. */
41 static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
42 {
43 	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);
44 
45 	while (hi - lo > 1) {
46 		int mid = lo + ((hi - lo) / 2);
47 		uint64_t mid_key = le64_to_cpu(n->keys[mid]);
48 
49 		if (mid_key == key)
50 			return mid;
51 
52 		if (mid_key < key)
53 			lo = mid;
54 		else
55 			hi = mid;
56 	}
57 
58 	return want_hi ? hi : lo;
59 }
60 
61 int lower_bound(struct btree_node *n, uint64_t key)
62 {
63 	return bsearch(n, key, 0);
64 }
65 
66 static int upper_bound(struct btree_node *n, uint64_t key)
67 {
68 	return bsearch(n, key, 1);
69 }
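
/*
 * Worked example: for a node with keys { 10, 20, 30 } (nr_entries == 3),
 * lower_bound() returns the index of the largest key <= the search key,
 * or -1 if every key is greater:
 *
 *	lower_bound(n, 5)  == -1
 *	lower_bound(n, 15) == 0
 *	lower_bound(n, 20) == 1
 *
 * upper_bound() returns the index of the smallest key >= the search key,
 * or nr_entries if every key is smaller:
 *
 *	upper_bound(n, 15) == 1
 *	upper_bound(n, 20) == 1
 *	upper_bound(n, 35) == 3
 */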
70 
71 void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
72 		  struct dm_btree_value_type *vt)
73 {
74 	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
75 
76 	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
77 		dm_tm_with_runs(tm, value_ptr(n, 0), nr_entries, dm_tm_inc_range);
78 
79 	else if (vt->inc)
80 		vt->inc(vt->context, value_ptr(n, 0), nr_entries);
81 }
82 
83 static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
84 		     uint64_t key, void *value)
85 	__dm_written_to_disk(value)
86 {
87 	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
88 	uint32_t max_entries = le32_to_cpu(node->header.max_entries);
89 	__le64 key_le = cpu_to_le64(key);
90 
91 	if (index > nr_entries ||
92 	    index >= max_entries ||
93 	    nr_entries >= max_entries) {
94 		DMERR("too many entries in btree node for insert");
95 		__dm_unbless_for_disk(value);
96 		return -ENOMEM;
97 	}
98 
99 	__dm_bless_for_disk(&key_le);
100 
101 	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
102 	array_insert(value_base(node), value_size, nr_entries, index, value);
103 	node->header.nr_entries = cpu_to_le32(nr_entries + 1);
104 
105 	return 0;
106 }
107 
108 /*----------------------------------------------------------------*/
109 
110 /*
111  * We want 3n entries (for some n).  This works more nicely for repeated
112  * insert/remove loops than (2n + 1).
113  */
114 static uint32_t calc_max_entries(size_t value_size, size_t block_size)
115 {
116 	uint32_t total, n;
117 	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */
118 
119 	block_size -= sizeof(struct node_header);
120 	total = block_size / elt_size;
121 	n = total / 3;		/* rounds down */
122 
123 	return 3 * n;
124 }
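
/*
 * Worked example (assuming a 32-byte node header): with a 4096-byte
 * block and 8-byte values, elt_size = 16, total = (4096 - 32) / 16 = 254,
 * n = 84, so calc_max_entries() returns 252.
 */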
125 
126 int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
127 {
128 	int r;
129 	struct dm_block *b;
130 	struct btree_node *n;
131 	size_t block_size;
132 	uint32_t max_entries;
133 
134 	r = new_block(info, &b);
135 	if (r < 0)
136 		return r;
137 
138 	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
139 	max_entries = calc_max_entries(info->value_type.size, block_size);
140 
141 	n = dm_block_data(b);
142 	memset(n, 0, block_size);
143 	n->header.flags = cpu_to_le32(LEAF_NODE);
144 	n->header.nr_entries = cpu_to_le32(0);
145 	n->header.max_entries = cpu_to_le32(max_entries);
146 	n->header.value_size = cpu_to_le32(info->value_type.size);
147 
148 	*root = dm_block_location(b);
149 	unlock_block(info, b);
150 
151 	return 0;
152 }
153 EXPORT_SYMBOL_GPL(dm_btree_empty);
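
/*
 * Example (illustrative only): creating an empty, single-level btree
 * holding __le64 values.  The transaction manager 'tm' is assumed to
 * have been set up by the caller.
 *
 *	struct dm_btree_info info = {
 *		.tm = tm,
 *		.levels = 1,
 *		.value_type = { .size = sizeof(__le64) },
 *	};
 *	dm_block_t root;
 *	int r;
 *
 *	r = dm_btree_empty(&info, &root);
 */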
154 
155 /*----------------------------------------------------------------*/
156 
157 /*
158  * Deletion is naturally recursive; since we have limited stack space we
159  * explicitly manage our own stack on the heap.
160  */
161 #define MAX_SPINE_DEPTH 64
162 struct frame {
163 	struct dm_block *b;
164 	struct btree_node *n;
165 	unsigned level;
166 	unsigned nr_children;
167 	unsigned current_child;
168 };
169 
170 struct del_stack {
171 	struct dm_btree_info *info;
172 	struct dm_transaction_manager *tm;
173 	int top;
174 	struct frame spine[MAX_SPINE_DEPTH];
175 };
176 
177 static int top_frame(struct del_stack *s, struct frame **f)
178 {
179 	if (s->top < 0) {
180 		DMERR("btree deletion stack empty");
181 		return -EINVAL;
182 	}
183 
184 	*f = s->spine + s->top;
185 
186 	return 0;
187 }
188 
189 static int unprocessed_frames(struct del_stack *s)
190 {
191 	return s->top >= 0;
192 }
193 
194 static void prefetch_children(struct del_stack *s, struct frame *f)
195 {
196 	unsigned i;
197 	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);
198 
199 	for (i = 0; i < f->nr_children; i++)
200 		dm_bm_prefetch(bm, value64(f->n, i));
201 }
202 
203 static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
204 {
205 	return f->level < (info->levels - 1);
206 }
207 
208 static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
209 {
210 	int r;
211 	uint32_t ref_count;
212 
213 	if (s->top >= MAX_SPINE_DEPTH - 1) {
214 		DMERR("btree deletion stack out of memory");
215 		return -ENOMEM;
216 	}
217 
218 	r = dm_tm_ref(s->tm, b, &ref_count);
219 	if (r)
220 		return r;
221 
222 	if (ref_count > 1)
223 		/*
224 		 * This is a shared node, so we can just decrement its
225 		 * reference counter and leave the children.
226 		 */
227 		dm_tm_dec(s->tm, b);
228 
229 	else {
230 		uint32_t flags;
231 		struct frame *f = s->spine + ++s->top;
232 
233 		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
234 		if (r) {
235 			s->top--;
236 			return r;
237 		}
238 
239 		f->n = dm_block_data(f->b);
240 		f->level = level;
241 		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
242 		f->current_child = 0;
243 
244 		flags = le32_to_cpu(f->n->header.flags);
245 		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
246 			prefetch_children(s, f);
247 	}
248 
249 	return 0;
250 }
251 
252 static void pop_frame(struct del_stack *s)
253 {
254 	struct frame *f = s->spine + s->top--;
255 
256 	dm_tm_dec(s->tm, dm_block_location(f->b));
257 	dm_tm_unlock(s->tm, f->b);
258 }
259 
260 static void unlock_all_frames(struct del_stack *s)
261 {
262 	struct frame *f;
263 
264 	while (unprocessed_frames(s)) {
265 		f = s->spine + s->top--;
266 		dm_tm_unlock(s->tm, f->b);
267 	}
268 }
269 
270 int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
271 {
272 	int r;
273 	struct del_stack *s;
274 
275 	/*
276 	 * dm_btree_del() is called via an ioctl, and as such should be
277 	 * considered an FS op.  We can't recurse back into the FS, so we
278 	 * allocate with GFP_NOFS.
279 	 */
280 	s = kmalloc(sizeof(*s), GFP_NOFS);
281 	if (!s)
282 		return -ENOMEM;
283 	s->info = info;
284 	s->tm = info->tm;
285 	s->top = -1;
286 
287 	r = push_frame(s, root, 0);
288 	if (r)
289 		goto out;
290 
291 	while (unprocessed_frames(s)) {
292 		uint32_t flags;
293 		struct frame *f;
294 		dm_block_t b;
295 
296 		r = top_frame(s, &f);
297 		if (r)
298 			goto out;
299 
300 		if (f->current_child >= f->nr_children) {
301 			pop_frame(s);
302 			continue;
303 		}
304 
305 		flags = le32_to_cpu(f->n->header.flags);
306 		if (flags & INTERNAL_NODE) {
307 			b = value64(f->n, f->current_child);
308 			f->current_child++;
309 			r = push_frame(s, b, f->level);
310 			if (r)
311 				goto out;
312 
313 		} else if (is_internal_level(info, f)) {
314 			b = value64(f->n, f->current_child);
315 			f->current_child++;
316 			r = push_frame(s, b, f->level + 1);
317 			if (r)
318 				goto out;
319 
320 		} else {
321 			if (info->value_type.dec)
322 				info->value_type.dec(info->value_type.context,
323 						     value_ptr(f->n, 0), f->nr_children);
324 			pop_frame(s);
325 		}
326 	}
327 out:
328 	if (r) {
329 		/* cleanup all frames of del_stack */
330 		unlock_all_frames(s);
331 	}
332 	kfree(s);
333 
334 	return r;
335 }
336 EXPORT_SYMBOL_GPL(dm_btree_del);
337 
338 /*----------------------------------------------------------------*/
339 
340 static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
341 			    int (*search_fn)(struct btree_node *, uint64_t),
342 			    uint64_t *result_key, void *v, size_t value_size)
343 {
344 	int i, r;
345 	uint32_t flags, nr_entries;
346 
347 	do {
348 		r = ro_step(s, block);
349 		if (r < 0)
350 			return r;
351 
352 		i = search_fn(ro_node(s), key);
353 
354 		flags = le32_to_cpu(ro_node(s)->header.flags);
355 		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
356 		if (i < 0 || i >= nr_entries)
357 			return -ENODATA;
358 
359 		if (flags & INTERNAL_NODE)
360 			block = value64(ro_node(s), i);
361 
362 	} while (!(flags & LEAF_NODE));
363 
364 	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
365 	if (v)
366 		memcpy(v, value_ptr(ro_node(s), i), value_size);
367 
368 	return 0;
369 }
370 
371 int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
372 		    uint64_t *keys, void *value_le)
373 {
374 	unsigned level, last_level = info->levels - 1;
375 	int r = -ENODATA;
376 	uint64_t rkey;
377 	__le64 internal_value_le;
378 	struct ro_spine spine;
379 
380 	init_ro_spine(&spine, info);
381 	for (level = 0; level < info->levels; level++) {
382 		size_t size;
383 		void *value_p;
384 
385 		if (level == last_level) {
386 			value_p = value_le;
387 			size = info->value_type.size;
388 
389 		} else {
390 			value_p = &internal_value_le;
391 			size = sizeof(uint64_t);
392 		}
393 
394 		r = btree_lookup_raw(&spine, root, keys[level],
395 				     lower_bound, &rkey,
396 				     value_p, size);
397 
398 		if (!r) {
399 			if (rkey != keys[level]) {
400 				exit_ro_spine(&spine);
401 				return -ENODATA;
402 			}
403 		} else {
404 			exit_ro_spine(&spine);
405 			return r;
406 		}
407 
408 		root = le64_to_cpu(internal_value_le);
409 	}
410 	exit_ro_spine(&spine);
411 
412 	return r;
413 }
414 EXPORT_SYMBOL_GPL(dm_btree_lookup);
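
/*
 * Example (illustrative only): looking up one key in a single-level
 * tree of __le64 values.  A tree with info->levels == N takes an array
 * of N keys, one per level.
 *
 *	uint64_t key = 123;
 *	__le64 value_le;
 *
 *	r = dm_btree_lookup(&info, root, &key, &value_le);
 *
 * On success r is 0 and value_le holds the value; -ENODATA means the
 * key is not present.
 */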
415 
416 static int dm_btree_lookup_next_single(struct dm_btree_info *info, dm_block_t root,
417 				       uint64_t key, uint64_t *rkey, void *value_le)
418 {
419 	int r, i;
420 	uint32_t flags, nr_entries;
421 	struct dm_block *node;
422 	struct btree_node *n;
423 
424 	r = bn_read_lock(info, root, &node);
425 	if (r)
426 		return r;
427 
428 	n = dm_block_data(node);
429 	flags = le32_to_cpu(n->header.flags);
430 	nr_entries = le32_to_cpu(n->header.nr_entries);
431 
432 	if (flags & INTERNAL_NODE) {
433 		i = lower_bound(n, key);
434 		if (i < 0) {
435 			/*
436 			 * avoid early -ENODATA return when all entries are
437 			 * higher than the search @key.
438 			 */
439 			i = 0;
440 		}
441 		if (i >= nr_entries) {
442 			r = -ENODATA;
443 			goto out;
444 		}
445 
446 		r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
447 		if (r == -ENODATA && i < (nr_entries - 1)) {
448 			i++;
449 			r = dm_btree_lookup_next_single(info, value64(n, i), key, rkey, value_le);
450 		}
451 
452 	} else {
453 		i = upper_bound(n, key);
454 		if (i < 0 || i >= nr_entries) {
455 			r = -ENODATA;
456 			goto out;
457 		}
458 
459 		*rkey = le64_to_cpu(n->keys[i]);
460 		memcpy(value_le, value_ptr(n, i), info->value_type.size);
461 	}
462 out:
463 	dm_tm_unlock(info->tm, node);
464 	return r;
465 }
466 
467 int dm_btree_lookup_next(struct dm_btree_info *info, dm_block_t root,
468 			 uint64_t *keys, uint64_t *rkey, void *value_le)
469 {
470 	unsigned level;
471 	int r = -ENODATA;
472 	__le64 internal_value_le;
473 	struct ro_spine spine;
474 
475 	init_ro_spine(&spine, info);
476 	for (level = 0; level < info->levels - 1u; level++) {
477 		r = btree_lookup_raw(&spine, root, keys[level],
478 				     lower_bound, rkey,
479 				     &internal_value_le, sizeof(uint64_t));
480 		if (r)
481 			goto out;
482 
483 		if (*rkey != keys[level]) {
484 			r = -ENODATA;
485 			goto out;
486 		}
487 
488 		root = le64_to_cpu(internal_value_le);
489 	}
490 
491 	r = dm_btree_lookup_next_single(info, root, keys[level], rkey, value_le);
492 out:
493 	exit_ro_spine(&spine);
494 	return r;
495 }
497 EXPORT_SYMBOL_GPL(dm_btree_lookup_next);
498 
499 /*----------------------------------------------------------------*/
500 
501 /*
502  * Copies entries from one region of a btree node to another.  The regions
503  * must not overlap.
504  */
505 static void copy_entries(struct btree_node *dest, unsigned dest_offset,
506 			 struct btree_node *src, unsigned src_offset,
507 			 unsigned count)
508 {
509 	size_t value_size = le32_to_cpu(dest->header.value_size);
510 	memcpy(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
511 	memcpy(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
512 }
513 
514 /*
515  * Moves entries from one region of a btree node to another.  The regions
516  * may overlap.
517  */
518 static void move_entries(struct btree_node *dest, unsigned dest_offset,
519 			 struct btree_node *src, unsigned src_offset,
520 			 unsigned count)
521 {
522 	size_t value_size = le32_to_cpu(dest->header.value_size);
523 	memmove(dest->keys + dest_offset, src->keys + src_offset, count * sizeof(uint64_t));
524 	memmove(value_ptr(dest, dest_offset), value_ptr(src, src_offset), count * value_size);
525 }
526 
527 /*
528  * Erases the first 'count' entries of a btree node, shifting following
529  * entries down into their place.
530  */
531 static void shift_down(struct btree_node *n, unsigned count)
532 {
533 	move_entries(n, 0, n, count, le32_to_cpu(n->header.nr_entries) - count);
534 }
535 
536 /*
537  * Moves entries in a btree node up 'count' places, making space for
538  * new entries at the start of the node.
539  */
540 static void shift_up(struct btree_node *n, unsigned count)
541 {
542 	move_entries(n, count, n, 0, le32_to_cpu(n->header.nr_entries));
543 }
544 
545 /*
546  * Redistributes entries between two btree nodes to make them
547  * have similar numbers of entries.
548  */
549 static void redistribute2(struct btree_node *left, struct btree_node *right)
550 {
551 	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
552 	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
553 	unsigned total = nr_left + nr_right;
554 	unsigned target_left = total / 2;
555 	unsigned target_right = total - target_left;
556 
557 	if (nr_left < target_left) {
558 		unsigned delta = target_left - nr_left;
559 		copy_entries(left, nr_left, right, 0, delta);
560 		shift_down(right, delta);
561 	} else if (nr_left > target_left) {
562 		unsigned delta = nr_left - target_left;
563 		if (nr_right)
564 			shift_up(right, delta);
565 		copy_entries(right, 0, left, target_left, delta);
566 	}
567 
568 	left->header.nr_entries = cpu_to_le32(target_left);
569 	right->header.nr_entries = cpu_to_le32(target_right);
570 }
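
/*
 * For example, redistribute2() on a left node holding 7 entries and a
 * right node holding 1 gives target_left = 4, target_right = 4: the
 * left node keeps its first 4 entries, the right node's entry is
 * shifted up 3 places and entries 4..6 of the left node are copied in
 * front of it.
 */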
571 
572 /*
573  * Redistribute entries between three nodes.  Assumes the central
574  * node is empty.
575  */
576 static void redistribute3(struct btree_node *left, struct btree_node *center,
577 			  struct btree_node *right)
578 {
579 	unsigned nr_left = le32_to_cpu(left->header.nr_entries);
580 	unsigned nr_center = le32_to_cpu(center->header.nr_entries);
581 	unsigned nr_right = le32_to_cpu(right->header.nr_entries);
582 	unsigned total, target_left, target_center, target_right;
583 
584 	BUG_ON(nr_center);
585 
586 	total = nr_left + nr_right;
587 	target_left = total / 3;
588 	target_center = (total - target_left) / 2;
589 	target_right = (total - target_left - target_center);
590 
591 	if (nr_left < target_left) {
592 		unsigned left_short = target_left - nr_left;
593 		copy_entries(left, nr_left, right, 0, left_short);
594 		copy_entries(center, 0, right, left_short, target_center);
595 		shift_down(right, nr_right - target_right);
596 
597 	} else if (nr_left < (target_left + target_center)) {
598 		unsigned left_to_center = nr_left - target_left;
599 		copy_entries(center, 0, left, target_left, left_to_center);
600 		copy_entries(center, left_to_center, right, 0, target_center - left_to_center);
601 		shift_down(right, nr_right - target_right);
602 
603 	} else {
604 		unsigned right_short = target_right - nr_right;
605 		shift_up(right, right_short);
606 		copy_entries(right, 0, left, nr_left - right_short, right_short);
607 		copy_entries(center, 0, left, target_left, nr_left - target_left);
608 	}
609 
610 	left->header.nr_entries = cpu_to_le32(target_left);
611 	center->header.nr_entries = cpu_to_le32(target_center);
612 	right->header.nr_entries = cpu_to_le32(target_right);
613 }
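
/*
 * For example, redistribute3() on a left node holding 8 entries, an
 * empty center and a right node holding 1 gives targets of 3/3/3: the
 * left node keeps entries 0..2, entries 3..5 move to the center, and
 * entries 6..7 are shifted in front of the right node's existing entry.
 */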
614 
615 /*
616  * Splits a node by creating a sibling node and shifting half the node's
617  * contents across.  Assumes there is a parent node, and it has room for
618  * another child.
619  *
620  * Before:
621  *	  +--------+
622  *	  | Parent |
623  *	  +--------+
624  *	     |
625  *	     v
626  *	+----------+
627  *	| A ++++++ |
628  *	+----------+
629  *
630  *
631  * After:
632  *		+--------+
633  *		| Parent |
634  *		+--------+
635  *		  |	|
636  *		  v	+------+
637  *	    +---------+	       |
638  *	    | A* +++  |	       v
639  *	    +---------+	  +-------+
640  *			  | B +++ |
641  *			  +-------+
642  *
643  * Where A* is a shadow of A.
644  */
645 static int split_one_into_two(struct shadow_spine *s, unsigned parent_index,
646 			      struct dm_btree_value_type *vt, uint64_t key)
647 {
648 	int r;
649 	struct dm_block *left, *right, *parent;
650 	struct btree_node *ln, *rn, *pn;
651 	__le64 location;
652 
653 	left = shadow_current(s);
654 
655 	r = new_block(s->info, &right);
656 	if (r < 0)
657 		return r;
658 
659 	ln = dm_block_data(left);
660 	rn = dm_block_data(right);
661 
662 	rn->header.flags = ln->header.flags;
663 	rn->header.nr_entries = cpu_to_le32(0);
664 	rn->header.max_entries = ln->header.max_entries;
665 	rn->header.value_size = ln->header.value_size;
666 	redistribute2(ln, rn);
667 
668 	/* patch up the parent */
669 	parent = shadow_parent(s);
670 	pn = dm_block_data(parent);
671 
672 	location = cpu_to_le64(dm_block_location(right));
673 	__dm_bless_for_disk(&location);
674 	r = insert_at(sizeof(__le64), pn, parent_index + 1,
675 		      le64_to_cpu(rn->keys[0]), &location);
676 	if (r) {
677 		unlock_block(s->info, right);
678 		return r;
679 	}
680 
681 	/* patch up the spine */
682 	if (key < le64_to_cpu(rn->keys[0])) {
683 		unlock_block(s->info, right);
684 		s->nodes[1] = left;
685 	} else {
686 		unlock_block(s->info, left);
687 		s->nodes[1] = right;
688 	}
689 
690 	return 0;
691 }
692 
693 /*
694  * We often need to modify a sibling node.  This function shadows a particular
695  * child of the given parent node, making sure to update the parent to point
696  * to the new shadow.
697  */
698 static int shadow_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
699 			struct btree_node *parent, unsigned index,
700 			struct dm_block **result)
701 {
702 	int r, inc;
703 	dm_block_t root;
704 	struct btree_node *node;
705 
706 	root = value64(parent, index);
707 
708 	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
709 			       result, &inc);
710 	if (r)
711 		return r;
712 
713 	node = dm_block_data(*result);
714 
715 	if (inc)
716 		inc_children(info->tm, node, vt);
717 
718 	*((__le64 *) value_ptr(parent, index)) =
719 		cpu_to_le64(dm_block_location(*result));
720 
721 	return 0;
722 }
723 
724 /*
725  * Splits two nodes into three.  This is more work, but results in fuller
726  * nodes, so saves metadata space.
727  */
728 static int split_two_into_three(struct shadow_spine *s, unsigned parent_index,
729 				struct dm_btree_value_type *vt, uint64_t key)
730 {
731 	int r;
732 	unsigned middle_index;
733 	struct dm_block *left, *middle, *right, *parent;
734 	struct btree_node *ln, *rn, *mn, *pn;
735 	__le64 location;
736 
737 	parent = shadow_parent(s);
738 	pn = dm_block_data(parent);
739 
740 	if (parent_index == 0) {
741 		middle_index = 1;
742 		left = shadow_current(s);
743 		r = shadow_child(s->info, vt, pn, parent_index + 1, &right);
744 		if (r)
745 			return r;
746 	} else {
747 		middle_index = parent_index;
748 		right = shadow_current(s);
749 		r = shadow_child(s->info, vt, pn, parent_index - 1, &left);
750 		if (r)
751 			return r;
752 	}
753 
754 	r = new_block(s->info, &middle);
755 	if (r < 0)
756 		return r;
757 
758 	ln = dm_block_data(left);
759 	mn = dm_block_data(middle);
760 	rn = dm_block_data(right);
761 
762 	mn->header.nr_entries = cpu_to_le32(0);
763 	mn->header.flags = ln->header.flags;
764 	mn->header.max_entries = ln->header.max_entries;
765 	mn->header.value_size = ln->header.value_size;
766 
767 	redistribute3(ln, mn, rn);
768 
769 	/* patch up the parent */
770 	pn->keys[middle_index] = rn->keys[0];
771 	location = cpu_to_le64(dm_block_location(middle));
772 	__dm_bless_for_disk(&location);
773 	r = insert_at(sizeof(__le64), pn, middle_index,
774 		      le64_to_cpu(mn->keys[0]), &location);
775 	if (r) {
776 		if (shadow_current(s) != left)
777 			unlock_block(s->info, left);
778 
779 		unlock_block(s->info, middle);
780 
781 		if (shadow_current(s) != right)
782 			unlock_block(s->info, right);
783 
784 		return r;
785 	}
786 
788 	/* patch up the spine */
789 	if (key < le64_to_cpu(mn->keys[0])) {
790 		unlock_block(s->info, middle);
791 		unlock_block(s->info, right);
792 		s->nodes[1] = left;
793 	} else if (key < le64_to_cpu(rn->keys[0])) {
794 		unlock_block(s->info, left);
795 		unlock_block(s->info, right);
796 		s->nodes[1] = middle;
797 	} else {
798 		unlock_block(s->info, left);
799 		unlock_block(s->info, middle);
800 		s->nodes[1] = right;
801 	}
802 
803 	return 0;
804 }
805 
806 /*----------------------------------------------------------------*/
807 
808 /*
809  * Splits a node by creating two new children beneath the given node.
810  *
811  * Before:
812  *	  +----------+
813  *	  | A ++++++ |
814  *	  +----------+
815  *
816  *
817  * After:
818  *	+------------+
819  *	| A (shadow) |
820  *	+------------+
821  *	    |	|
822  *   +------+	+----+
823  *   |		     |
824  *   v		     v
825  * +-------+	 +-------+
826  * | B +++ |	 | C +++ |
827  * +-------+	 +-------+
828  */
829 static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
830 {
831 	int r;
832 	size_t size;
833 	unsigned nr_left, nr_right;
834 	struct dm_block *left, *right, *new_parent;
835 	struct btree_node *pn, *ln, *rn;
836 	__le64 val;
837 
838 	new_parent = shadow_current(s);
839 
840 	pn = dm_block_data(new_parent);
841 	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
842 		sizeof(__le64) : s->info->value_type.size;
843 
844 	/* create & init the left block */
845 	r = new_block(s->info, &left);
846 	if (r < 0)
847 		return r;
848 
849 	ln = dm_block_data(left);
850 	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
851 
852 	ln->header.flags = pn->header.flags;
853 	ln->header.nr_entries = cpu_to_le32(nr_left);
854 	ln->header.max_entries = pn->header.max_entries;
855 	ln->header.value_size = pn->header.value_size;
856 	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
857 	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
858 
859 	/* create & init the right block */
860 	r = new_block(s->info, &right);
861 	if (r < 0) {
862 		unlock_block(s->info, left);
863 		return r;
864 	}
865 
866 	rn = dm_block_data(right);
867 	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;
868 
869 	rn->header.flags = pn->header.flags;
870 	rn->header.nr_entries = cpu_to_le32(nr_right);
871 	rn->header.max_entries = pn->header.max_entries;
872 	rn->header.value_size = pn->header.value_size;
873 	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));
874 	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
875 	       nr_right * size);
876 
877 	/* new_parent should just point to l and r now */
878 	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
879 	pn->header.nr_entries = cpu_to_le32(2);
880 	pn->header.max_entries = cpu_to_le32(
881 		calc_max_entries(sizeof(__le64),
882 				 dm_bm_block_size(
883 					 dm_tm_get_bm(s->info->tm))));
884 	pn->header.value_size = cpu_to_le32(sizeof(__le64));
885 
886 	val = cpu_to_le64(dm_block_location(left));
887 	__dm_bless_for_disk(&val);
888 	pn->keys[0] = ln->keys[0];
889 	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));
890 
891 	val = cpu_to_le64(dm_block_location(right));
892 	__dm_bless_for_disk(&val);
893 	pn->keys[1] = rn->keys[0];
894 	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));
895 
896 	unlock_block(s->info, left);
897 	unlock_block(s->info, right);
898 	return 0;
899 }
900 
901 /*----------------------------------------------------------------*/
902 
903 /*
904  * Redistributes a node's entries with its left sibling.
905  */
906 static int rebalance_left(struct shadow_spine *s, struct dm_btree_value_type *vt,
907 			  unsigned parent_index, uint64_t key)
908 {
909 	int r;
910 	struct dm_block *sib;
911 	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));
912 
913 	r = shadow_child(s->info, vt, parent, parent_index - 1, &sib);
914 	if (r)
915 		return r;
916 
917 	left = dm_block_data(sib);
918 	right = dm_block_data(shadow_current(s));
919 	redistribute2(left, right);
920 	*key_ptr(parent, parent_index) = right->keys[0];
921 
922 	if (key < le64_to_cpu(right->keys[0])) {
923 		unlock_block(s->info, s->nodes[1]);
924 		s->nodes[1] = sib;
925 	} else {
926 		unlock_block(s->info, sib);
927 	}
928 
929 	return 0;
930 }
931 
932 /*
933  * Redistributes a node's entries with its right sibling.
934  */
935 static int rebalance_right(struct shadow_spine *s, struct dm_btree_value_type *vt,
936 			   unsigned parent_index, uint64_t key)
937 {
938 	int r;
939 	struct dm_block *sib;
940 	struct btree_node *left, *right, *parent = dm_block_data(shadow_parent(s));
941 
942 	r = shadow_child(s->info, vt, parent, parent_index + 1, &sib);
943 	if (r)
944 		return r;
945 
946 	left = dm_block_data(shadow_current(s));
947 	right = dm_block_data(sib);
948 	redistribute2(left, right);
949 	*key_ptr(parent, parent_index + 1) = right->keys[0];
950 
951 	if (key < le64_to_cpu(right->keys[0])) {
952 		unlock_block(s->info, sib);
953 	} else {
954 		unlock_block(s->info, s->nodes[1]);
955 		s->nodes[1] = sib;
956 	}
957 
958 	return 0;
959 }
960 
961 /*
962  * Returns the number of spare entries in a node.
963  */
964 static int get_node_free_space(struct dm_btree_info *info, dm_block_t b, unsigned *space)
965 {
966 	int r;
967 	unsigned nr_entries;
968 	struct dm_block *block;
969 	struct btree_node *node;
970 
971 	r = bn_read_lock(info, b, &block);
972 	if (r)
973 		return r;
974 
975 	node = dm_block_data(block);
976 	nr_entries = le32_to_cpu(node->header.nr_entries);
977 	*space = le32_to_cpu(node->header.max_entries) - nr_entries;
978 
979 	unlock_block(info, block);
980 	return 0;
981 }
982 
983 /*
984  * Make space in a node, either by moving some entries to a sibling,
985  * or creating a new sibling node.  SPACE_THRESHOLD defines the minimum
986  * number of free entries that must be in the sibling to make the move
987  * worthwhile.  If the siblings are shared (e.g. part of a snapshot),
988  * then they are not touched, since this would break sharing and so
989  * consume more space than we save.
990  */
991 #define SPACE_THRESHOLD 8
992 static int rebalance_or_split(struct shadow_spine *s, struct dm_btree_value_type *vt,
993 			      unsigned parent_index, uint64_t key)
994 {
995 	int r;
996 	struct btree_node *parent = dm_block_data(shadow_parent(s));
997 	unsigned nr_parent = le32_to_cpu(parent->header.nr_entries);
998 	unsigned free_space;
999 	int left_shared = 0, right_shared = 0;
1000 
1001 	/* Should we move entries to the left sibling? */
1002 	if (parent_index > 0) {
1003 		dm_block_t left_b = value64(parent, parent_index - 1);
1004 		r = dm_tm_block_is_shared(s->info->tm, left_b, &left_shared);
1005 		if (r)
1006 			return r;
1007 
1008 		if (!left_shared) {
1009 			r = get_node_free_space(s->info, left_b, &free_space);
1010 			if (r)
1011 				return r;
1012 
1013 			if (free_space >= SPACE_THRESHOLD)
1014 				return rebalance_left(s, vt, parent_index, key);
1015 		}
1016 	}
1017 
1018 	/* Should we move entries to the right sibling? */
1019 	if (parent_index < (nr_parent - 1)) {
1020 		dm_block_t right_b = value64(parent, parent_index + 1);
1021 		r = dm_tm_block_is_shared(s->info->tm, right_b, &right_shared);
1022 		if (r)
1023 			return r;
1024 
1025 		if (!right_shared) {
1026 			r = get_node_free_space(s->info, right_b, &free_space);
1027 			if (r)
1028 				return r;
1029 
1030 			if (free_space >= SPACE_THRESHOLD)
1031 				return rebalance_right(s, vt, parent_index, key);
1032 		}
1033 	}
1034 
1035 	/*
1036 	 * We need to split the node; normally we split two nodes
1037 	 * into three.	But when inserting a sequence that is either
1038 	 * monotonically increasing or decreasing it's better to split
1039 	 * a single node into two.
1040 	 */
1041 	if (left_shared || right_shared || (nr_parent <= 2) ||
1042 	    (parent_index == 0) || (parent_index + 1 == nr_parent)) {
1043 		return split_one_into_two(s, parent_index, vt, key);
1044 	} else {
1045 		return split_two_into_three(s, parent_index, vt, key);
1046 	}
1047 }
1048 
1049 /*
1050  * Does the node contain a particular key?
1051  */
1052 static bool contains_key(struct btree_node *node, uint64_t key)
1053 {
1054 	int i = lower_bound(node, key);
1055 
1056 	if (i >= 0 && le64_to_cpu(node->keys[i]) == key)
1057 		return true;
1058 
1059 	return false;
1060 }
1061 
1062 /*
1063  * In general we preemptively make sure there's a free entry in every
1064  * node on the spine when doing an insert.  But we can avoid that with
1065  * leaf nodes if we know it's an overwrite.
1066  */
1067 static bool has_space_for_insert(struct btree_node *node, uint64_t key)
1068 {
1069 	if (node->header.nr_entries == node->header.max_entries) {
1070 		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
1071 			/* we don't need space if it's an overwrite */
1072 			return contains_key(node, key);
1073 		}
1074 
1075 		return false;
1076 	}
1077 
1078 	return true;
1079 }
1080 
1081 static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
1082 			    struct dm_btree_value_type *vt,
1083 			    uint64_t key, unsigned *index)
1084 {
1085 	int r, i = *index, top = 1;
1086 	struct btree_node *node;
1087 
1088 	for (;;) {
1089 		r = shadow_step(s, root, vt);
1090 		if (r < 0)
1091 			return r;
1092 
1093 		node = dm_block_data(shadow_current(s));
1094 
1095 		/*
1096 		 * We have to patch up the parent node: ugly, but I don't
1097 		 * see a way to do this automatically as part of the spine
1098 		 * op.
1099 		 */
1100 		if (shadow_has_parent(s) && i >= 0) { /* FIXME: second clause unnecessary */
1101 			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
1102 
1103 			__dm_bless_for_disk(&location);
1104 			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
1105 				    &location, sizeof(__le64));
1106 		}
1107 
1108 		node = dm_block_data(shadow_current(s));
1109 
1110 		if (!has_space_for_insert(node, key)) {
1111 			if (top)
1112 				r = btree_split_beneath(s, key);
1113 			else
1114 				r = rebalance_or_split(s, vt, i, key);
1115 
1116 			if (r < 0)
1117 				return r;
1118 
1119 			/* making space can cause the current node to change */
1120 			node = dm_block_data(shadow_current(s));
1121 		}
1122 
1123 		i = lower_bound(node, key);
1124 
1125 		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
1126 			break;
1127 
1128 		if (i < 0) {
1129 			/* change the bounds on the lowest key */
1130 			node->keys[0] = cpu_to_le64(key);
1131 			i = 0;
1132 		}
1133 
1134 		root = value64(node, i);
1135 		top = 0;
1136 	}
1137 
1138 	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
1139 		i++;
1140 
1141 	*index = i;
1142 	return 0;
1143 }
1144 
1145 static int __btree_get_overwrite_leaf(struct shadow_spine *s, dm_block_t root,
1146 				      uint64_t key, int *index)
1147 {
1148 	int r, i = -1;
1149 	struct btree_node *node;
1150 
1151 	*index = 0;
1152 	for (;;) {
1153 		r = shadow_step(s, root, &s->info->value_type);
1154 		if (r < 0)
1155 			return r;
1156 
1157 		node = dm_block_data(shadow_current(s));
1158 
1159 		/*
1160 		 * We have to patch up the parent node: ugly, but I don't
1161 		 * see a way to do this automatically as part of the spine
1162 		 * op.
1163 		 */
1164 		if (shadow_has_parent(s) && i >= 0) {
1165 			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
1166 
1167 			__dm_bless_for_disk(&location);
1168 			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
1169 				    &location, sizeof(__le64));
1170 		}
1171 
1172 		node = dm_block_data(shadow_current(s));
1173 		i = lower_bound(node, key);
1174 
1175 		BUG_ON(i < 0);
1176 		BUG_ON(i >= le32_to_cpu(node->header.nr_entries));
1177 
1178 		if (le32_to_cpu(node->header.flags) & LEAF_NODE) {
1179 			if (key != le64_to_cpu(node->keys[i]))
1180 				return -EINVAL;
1181 			break;
1182 		}
1183 
1184 		root = value64(node, i);
1185 	}
1186 
1187 	*index = i;
1188 	return 0;
1189 }
1190 
1191 int btree_get_overwrite_leaf(struct dm_btree_info *info, dm_block_t root,
1192 			     uint64_t key, int *index,
1193 			     dm_block_t *new_root, struct dm_block **leaf)
1194 {
1195 	int r;
1196 	struct shadow_spine spine;
1197 
1198 	BUG_ON(info->levels > 1);
1199 	init_shadow_spine(&spine, info);
1200 	r = __btree_get_overwrite_leaf(&spine, root, key, index);
1201 	if (!r) {
1202 		*new_root = shadow_root(&spine);
1203 		*leaf = shadow_current(&spine);
1204 
1205 		/*
1206 		 * Decrement the count so exit_shadow_spine() doesn't
1207 		 * unlock the leaf.
1208 		 */
1209 		spine.count--;
1210 	}
1211 	exit_shadow_spine(&spine);
1212 
1213 	return r;
1214 }
1215 
1216 static bool need_insert(struct btree_node *node, uint64_t *keys,
1217 			unsigned level, unsigned index)
1218 {
1219 	return ((index >= le32_to_cpu(node->header.nr_entries)) ||
1220 		(le64_to_cpu(node->keys[index]) != keys[level]));
1221 }
1222 
1223 static int insert(struct dm_btree_info *info, dm_block_t root,
1224 		  uint64_t *keys, void *value, dm_block_t *new_root,
1225 		  int *inserted)
1226 		  __dm_written_to_disk(value)
1227 {
1228 	int r;
1229 	unsigned level, index = -1, last_level = info->levels - 1;
1230 	dm_block_t block = root;
1231 	struct shadow_spine spine;
1232 	struct btree_node *n;
1233 	struct dm_btree_value_type le64_type;
1234 
1235 	init_le64_type(info->tm, &le64_type);
1236 	init_shadow_spine(&spine, info);
1237 
1238 	for (level = 0; level < (info->levels - 1); level++) {
1239 		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
1240 		if (r < 0)
1241 			goto bad;
1242 
1243 		n = dm_block_data(shadow_current(&spine));
1244 
1245 		if (need_insert(n, keys, level, index)) {
1246 			dm_block_t new_tree;
1247 			__le64 new_le;
1248 
1249 			r = dm_btree_empty(info, &new_tree);
1250 			if (r < 0)
1251 				goto bad;
1252 
1253 			new_le = cpu_to_le64(new_tree);
1254 			__dm_bless_for_disk(&new_le);
1255 
1256 			r = insert_at(sizeof(uint64_t), n, index,
1257 				      keys[level], &new_le);
1258 			if (r)
1259 				goto bad;
1260 		}
1261 
1262 		if (level < last_level)
1263 			block = value64(n, index);
1264 	}
1265 
1266 	r = btree_insert_raw(&spine, block, &info->value_type,
1267 			     keys[level], &index);
1268 	if (r < 0)
1269 		goto bad;
1270 
1271 	n = dm_block_data(shadow_current(&spine));
1272 
1273 	if (need_insert(n, keys, level, index)) {
1274 		if (inserted)
1275 			*inserted = 1;
1276 
1277 		r = insert_at(info->value_type.size, n, index,
1278 			      keys[level], value);
1279 		if (r)
1280 			goto bad_unblessed;
1281 	} else {
1282 		if (inserted)
1283 			*inserted = 0;
1284 
1285 		if (info->value_type.dec &&
1286 		    (!info->value_type.equal ||
1287 		     !info->value_type.equal(
1288 			     info->value_type.context,
1289 			     value_ptr(n, index),
1290 			     value))) {
1291 			info->value_type.dec(info->value_type.context,
1292 					     value_ptr(n, index), 1);
1293 		}
1294 		memcpy_disk(value_ptr(n, index),
1295 			    value, info->value_type.size);
1296 	}
1297 
1298 	*new_root = shadow_root(&spine);
1299 	exit_shadow_spine(&spine);
1300 
1301 	return 0;
1302 
1303 bad:
1304 	__dm_unbless_for_disk(value);
1305 bad_unblessed:
1306 	exit_shadow_spine(&spine);
1307 	return r;
1308 }
1309 
1310 int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
1311 		    uint64_t *keys, void *value, dm_block_t *new_root)
1312 		    __dm_written_to_disk(value)
1313 {
1314 	return insert(info, root, keys, value, new_root, NULL);
1315 }
1316 EXPORT_SYMBOL_GPL(dm_btree_insert);
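
/*
 * Example (illustrative only): inserting one mapping into a single-level
 * tree of __le64 values.  The value is blessed before the call because
 * dm_btree_insert() is annotated __dm_written_to_disk(value).
 *
 *	uint64_t key = 123;
 *	__le64 value_le = cpu_to_le64(456);
 *	dm_block_t new_root;
 *
 *	__dm_bless_for_disk(&value_le);
 *	r = dm_btree_insert(&info, root, &key, &value_le, &new_root);
 */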
1317 
1318 int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
1319 			   uint64_t *keys, void *value, dm_block_t *new_root,
1320 			   int *inserted)
1321 			   __dm_written_to_disk(value)
1322 {
1323 	return insert(info, root, keys, value, new_root, inserted);
1324 }
1325 EXPORT_SYMBOL_GPL(dm_btree_insert_notify);
1326 
1327 /*----------------------------------------------------------------*/
1328 
1329 static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
1330 		    uint64_t *result_key, dm_block_t *next_block)
1331 {
1332 	int i, r;
1333 	uint32_t flags;
1334 
1335 	do {
1336 		r = ro_step(s, block);
1337 		if (r < 0)
1338 			return r;
1339 
1340 		flags = le32_to_cpu(ro_node(s)->header.flags);
1341 		i = le32_to_cpu(ro_node(s)->header.nr_entries);
1342 		if (!i)
1343 			return -ENODATA;
1344 		else
1345 			i--;
1346 
1347 		if (find_highest)
1348 			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
1349 		else
1350 			*result_key = le64_to_cpu(ro_node(s)->keys[0]);
1351 
1352 		if (next_block || flags & INTERNAL_NODE) {
1353 			if (find_highest)
1354 				block = value64(ro_node(s), i);
1355 			else
1356 				block = value64(ro_node(s), 0);
1357 		}
1358 
1359 	} while (flags & INTERNAL_NODE);
1360 
1361 	if (next_block)
1362 		*next_block = block;
1363 	return 0;
1364 }
1365 
1366 static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
1367 			     bool find_highest, uint64_t *result_keys)
1368 {
1369 	int r = 0, count = 0, level;
1370 	struct ro_spine spine;
1371 
1372 	init_ro_spine(&spine, info);
1373 	for (level = 0; level < info->levels; level++) {
1374 		r = find_key(&spine, root, find_highest, result_keys + level,
1375 			     level == info->levels - 1 ? NULL : &root);
1376 		if (r == -ENODATA) {
1377 			r = 0;
1378 			break;
1379 
1380 		} else if (r)
1381 			break;
1382 
1383 		count++;
1384 	}
1385 	exit_ro_spine(&spine);
1386 
1387 	return r ? r : count;
1388 }
1389 
1390 int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
1391 			      uint64_t *result_keys)
1392 {
1393 	return dm_btree_find_key(info, root, true, result_keys);
1394 }
1395 EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);
1396 
1397 int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
1398 			     uint64_t *result_keys)
1399 {
1400 	return dm_btree_find_key(info, root, false, result_keys);
1401 }
1402 EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);
1403 
1404 /*----------------------------------------------------------------*/
1405 
1406 /*
1407  * FIXME: We shouldn't use a recursive algorithm when we have limited stack
1408  * space.  Also, this only works for single-level trees.
1409  */
1410 static int walk_node(struct dm_btree_info *info, dm_block_t block,
1411 		     int (*fn)(void *context, uint64_t *keys, void *leaf),
1412 		     void *context)
1413 {
1414 	int r;
1415 	unsigned i, nr;
1416 	struct dm_block *node;
1417 	struct btree_node *n;
1418 	uint64_t keys;
1419 
1420 	r = bn_read_lock(info, block, &node);
1421 	if (r)
1422 		return r;
1423 
1424 	n = dm_block_data(node);
1425 
1426 	nr = le32_to_cpu(n->header.nr_entries);
1427 	for (i = 0; i < nr; i++) {
1428 		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
1429 			r = walk_node(info, value64(n, i), fn, context);
1430 			if (r)
1431 				goto out;
1432 		} else {
1433 			keys = le64_to_cpu(*key_ptr(n, i));
1434 			r = fn(context, &keys, value_ptr(n, i));
1435 			if (r)
1436 				goto out;
1437 		}
1438 	}
1439 
1440 out:
1441 	dm_tm_unlock(info->tm, node);
1442 	return r;
1443 }
1444 
1445 int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
1446 		  int (*fn)(void *context, uint64_t *keys, void *leaf),
1447 		  void *context)
1448 {
1449 	BUG_ON(info->levels > 1);
1450 	return walk_node(info, root, fn, context);
1451 }
1452 EXPORT_SYMBOL_GPL(dm_btree_walk);
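
/*
 * Example (illustrative only): counting the entries of a single-level
 * tree.  The callback name and its context are placeholders.
 *
 *	static int count_entry(void *context, uint64_t *keys, void *leaf)
 *	{
 *		unsigned *count = context;
 *		(*count)++;
 *		return 0;
 *	}
 *
 *	unsigned count = 0;
 *	r = dm_btree_walk(info, root, count_entry, &count);
 */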
1453 
1454 /*----------------------------------------------------------------*/
1455 
1456 static void prefetch_values(struct dm_btree_cursor *c)
1457 {
1458 	unsigned i, nr;
1459 	__le64 value_le;
1460 	struct cursor_node *n = c->nodes + c->depth - 1;
1461 	struct btree_node *bn = dm_block_data(n->b);
1462 	struct dm_block_manager *bm = dm_tm_get_bm(c->info->tm);
1463 
1464 	BUG_ON(c->info->value_type.size != sizeof(value_le));
1465 
1466 	nr = le32_to_cpu(bn->header.nr_entries);
1467 	for (i = 0; i < nr; i++) {
1468 		memcpy(&value_le, value_ptr(bn, i), sizeof(value_le));
1469 		dm_bm_prefetch(bm, le64_to_cpu(value_le));
1470 	}
1471 }
1472 
1473 static bool leaf_node(struct dm_btree_cursor *c)
1474 {
1475 	struct cursor_node *n = c->nodes + c->depth - 1;
1476 	struct btree_node *bn = dm_block_data(n->b);
1477 
1478 	return le32_to_cpu(bn->header.flags) & LEAF_NODE;
1479 }
1480 
1481 static int push_node(struct dm_btree_cursor *c, dm_block_t b)
1482 {
1483 	int r;
1484 	struct cursor_node *n = c->nodes + c->depth;
1485 
1486 	if (c->depth >= DM_BTREE_CURSOR_MAX_DEPTH - 1) {
1487 		DMERR("couldn't push cursor node, stack depth too high");
1488 		return -EINVAL;
1489 	}
1490 
1491 	r = bn_read_lock(c->info, b, &n->b);
1492 	if (r)
1493 		return r;
1494 
1495 	n->index = 0;
1496 	c->depth++;
1497 
1498 	if (c->prefetch_leaves || !leaf_node(c))
1499 		prefetch_values(c);
1500 
1501 	return 0;
1502 }
1503 
1504 static void pop_node(struct dm_btree_cursor *c)
1505 {
1506 	c->depth--;
1507 	unlock_block(c->info, c->nodes[c->depth].b);
1508 }
1509 
1510 static int inc_or_backtrack(struct dm_btree_cursor *c)
1511 {
1512 	struct cursor_node *n;
1513 	struct btree_node *bn;
1514 
1515 	for (;;) {
1516 		if (!c->depth)
1517 			return -ENODATA;
1518 
1519 		n = c->nodes + c->depth - 1;
1520 		bn = dm_block_data(n->b);
1521 
1522 		n->index++;
1523 		if (n->index < le32_to_cpu(bn->header.nr_entries))
1524 			break;
1525 
1526 		pop_node(c);
1527 	}
1528 
1529 	return 0;
1530 }
1531 
1532 static int find_leaf(struct dm_btree_cursor *c)
1533 {
1534 	int r = 0;
1535 	struct cursor_node *n;
1536 	struct btree_node *bn;
1537 	__le64 value_le;
1538 
1539 	for (;;) {
1540 		n = c->nodes + c->depth - 1;
1541 		bn = dm_block_data(n->b);
1542 
1543 		if (le32_to_cpu(bn->header.flags) & LEAF_NODE)
1544 			break;
1545 
1546 		memcpy(&value_le, value_ptr(bn, n->index), sizeof(value_le));
1547 		r = push_node(c, le64_to_cpu(value_le));
1548 		if (r) {
1549 			DMERR("push_node failed");
1550 			break;
1551 		}
1552 	}
1553 
1554 	if (!r && (le32_to_cpu(bn->header.nr_entries) == 0))
1555 		return -ENODATA;
1556 
1557 	return r;
1558 }
1559 
1560 int dm_btree_cursor_begin(struct dm_btree_info *info, dm_block_t root,
1561 			  bool prefetch_leaves, struct dm_btree_cursor *c)
1562 {
1563 	int r;
1564 
1565 	c->info = info;
1566 	c->root = root;
1567 	c->depth = 0;
1568 	c->prefetch_leaves = prefetch_leaves;
1569 
1570 	r = push_node(c, root);
1571 	if (r)
1572 		return r;
1573 
1574 	return find_leaf(c);
1575 }
1576 EXPORT_SYMBOL_GPL(dm_btree_cursor_begin);
1577 
1578 void dm_btree_cursor_end(struct dm_btree_cursor *c)
1579 {
1580 	while (c->depth)
1581 		pop_node(c);
1582 }
1583 EXPORT_SYMBOL_GPL(dm_btree_cursor_end);
1584 
1585 int dm_btree_cursor_next(struct dm_btree_cursor *c)
1586 {
1587 	int r = inc_or_backtrack(c);
1588 	if (!r) {
1589 		r = find_leaf(c);
1590 		if (r)
1591 			DMERR("find_leaf failed");
1592 	}
1593 
1594 	return r;
1595 }
1596 EXPORT_SYMBOL_GPL(dm_btree_cursor_next);
1597 
1598 int dm_btree_cursor_skip(struct dm_btree_cursor *c, uint32_t count)
1599 {
1600 	int r = 0;
1601 
1602 	while (count-- && !r)
1603 		r = dm_btree_cursor_next(c);
1604 
1605 	return r;
1606 }
1607 EXPORT_SYMBOL_GPL(dm_btree_cursor_skip);
1608 
1609 int dm_btree_cursor_get_value(struct dm_btree_cursor *c, uint64_t *key, void *value_le)
1610 {
1611 	if (c->depth) {
1612 		struct cursor_node *n = c->nodes + c->depth - 1;
1613 		struct btree_node *bn = dm_block_data(n->b);
1614 
1615 		if (le32_to_cpu(bn->header.flags) & INTERNAL_NODE)
1616 			return -EINVAL;
1617 
1618 		*key = le64_to_cpu(*key_ptr(bn, n->index));
1619 		memcpy(value_le, value_ptr(bn, n->index), c->info->value_type.size);
1620 		return 0;
1621 
1622 	} else
1623 		return -ENODATA;
1624 }
1625 EXPORT_SYMBOL_GPL(dm_btree_cursor_get_value);
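
/*
 * Example (illustrative only): iterating over every leaf entry of a
 * tree with __le64 values; process() is a placeholder.
 *
 *	struct dm_btree_cursor c;
 *	uint64_t key;
 *	__le64 value_le;
 *
 *	for (r = dm_btree_cursor_begin(info, root, false, &c);
 *	     !r;
 *	     r = dm_btree_cursor_next(&c)) {
 *		r = dm_btree_cursor_get_value(&c, &key, &value_le);
 *		if (r)
 *			break;
 *		process(key, le64_to_cpu(value_le));
 *	}
 *	dm_btree_cursor_end(&c);
 *
 * dm_btree_cursor_next() returns -ENODATA once the last entry has been
 * visited.
 */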
1626