/*
 * Copyright (C) 2011 Red Hat, Inc.
 *
 * This file is released under the GPL.
 */

#include "dm-btree-internal.h"
#include "dm-space-map.h"
#include "dm-transaction-manager.h"

#include <linux/export.h>
#include <linux/device-mapper.h>

#define DM_MSG_PREFIX "btree"

/*----------------------------------------------------------------
 * Array manipulation
 *--------------------------------------------------------------*/
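/*
 * Note: __dm_written_to_disk(), __dm_bless_for_disk() and
 * __dm_unbless_for_disk() are sparse annotations (see
 * dm-block-manager.h) marking values that have been converted to
 * little-endian, on-disk format; they compile away when sparse isn't
 * running.
 */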
static void memcpy_disk(void *dest, const void *src, size_t len)
	__dm_written_to_disk(src)
{
	memcpy(dest, src, len);
	__dm_unbless_for_disk(src);
}

static void array_insert(void *base, size_t elt_size, unsigned nr_elts,
			 unsigned index, void *elt)
	__dm_written_to_disk(elt)
{
	if (index < nr_elts)
		memmove(base + (elt_size * (index + 1)),
			base + (elt_size * index),
			(nr_elts - index) * elt_size);

	memcpy_disk(base + (elt_size * index), elt, elt_size);
}

/*----------------------------------------------------------------*/
/* Assumes that no two keys in a node are the same. */
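/*
 * Loop invariant: keys[lo] < key < keys[hi], with lo == -1 and
 * hi == nr_entries standing in for -inf and +inf.  On a miss, want_hi
 * selects the first key above, otherwise the last key below.  For
 * example, with keys = { 1, 3, 5 } and key = 4, lower_bound() below
 * returns 1 (the index of 3).
 */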
static int bsearch(struct btree_node *n, uint64_t key, int want_hi)
{
	int lo = -1, hi = le32_to_cpu(n->header.nr_entries);

	while (hi - lo > 1) {
		int mid = lo + ((hi - lo) / 2);
		uint64_t mid_key = le64_to_cpu(n->keys[mid]);

		if (mid_key == key)
			return mid;

		if (mid_key < key)
			lo = mid;
		else
			hi = mid;
	}

	return want_hi ? hi : lo;
}

int lower_bound(struct btree_node *n, uint64_t key)
{
	return bsearch(n, key, 0);
}

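/*
 * Increments the reference counts of everything a node points at:
 * child blocks for internal nodes, values (via vt->inc) for leaves.
 * Used when a shared node is shadowed.
 */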
void inc_children(struct dm_transaction_manager *tm, struct btree_node *n,
		  struct dm_btree_value_type *vt)
{
	unsigned i;
	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);

	if (le32_to_cpu(n->header.flags) & INTERNAL_NODE)
		for (i = 0; i < nr_entries; i++)
			dm_tm_inc(tm, value64(n, i));
	else if (vt->inc)
		for (i = 0; i < nr_entries; i++)
			vt->inc(vt->context, value_ptr(n, i));
}

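/*
 * Inserts a key/value pair at the given index, shifting any later
 * entries up.  The value must already be blessed for disk; on failure
 * it is unblessed here, so callers need not clean up.
 */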
static int insert_at(size_t value_size, struct btree_node *node, unsigned index,
		      uint64_t key, void *value)
		      __dm_written_to_disk(value)
{
	uint32_t nr_entries = le32_to_cpu(node->header.nr_entries);
	__le64 key_le = cpu_to_le64(key);

	if (index > nr_entries ||
	    index >= le32_to_cpu(node->header.max_entries)) {
		DMERR("too many entries in btree node for insert");
		__dm_unbless_for_disk(value);
		return -ENOMEM;
	}

	__dm_bless_for_disk(&key_le);

	array_insert(node->keys, sizeof(*node->keys), nr_entries, index, &key_le);
	array_insert(value_base(node), value_size, nr_entries, index, value);
	node->header.nr_entries = cpu_to_le32(nr_entries + 1);

	return 0;
}

/*----------------------------------------------------------------*/

/*
 * We want 3n entries (for some n).  This works more nicely for repeated
 * insert/remove loops than (2n + 1).
 */
static uint32_t calc_max_entries(size_t value_size, size_t block_size)
{
	uint32_t total, n;
	size_t elt_size = sizeof(uint64_t) + value_size; /* key + value */

	block_size -= sizeof(struct node_header);
	total = block_size / elt_size;
	n = total / 3;		/* rounds down */

	return 3 * n;
}
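
/*
 * Worked example (assuming a 32-byte struct node_header): with a
 * 4096-byte block and 8-byte values, elt_size is 16, so 254 entries
 * fit; rounding down to a multiple of 3 gives max_entries = 252.
 */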

int dm_btree_empty(struct dm_btree_info *info, dm_block_t *root)
{
	int r;
	struct dm_block *b;
	struct btree_node *n;
	size_t block_size;
	uint32_t max_entries;

	r = new_block(info, &b);
	if (r < 0)
		return r;

	block_size = dm_bm_block_size(dm_tm_get_bm(info->tm));
	max_entries = calc_max_entries(info->value_type.size, block_size);

	n = dm_block_data(b);
	memset(n, 0, block_size);
	n->header.flags = cpu_to_le32(LEAF_NODE);
	n->header.nr_entries = cpu_to_le32(0);
	n->header.max_entries = cpu_to_le32(max_entries);
	n->header.value_size = cpu_to_le32(info->value_type.size);

	*root = dm_block_location(b);
	unlock_block(info, b);

	return 0;
}
EXPORT_SYMBOL_GPL(dm_btree_empty);

/*----------------------------------------------------------------*/

/*
 * Deletion uses a recursive algorithm; since we have limited stack
 * space, we explicitly manage our own stack on the heap.
 */
#define MAX_SPINE_DEPTH 64
struct frame {
	struct dm_block *b;
	struct btree_node *n;
	unsigned level;
	unsigned nr_children;
	unsigned current_child;
};

struct del_stack {
	struct dm_btree_info *info;
	struct dm_transaction_manager *tm;
	int top;
	struct frame spine[MAX_SPINE_DEPTH];
};

static int top_frame(struct del_stack *s, struct frame **f)
{
	if (s->top < 0) {
		DMERR("btree deletion stack empty");
		return -EINVAL;
	}

	*f = s->spine + s->top;

	return 0;
}

static int unprocessed_frames(struct del_stack *s)
{
	return s->top >= 0;
}

static void prefetch_children(struct del_stack *s, struct frame *f)
{
	unsigned i;
	struct dm_block_manager *bm = dm_tm_get_bm(s->tm);

	for (i = 0; i < f->nr_children; i++)
		dm_bm_prefetch(bm, value64(f->n, i));
}

static bool is_internal_level(struct dm_btree_info *info, struct frame *f)
{
	return f->level < (info->levels - 1);
}

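/*
 * Pushes a frame for block b, read locking it.  If the block turns out
 * to be shared (reference count > 1), we only drop our reference and
 * leave the subtree beneath it intact.
 */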
static int push_frame(struct del_stack *s, dm_block_t b, unsigned level)
{
	int r;
	uint32_t ref_count;

	if (s->top >= MAX_SPINE_DEPTH - 1) {
		DMERR("btree deletion stack out of memory");
		return -ENOMEM;
	}

	r = dm_tm_ref(s->tm, b, &ref_count);
	if (r)
		return r;

	if (ref_count > 1)
		/*
		 * This is a shared node, so we can just decrement its
		 * reference counter and leave the children alone.
		 */
		dm_tm_dec(s->tm, b);

	else {
		uint32_t flags;
		struct frame *f = s->spine + ++s->top;

		r = dm_tm_read_lock(s->tm, b, &btree_node_validator, &f->b);
		if (r) {
			s->top--;
			return r;
		}

		f->n = dm_block_data(f->b);
		f->level = level;
		f->nr_children = le32_to_cpu(f->n->header.nr_entries);
		f->current_child = 0;

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE || is_internal_level(s->info, f))
			prefetch_children(s, f);
	}

	return 0;
}

static void pop_frame(struct del_stack *s)
{
	struct frame *f = s->spine + s->top--;

	dm_tm_dec(s->tm, dm_block_location(f->b));
	dm_tm_unlock(s->tm, f->b);
}

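/*
 * Tears down the whole tree rooted at 'root', decrementing reference
 * counts as it goes; subtrees that are still referenced elsewhere are
 * left alone (see push_frame() above).
 */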
int dm_btree_del(struct dm_btree_info *info, dm_block_t root)
{
	int r;
	struct del_stack *s;

	s = kmalloc(sizeof(*s), GFP_NOIO);
	if (!s)
		return -ENOMEM;
	s->info = info;
	s->tm = info->tm;
	s->top = -1;

	r = push_frame(s, root, 0);
	if (r)
		goto out;

	while (unprocessed_frames(s)) {
		uint32_t flags;
		struct frame *f;
		dm_block_t b;

		r = top_frame(s, &f);
		if (r)
			goto out;

		if (f->current_child >= f->nr_children) {
			pop_frame(s);
			continue;
		}

		flags = le32_to_cpu(f->n->header.flags);
		if (flags & INTERNAL_NODE) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level);
			if (r)
				goto out;

		} else if (is_internal_level(info, f)) {
			b = value64(f->n, f->current_child);
			f->current_child++;
			r = push_frame(s, b, f->level + 1);
			if (r)
				goto out;

		} else {
			if (info->value_type.dec) {
				unsigned i;

				for (i = 0; i < f->nr_children; i++)
					info->value_type.dec(info->value_type.context,
							     value_ptr(f->n, i));
			}
			pop_frame(s);
		}
	}

out:
	kfree(s);
	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_del);

/*----------------------------------------------------------------*/

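/*
 * Walks from 'block' down to a leaf, using search_fn to choose the
 * child at each internal node, then copies out the matched key and its
 * value.
 */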
static int btree_lookup_raw(struct ro_spine *s, dm_block_t block, uint64_t key,
			    int (*search_fn)(struct btree_node *, uint64_t),
			    uint64_t *result_key, void *v, size_t value_size)
{
	int i, r;
	uint32_t flags, nr_entries;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		i = search_fn(ro_node(s), key);

		flags = le32_to_cpu(ro_node(s)->header.flags);
		nr_entries = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (i < 0 || i >= nr_entries)
			return -ENODATA;

		if (flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (!(flags & LEAF_NODE));

	*result_key = le64_to_cpu(ro_node(s)->keys[i]);
	memcpy(v, value_ptr(ro_node(s), i), value_size);

	return 0;
}

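/*
 * For an n-level tree, keys[] holds n keys; each of the first n - 1
 * levels stores the root of the next level as a __le64 value.
 *
 * Illustrative caller sketch (names hypothetical; single level with
 * __le64 values):
 *
 *	uint64_t key = 123;
 *	__le64 value_le;
 *
 *	r = dm_btree_lookup(&info, root, &key, &value_le);
 *	if (!r)
 *		do_something(le64_to_cpu(value_le));
 */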
int dm_btree_lookup(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value_le)
{
	unsigned level, last_level = info->levels - 1;
	int r = -ENODATA;
	uint64_t rkey;
	__le64 internal_value_le;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		size_t size;
		void *value_p;

		if (level == last_level) {
			value_p = value_le;
			size = info->value_type.size;

		} else {
			value_p = &internal_value_le;
			size = sizeof(uint64_t);
		}

		r = btree_lookup_raw(&spine, root, keys[level],
				     lower_bound, &rkey,
				     value_p, size);

		if (!r) {
			if (rkey != keys[level]) {
				exit_ro_spine(&spine);
				return -ENODATA;
			}
		} else {
			exit_ro_spine(&spine);
			return r;
		}

		root = le64_to_cpu(internal_value_le);
	}
	exit_ro_spine(&spine);

	return r;
}
EXPORT_SYMBOL_GPL(dm_btree_lookup);

/*
 * Splits a node by creating a sibling node and shifting half the node's
 * contents across.  Assumes there is a parent node, and it has room for
 * another child.
 *
 * Before:
 *	  +--------+
 *	  | Parent |
 *	  +--------+
 *	     |
 *	     v
 *	+----------+
 *	| A ++++++ |
 *	+----------+
 *
 *
 * After:
 *		+--------+
 *		| Parent |
 *		+--------+
 *		  |	|
 *		  v	+------+
 *	    +---------+	       |
 *	    | A* +++  |	       v
 *	    +---------+	  +-------+
 *			  | B +++ |
 *			  +-------+
 *
 * Where A* is a shadow of A.
 */
static int btree_split_sibling(struct shadow_spine *s, unsigned parent_index,
			       uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *parent;
	struct btree_node *ln, *rn, *pn;
	__le64 location;

	left = shadow_current(s);

	r = new_block(s->info, &right);
	if (r < 0)
		return r;

	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(ln->header.nr_entries) / 2;
	nr_right = le32_to_cpu(ln->header.nr_entries) - nr_left;

	ln->header.nr_entries = cpu_to_le32(nr_left);

	rn->header.flags = ln->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = ln->header.max_entries;
	rn->header.value_size = ln->header.value_size;
	memcpy(rn->keys, ln->keys + nr_left, nr_right * sizeof(rn->keys[0]));

	size = le32_to_cpu(ln->header.flags) & INTERNAL_NODE ?
		sizeof(uint64_t) : s->info->value_type.size;
	memcpy(value_ptr(rn, 0), value_ptr(ln, nr_left),
	       size * nr_right);

	/*
	 * Patch up the parent
	 */
	parent = shadow_parent(s);

	pn = dm_block_data(parent);
	location = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&location);
	memcpy_disk(value_ptr(pn, parent_index),
		    &location, sizeof(__le64));

	location = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&location);

	r = insert_at(sizeof(__le64), pn, parent_index + 1,
		      le64_to_cpu(rn->keys[0]), &location);
	if (r)
		return r;

	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}

	return 0;
}

/*
 * Splits a node by creating two new children beneath the given node.
 *
 * Before:
 *	  +----------+
 *	  | A ++++++ |
 *	  +----------+
 *
 *
 * After:
 *	+------------+
 *	| A (shadow) |
 *	+------------+
 *	    |	|
 *   +------+	+----+
 *   |		     |
 *   v		     v
 * +-------+	 +-------+
 * | B +++ |	 | C +++ |
 * +-------+	 +-------+
 */
static int btree_split_beneath(struct shadow_spine *s, uint64_t key)
{
	int r;
	size_t size;
	unsigned nr_left, nr_right;
	struct dm_block *left, *right, *new_parent;
	struct btree_node *pn, *ln, *rn;
	__le64 val;

	new_parent = shadow_current(s);

	r = new_block(s->info, &left);
	if (r < 0)
		return r;

	r = new_block(s->info, &right);
	if (r < 0) {
		unlock_block(s->info, left);
		return r;
	}

	pn = dm_block_data(new_parent);
	ln = dm_block_data(left);
	rn = dm_block_data(right);

	nr_left = le32_to_cpu(pn->header.nr_entries) / 2;
	nr_right = le32_to_cpu(pn->header.nr_entries) - nr_left;

	ln->header.flags = pn->header.flags;
	ln->header.nr_entries = cpu_to_le32(nr_left);
	ln->header.max_entries = pn->header.max_entries;
	ln->header.value_size = pn->header.value_size;

	rn->header.flags = pn->header.flags;
	rn->header.nr_entries = cpu_to_le32(nr_right);
	rn->header.max_entries = pn->header.max_entries;
	rn->header.value_size = pn->header.value_size;

	memcpy(ln->keys, pn->keys, nr_left * sizeof(pn->keys[0]));
	memcpy(rn->keys, pn->keys + nr_left, nr_right * sizeof(pn->keys[0]));

	size = le32_to_cpu(pn->header.flags) & INTERNAL_NODE ?
		sizeof(__le64) : s->info->value_type.size;
	memcpy(value_ptr(ln, 0), value_ptr(pn, 0), nr_left * size);
	memcpy(value_ptr(rn, 0), value_ptr(pn, nr_left),
	       nr_right * size);

	/* new_parent should just point to the new left and right nodes now */
	pn->header.flags = cpu_to_le32(INTERNAL_NODE);
	pn->header.nr_entries = cpu_to_le32(2);
	pn->header.max_entries = cpu_to_le32(
		calc_max_entries(sizeof(__le64),
				 dm_bm_block_size(
					 dm_tm_get_bm(s->info->tm))));
	pn->header.value_size = cpu_to_le32(sizeof(__le64));

	val = cpu_to_le64(dm_block_location(left));
	__dm_bless_for_disk(&val);
	pn->keys[0] = ln->keys[0];
	memcpy_disk(value_ptr(pn, 0), &val, sizeof(__le64));

	val = cpu_to_le64(dm_block_location(right));
	__dm_bless_for_disk(&val);
	pn->keys[1] = rn->keys[0];
	memcpy_disk(value_ptr(pn, 1), &val, sizeof(__le64));

	/*
	 * Rejig the spine.  This is ugly, since it knows too
	 * much about the spine.
	 */
	if (s->nodes[0] != new_parent) {
		unlock_block(s->info, s->nodes[0]);
		s->nodes[0] = new_parent;
	}
	if (key < le64_to_cpu(rn->keys[0])) {
		unlock_block(s->info, right);
		s->nodes[1] = left;
	} else {
		unlock_block(s->info, left);
		s->nodes[1] = right;
	}
	s->count = 2;

	return 0;
}

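/*
 * Descends from 'root' to a leaf, shadowing each node on the way down
 * and splitting any full node pre-emptively, so the final insert always
 * has room.  On return *index is the slot the key should occupy in the
 * leaf; the caller must check whether the key is already present there.
 */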
static int btree_insert_raw(struct shadow_spine *s, dm_block_t root,
			    struct dm_btree_value_type *vt,
			    uint64_t key, unsigned *index)
{
	int r, i = *index, top = 1;
	struct btree_node *node;

	for (;;) {
		r = shadow_step(s, root, vt);
		if (r < 0)
			return r;

		node = dm_block_data(shadow_current(s));

		/*
		 * We have to patch up the parent node; it's ugly, but I
		 * don't see a way to do this automatically as part of the
		 * spine op.
		 */
		if (shadow_has_parent(s) && i >= 0) { /* FIXME: the second clause is unnecessary */
			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));

			__dm_bless_for_disk(&location);
			memcpy_disk(value_ptr(dm_block_data(shadow_parent(s)), i),
				    &location, sizeof(__le64));
		}

		node = dm_block_data(shadow_current(s));

		if (node->header.nr_entries == node->header.max_entries) {
			if (top)
				r = btree_split_beneath(s, key);
			else
				r = btree_split_sibling(s, i, key);

			if (r < 0)
				return r;
		}

		node = dm_block_data(shadow_current(s));

		i = lower_bound(node, key);

		if (le32_to_cpu(node->header.flags) & LEAF_NODE)
			break;

		if (i < 0) {
			/* change the bounds on the lowest key */
			node->keys[0] = cpu_to_le64(key);
			i = 0;
		}

		root = value64(node, i);
		top = 0;
	}

	if (i < 0 || le64_to_cpu(node->keys[i]) != key)
		i++;

	*index = i;
	return 0;
}

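/*
 * For multi-level trees the first (levels - 1) btrees hold __le64 roots
 * of the next level down, so missing interior entries get a fresh empty
 * btree created for them; only the last level stores the caller's
 * values.
 */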
static int insert(struct dm_btree_info *info, dm_block_t root,
		  uint64_t *keys, void *value, dm_block_t *new_root,
		  int *inserted)
		  __dm_written_to_disk(value)
{
	int r, need_insert;
	unsigned level, index = -1, last_level = info->levels - 1;
	dm_block_t block = root;
	struct shadow_spine spine;
	struct btree_node *n;
	struct dm_btree_value_type le64_type;

	init_le64_type(info->tm, &le64_type);
	init_shadow_spine(&spine, info);

	for (level = 0; level < (info->levels - 1); level++) {
		r = btree_insert_raw(&spine, block, &le64_type, keys[level], &index);
		if (r < 0)
			goto bad;

		n = dm_block_data(shadow_current(&spine));
		need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
			       (le64_to_cpu(n->keys[index]) != keys[level]));

		if (need_insert) {
			dm_block_t new_tree;
			__le64 new_le;

			r = dm_btree_empty(info, &new_tree);
			if (r < 0)
				goto bad;

			new_le = cpu_to_le64(new_tree);
			__dm_bless_for_disk(&new_le);

			r = insert_at(sizeof(uint64_t), n, index,
				      keys[level], &new_le);
			if (r)
				goto bad;
		}

		if (level < last_level)
			block = value64(n, index);
	}

	r = btree_insert_raw(&spine, block, &info->value_type,
			     keys[level], &index);
	if (r < 0)
		goto bad;

	n = dm_block_data(shadow_current(&spine));
	need_insert = ((index >= le32_to_cpu(n->header.nr_entries)) ||
		       (le64_to_cpu(n->keys[index]) != keys[level]));

	if (need_insert) {
		if (inserted)
			*inserted = 1;

		r = insert_at(info->value_type.size, n, index,
			      keys[level], value);
		if (r)
			goto bad_unblessed;
	} else {
		if (inserted)
			*inserted = 0;

		if (info->value_type.dec &&
		    (!info->value_type.equal ||
		     !info->value_type.equal(
			     info->value_type.context,
			     value_ptr(n, index),
			     value))) {
			info->value_type.dec(info->value_type.context,
					     value_ptr(n, index));
		}
		memcpy_disk(value_ptr(n, index),
			    value, info->value_type.size);
	}

	*new_root = shadow_root(&spine);
	exit_shadow_spine(&spine);

	return 0;

bad:
	__dm_unbless_for_disk(value);
bad_unblessed:
	exit_shadow_spine(&spine);
	return r;
}

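/*
 * Illustrative caller sketch (names hypothetical): the value passed in
 * must already be little-endian and blessed for disk.
 *
 *	uint64_t key = 123;
 *	__le64 value = cpu_to_le64(456);
 *	dm_block_t new_root;
 *
 *	__dm_bless_for_disk(&value);
 *	r = dm_btree_insert(&info, root, &key, &value, &new_root);
 */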
int dm_btree_insert(struct dm_btree_info *info, dm_block_t root,
		    uint64_t *keys, void *value, dm_block_t *new_root)
		    __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, NULL);
}
EXPORT_SYMBOL_GPL(dm_btree_insert);

int dm_btree_insert_notify(struct dm_btree_info *info, dm_block_t root,
			   uint64_t *keys, void *value, dm_block_t *new_root,
			   int *inserted)
			   __dm_written_to_disk(value)
{
	return insert(info, root, keys, value, new_root, inserted);
}
EXPORT_SYMBOL_GPL(dm_btree_insert_notify);

/*----------------------------------------------------------------*/

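/*
 * Helper for dm_btree_find_{lowest,highest}_key() below: records the
 * first or last key seen at each node on the way down, and optionally
 * returns the last child visited so the caller can step to the next
 * btree level.
 */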
static int find_key(struct ro_spine *s, dm_block_t block, bool find_highest,
		    uint64_t *result_key, dm_block_t *next_block)
{
	int i, r;
	uint32_t flags;

	do {
		r = ro_step(s, block);
		if (r < 0)
			return r;

		flags = le32_to_cpu(ro_node(s)->header.flags);
		i = le32_to_cpu(ro_node(s)->header.nr_entries);
		if (!i)
			return -ENODATA;
		else
			i--;

		if (find_highest)
			*result_key = le64_to_cpu(ro_node(s)->keys[i]);
		else
			*result_key = le64_to_cpu(ro_node(s)->keys[0]);

		if (next_block || flags & INTERNAL_NODE)
			block = value64(ro_node(s), i);

	} while (flags & INTERNAL_NODE);

	if (next_block)
		*next_block = block;
	return 0;
}

static int dm_btree_find_key(struct dm_btree_info *info, dm_block_t root,
			     bool find_highest, uint64_t *result_keys)
{
	int r = 0, count = 0, level;
	struct ro_spine spine;

	init_ro_spine(&spine, info);
	for (level = 0; level < info->levels; level++) {
		r = find_key(&spine, root, find_highest, result_keys + level,
			     level == info->levels - 1 ? NULL : &root);
		if (r == -ENODATA) {
			r = 0;
			break;

		} else if (r)
			break;

		count++;
	}
	exit_ro_spine(&spine);

	return r ? r : count;
}

int dm_btree_find_highest_key(struct dm_btree_info *info, dm_block_t root,
			      uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, true, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_highest_key);

int dm_btree_find_lowest_key(struct dm_btree_info *info, dm_block_t root,
			     uint64_t *result_keys)
{
	return dm_btree_find_key(info, root, false, result_keys);
}
EXPORT_SYMBOL_GPL(dm_btree_find_lowest_key);

/*----------------------------------------------------------------*/

/*
 * FIXME: We shouldn't use a recursive algorithm when we have limited
 * stack space.  Also, this only works for single-level trees.
 */
static int walk_node(struct dm_btree_info *info, dm_block_t block,
		     int (*fn)(void *context, uint64_t *keys, void *leaf),
		     void *context)
{
	int r;
	unsigned i, nr;
	struct dm_block *node;
	struct btree_node *n;
	uint64_t keys;

	r = bn_read_lock(info, block, &node);
	if (r)
		return r;

	n = dm_block_data(node);

	nr = le32_to_cpu(n->header.nr_entries);
	for (i = 0; i < nr; i++) {
		if (le32_to_cpu(n->header.flags) & INTERNAL_NODE) {
			r = walk_node(info, value64(n, i), fn, context);
			if (r)
				goto out;
		} else {
			keys = le64_to_cpu(*key_ptr(n, i));
			r = fn(context, &keys, value_ptr(n, i));
			if (r)
				goto out;
		}
	}

out:
	dm_tm_unlock(info->tm, node);
	return r;
}

int dm_btree_walk(struct dm_btree_info *info, dm_block_t root,
		  int (*fn)(void *context, uint64_t *keys, void *leaf),
		  void *context)
{
	BUG_ON(info->levels > 1);
	return walk_node(info, root, fn, context);
}
EXPORT_SYMBOL_GPL(dm_btree_walk);