xref: /openbmc/linux/drivers/md/persistent-data/dm-btree-remove.c (revision 1b8d2789dad0005fd5e7d35dab26a8e1203fb6da)
13241b1d3SJoe Thornber /*
23241b1d3SJoe Thornber  * Copyright (C) 2011 Red Hat, Inc.
33241b1d3SJoe Thornber  *
43241b1d3SJoe Thornber  * This file is released under the GPL.
53241b1d3SJoe Thornber  */
63241b1d3SJoe Thornber 
73241b1d3SJoe Thornber #include "dm-btree.h"
83241b1d3SJoe Thornber #include "dm-btree-internal.h"
93241b1d3SJoe Thornber #include "dm-transaction-manager.h"
103241b1d3SJoe Thornber 
111944ce60SPaul Gortmaker #include <linux/export.h>
123241b1d3SJoe Thornber 
133241b1d3SJoe Thornber /*
143241b1d3SJoe Thornber  * Removing an entry from a btree
153241b1d3SJoe Thornber  * ==============================
163241b1d3SJoe Thornber  *
173241b1d3SJoe Thornber  * A very important constraint for our btree is that no node, except the
183241b1d3SJoe Thornber  * root, may have fewer than a certain number of entries.
193241b1d3SJoe Thornber  * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES).
203241b1d3SJoe Thornber  *
213241b1d3SJoe Thornber  * Ensuring this is complicated by the fact that we only ever hold locks
223241b1d3SJoe Thornber  * on two nodes concurrently, and only change nodes in a top-to-bottom
233241b1d3SJoe Thornber  * fashion.
243241b1d3SJoe Thornber  *
253241b1d3SJoe Thornber  * Each node may have a left or right sibling.  When descending the spine,
263241b1d3SJoe Thornber  * if a node contains only MIN_ENTRIES then we try to increase this to at
273241b1d3SJoe Thornber  * least MIN_ENTRIES + 1.  We do this in the following ways:
283241b1d3SJoe Thornber  *
293241b1d3SJoe Thornber  * [A] No siblings => this can only happen if the node is the root, in which
303241b1d3SJoe Thornber  *     case we copy the child's contents over the root.
313241b1d3SJoe Thornber  *
323241b1d3SJoe Thornber  * [B] No left sibling
333241b1d3SJoe Thornber  *     ==> rebalance(node, right sibling)
343241b1d3SJoe Thornber  *
353241b1d3SJoe Thornber  * [C] No right sibling
363241b1d3SJoe Thornber  *     ==> rebalance(left sibling, node)
373241b1d3SJoe Thornber  *
383241b1d3SJoe Thornber  * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD
393241b1d3SJoe Thornber  *     ==> delete the node, adding its contents to left and right
403241b1d3SJoe Thornber  *
413241b1d3SJoe Thornber  * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD
423241b1d3SJoe Thornber  *     ==> rebalance(left, node, right)
433241b1d3SJoe Thornber  *
443241b1d3SJoe Thornber  * After these operations it's possible that our original node no
453241b1d3SJoe Thornber  * longer contains the desired subtree.  For this reason this rebalancing
463241b1d3SJoe Thornber  * is performed on the children of the current node.  This also avoids
473241b1d3SJoe Thornber  * having a special case for the root.
483241b1d3SJoe Thornber  *
493241b1d3SJoe Thornber  * Once this rebalancing has occurred we can then step into the child node
503241b1d3SJoe Thornber  * (for internal nodes), or delete the entry (for leaf nodes).
513241b1d3SJoe Thornber  */
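
/*
 * To make the cases above concrete, here is a rough sketch of how they
 * map onto the helpers defined later in this file.  This is an
 * illustrative outline only (see rebalance_children() for the real
 * control flow); 'nr_children' and 'i' are hypothetical names for the
 * parent's entry count and the index of the child we're about to step
 * into.
 *
 *	if (nr_children == 1)			// [A] root with a single child
 *		copy the child's contents over the root;
 *	else if (i == 0)			// [B] no left sibling
 *		rebalance2(..., i);		// rebalance(node, right sibling)
 *	else if (i == nr_children - 1)		// [C] no right sibling
 *		rebalance2(..., i - 1);		// rebalance(left sibling, node)
 *	else					// [D]/[E] both siblings
 *		rebalance3(..., i - 1);		// merge or redistribute
 */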
523241b1d3SJoe Thornber 
533241b1d3SJoe Thornber /*
543241b1d3SJoe Thornber  * Some little utilities for moving node data around.
553241b1d3SJoe Thornber  */
56550929faSMikulas Patocka static void node_shift(struct btree_node *n, int shift)
573241b1d3SJoe Thornber {
583241b1d3SJoe Thornber 	uint32_t nr_entries = le32_to_cpu(n->header.nr_entries);
593241b1d3SJoe Thornber 	uint32_t value_size = le32_to_cpu(n->header.value_size);
603241b1d3SJoe Thornber 
613241b1d3SJoe Thornber 	if (shift < 0) {
623241b1d3SJoe Thornber 		shift = -shift;
633241b1d3SJoe Thornber 		BUG_ON(shift > nr_entries);
64a3aefb39SJoe Thornber 		BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift));
653241b1d3SJoe Thornber 		memmove(key_ptr(n, 0),
663241b1d3SJoe Thornber 			key_ptr(n, shift),
673241b1d3SJoe Thornber 			(nr_entries - shift) * sizeof(__le64));
68a3aefb39SJoe Thornber 		memmove(value_ptr(n, 0),
69a3aefb39SJoe Thornber 			value_ptr(n, shift),
703241b1d3SJoe Thornber 			(nr_entries - shift) * value_size);
713241b1d3SJoe Thornber 	} else {
723241b1d3SJoe Thornber 		BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries));
733241b1d3SJoe Thornber 		memmove(key_ptr(n, shift),
743241b1d3SJoe Thornber 			key_ptr(n, 0),
753241b1d3SJoe Thornber 			nr_entries * sizeof(__le64));
76a3aefb39SJoe Thornber 		memmove(value_ptr(n, shift),
77a3aefb39SJoe Thornber 			value_ptr(n, 0),
783241b1d3SJoe Thornber 			nr_entries * value_size);
793241b1d3SJoe Thornber 	}
803241b1d3SJoe Thornber }
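
/*
 * A worked example of node_shift()'s sign convention (the key values are
 * invented): with keys [10, 20, 30, 40], node_shift(n, 2) moves everything
 * up two slots, leaving room for two new entries at the front, whereas
 * node_shift(n, -2) discards the first two entries, leaving [30, 40].
 * nr_entries is deliberately not touched here; callers such as shift()
 * and delete_center_node() fix up the headers themselves.
 */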
813241b1d3SJoe Thornber 
82550929faSMikulas Patocka static void node_copy(struct btree_node *left, struct btree_node *right, int shift)
833241b1d3SJoe Thornber {
843241b1d3SJoe Thornber 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
853241b1d3SJoe Thornber 	uint32_t value_size = le32_to_cpu(left->header.value_size);
863241b1d3SJoe Thornber 	BUG_ON(value_size != le32_to_cpu(right->header.value_size));
873241b1d3SJoe Thornber 
883241b1d3SJoe Thornber 	if (shift < 0) {
893241b1d3SJoe Thornber 		shift = -shift;
903241b1d3SJoe Thornber 		BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries));
913241b1d3SJoe Thornber 		memcpy(key_ptr(left, nr_left),
923241b1d3SJoe Thornber 		       key_ptr(right, 0),
933241b1d3SJoe Thornber 		       shift * sizeof(__le64));
94a3aefb39SJoe Thornber 		memcpy(value_ptr(left, nr_left),
95a3aefb39SJoe Thornber 		       value_ptr(right, 0),
963241b1d3SJoe Thornber 		       shift * value_size);
973241b1d3SJoe Thornber 	} else {
983241b1d3SJoe Thornber 		BUG_ON(shift > le32_to_cpu(right->header.max_entries));
993241b1d3SJoe Thornber 		memcpy(key_ptr(right, 0),
1003241b1d3SJoe Thornber 		       key_ptr(left, nr_left - shift),
1013241b1d3SJoe Thornber 		       shift * sizeof(__le64));
102a3aefb39SJoe Thornber 		memcpy(value_ptr(right, 0),
103a3aefb39SJoe Thornber 		       value_ptr(left, nr_left - shift),
1043241b1d3SJoe Thornber 		       shift * value_size);
1053241b1d3SJoe Thornber 	}
1063241b1d3SJoe Thornber }
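
/*
 * And for node_copy() (again with invented values): if left holds [10, 20]
 * and right holds [30, 40, 50], then node_copy(left, right, -2) appends the
 * first two entries of right onto the end of left, giving [10, 20, 30, 40];
 * node_copy(left, right, 2) copies the last two entries of left into the
 * first two slots of right, which the caller is expected to have opened up
 * beforehand with node_shift().  As above, nr_entries is left for the
 * caller to adjust.
 */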
1073241b1d3SJoe Thornber 
1083241b1d3SJoe Thornber /*
1093241b1d3SJoe Thornber  * Delete a specific entry from a node (used on both internal and leaf nodes).
1103241b1d3SJoe Thornber  */
111550929faSMikulas Patocka static void delete_at(struct btree_node *n, unsigned index)
1123241b1d3SJoe Thornber {
1133241b1d3SJoe Thornber 	unsigned nr_entries = le32_to_cpu(n->header.nr_entries);
1143241b1d3SJoe Thornber 	unsigned nr_to_copy = nr_entries - (index + 1);
1153241b1d3SJoe Thornber 	uint32_t value_size = le32_to_cpu(n->header.value_size);
1163241b1d3SJoe Thornber 	BUG_ON(index >= nr_entries);
1173241b1d3SJoe Thornber 
1183241b1d3SJoe Thornber 	if (nr_to_copy) {
1193241b1d3SJoe Thornber 		memmove(key_ptr(n, index),
1203241b1d3SJoe Thornber 			key_ptr(n, index + 1),
1213241b1d3SJoe Thornber 			nr_to_copy * sizeof(__le64));
1223241b1d3SJoe Thornber 
123a3aefb39SJoe Thornber 		memmove(value_ptr(n, index),
124a3aefb39SJoe Thornber 			value_ptr(n, index + 1),
1253241b1d3SJoe Thornber 			nr_to_copy * value_size);
1263241b1d3SJoe Thornber 	}
1273241b1d3SJoe Thornber 
1283241b1d3SJoe Thornber 	n->header.nr_entries = cpu_to_le32(nr_entries - 1);
1293241b1d3SJoe Thornber }
1303241b1d3SJoe Thornber 
131550929faSMikulas Patocka static unsigned merge_threshold(struct btree_node *n)
1323241b1d3SJoe Thornber {
133b0988900SJoe Thornber 	return le32_to_cpu(n->header.max_entries) / 3;
1343241b1d3SJoe Thornber }
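
/*
 * The merge threshold is a third of a node's capacity.  For example, if a
 * node type held 126 entries (an invented figure; the real value depends on
 * the block and value sizes), the threshold would be 42.  The rebalancing
 * below only merges siblings whose combined contents are small enough to
 * fit comfortably into fewer nodes.
 */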
1353241b1d3SJoe Thornber 
1363241b1d3SJoe Thornber struct child {
1373241b1d3SJoe Thornber 	unsigned index;
1383241b1d3SJoe Thornber 	struct dm_block *block;
139550929faSMikulas Patocka 	struct btree_node *n;
1403241b1d3SJoe Thornber };
1413241b1d3SJoe Thornber 
142f046f89aSJoe Thornber static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt,
143f046f89aSJoe Thornber 		      struct btree_node *parent,
1443241b1d3SJoe Thornber 		      unsigned index, struct child *result)
1453241b1d3SJoe Thornber {
1463241b1d3SJoe Thornber 	int r, inc;
1473241b1d3SJoe Thornber 	dm_block_t root;
1483241b1d3SJoe Thornber 
1493241b1d3SJoe Thornber 	result->index = index;
1503241b1d3SJoe Thornber 	root = value64(parent, index);
1513241b1d3SJoe Thornber 
1523241b1d3SJoe Thornber 	r = dm_tm_shadow_block(info->tm, root, &btree_node_validator,
1533241b1d3SJoe Thornber 			       &result->block, &inc);
1543241b1d3SJoe Thornber 	if (r)
1553241b1d3SJoe Thornber 		return r;
1563241b1d3SJoe Thornber 
1573241b1d3SJoe Thornber 	result->n = dm_block_data(result->block);
1583241b1d3SJoe Thornber 
1593241b1d3SJoe Thornber 	if (inc)
160f046f89aSJoe Thornber 		inc_children(info->tm, result->n, vt);
1613241b1d3SJoe Thornber 
162a3aefb39SJoe Thornber 	*((__le64 *) value_ptr(parent, index)) =
1633241b1d3SJoe Thornber 		cpu_to_le64(dm_block_location(result->block));
1643241b1d3SJoe Thornber 
1653241b1d3SJoe Thornber 	return 0;
1663241b1d3SJoe Thornber }
1673241b1d3SJoe Thornber 
1684c7da06fSMikulas Patocka static void exit_child(struct dm_btree_info *info, struct child *c)
1693241b1d3SJoe Thornber {
1704c7da06fSMikulas Patocka 	dm_tm_unlock(info->tm, c->block);
1713241b1d3SJoe Thornber }
1723241b1d3SJoe Thornber 
173550929faSMikulas Patocka static void shift(struct btree_node *left, struct btree_node *right, int count)
1743241b1d3SJoe Thornber {
175b0988900SJoe Thornber 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
176b0988900SJoe Thornber 	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
177b0988900SJoe Thornber 	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
178b0988900SJoe Thornber 	uint32_t r_max_entries = le32_to_cpu(right->header.max_entries);
179b0988900SJoe Thornber 
180b0988900SJoe Thornber 	BUG_ON(max_entries != r_max_entries);
181b0988900SJoe Thornber 	BUG_ON(nr_left - count > max_entries);
182b0988900SJoe Thornber 	BUG_ON(nr_right + count > max_entries);
183b0988900SJoe Thornber 
1843241b1d3SJoe Thornber 	if (!count)
1853241b1d3SJoe Thornber 		return;
1863241b1d3SJoe Thornber 
1873241b1d3SJoe Thornber 	if (count > 0) {
1883241b1d3SJoe Thornber 		node_shift(right, count);
1893241b1d3SJoe Thornber 		node_copy(left, right, count);
1903241b1d3SJoe Thornber 	} else {
1913241b1d3SJoe Thornber 		node_copy(left, right, count);
1923241b1d3SJoe Thornber 		node_shift(right, count);
1933241b1d3SJoe Thornber 	}
1943241b1d3SJoe Thornber 
195b0988900SJoe Thornber 	left->header.nr_entries = cpu_to_le32(nr_left - count);
196b0988900SJoe Thornber 	right->header.nr_entries = cpu_to_le32(nr_right + count);
1973241b1d3SJoe Thornber }
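
/*
 * Putting the two helpers together (invented values once more): if left
 * holds [10, 20, 30] and right holds [40, 50], shift(left, right, 1) moves
 * 30 onto the front of right, giving [10, 20] and [30, 40, 50], and fixes
 * up both nr_entries fields; shift(left, right, -1) would instead move 40
 * onto the end of left.
 */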
1983241b1d3SJoe Thornber 
199550929faSMikulas Patocka static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent,
2003241b1d3SJoe Thornber 			 struct child *l, struct child *r)
2013241b1d3SJoe Thornber {
202550929faSMikulas Patocka 	struct btree_node *left = l->n;
203550929faSMikulas Patocka 	struct btree_node *right = r->n;
2043241b1d3SJoe Thornber 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
2053241b1d3SJoe Thornber 	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
206474e5595SHou Tao 	/*
207474e5595SHou Tao 	 * Ensure each child ends up with at least (max_entries / 3 + 1)
208474e5595SHou Tao 	 * entries, so that whichever child the removal then descends
209474e5595SHou Tao 	 * into will still have no fewer than (max_entries / 3) entries
210474e5595SHou Tao 	 * afterwards.
211474e5595SHou Tao 	 */
212474e5595SHou Tao 	unsigned int threshold = 2 * (merge_threshold(left) + 1);
2133241b1d3SJoe Thornber 
214b0988900SJoe Thornber 	if (nr_left + nr_right < threshold) {
2153241b1d3SJoe Thornber 		/*
2163241b1d3SJoe Thornber 		 * Merge
2173241b1d3SJoe Thornber 		 */
2183241b1d3SJoe Thornber 		node_copy(left, right, -nr_right);
2193241b1d3SJoe Thornber 		left->header.nr_entries = cpu_to_le32(nr_left + nr_right);
2203241b1d3SJoe Thornber 		delete_at(parent, r->index);
2213241b1d3SJoe Thornber 
2223241b1d3SJoe Thornber 		/*
2233241b1d3SJoe Thornber 		 * We need to decrement the right block, but not its
2243241b1d3SJoe Thornber 		 * children, since they're still referenced by left.
2253241b1d3SJoe Thornber 		 */
2263241b1d3SJoe Thornber 		dm_tm_dec(info->tm, dm_block_location(r->block));
2273241b1d3SJoe Thornber 	} else {
2283241b1d3SJoe Thornber 		/*
2293241b1d3SJoe Thornber 		 * Rebalance.
2303241b1d3SJoe Thornber 		 */
2313241b1d3SJoe Thornber 		unsigned target_left = (nr_left + nr_right) / 2;
2323241b1d3SJoe Thornber 		shift(left, right, nr_left - target_left);
2333241b1d3SJoe Thornber 		*key_ptr(parent, r->index) = right->keys[0];
2343241b1d3SJoe Thornber 	}
2353241b1d3SJoe Thornber }
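
/*
 * A made-up example of the merge/rebalance decision above, assuming
 * max_entries = 64 (so the threshold is 2 * (64 / 3 + 1) = 44): children
 * holding 20 and 19 entries (39 < 44) are merged into a single node of 39
 * entries; children holding 30 and 20 entries (50 >= 44) are instead
 * rebalanced to 25 and 25.
 */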
2363241b1d3SJoe Thornber 
2373241b1d3SJoe Thornber static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info,
238f046f89aSJoe Thornber 		      struct dm_btree_value_type *vt, unsigned left_index)
2393241b1d3SJoe Thornber {
2403241b1d3SJoe Thornber 	int r;
241550929faSMikulas Patocka 	struct btree_node *parent;
2423241b1d3SJoe Thornber 	struct child left, right;
2433241b1d3SJoe Thornber 
2443241b1d3SJoe Thornber 	parent = dm_block_data(shadow_current(s));
2453241b1d3SJoe Thornber 
246f046f89aSJoe Thornber 	r = init_child(info, vt, parent, left_index, &left);
2473241b1d3SJoe Thornber 	if (r)
2483241b1d3SJoe Thornber 		return r;
2493241b1d3SJoe Thornber 
250f046f89aSJoe Thornber 	r = init_child(info, vt, parent, left_index + 1, &right);
2513241b1d3SJoe Thornber 	if (r) {
2523241b1d3SJoe Thornber 		exit_child(info, &left);
2533241b1d3SJoe Thornber 		return r;
2543241b1d3SJoe Thornber 	}
2553241b1d3SJoe Thornber 
2563241b1d3SJoe Thornber 	__rebalance2(info, parent, &left, &right);
2573241b1d3SJoe Thornber 
2584c7da06fSMikulas Patocka 	exit_child(info, &left);
2593241b1d3SJoe Thornber 	exit_child(info, &right);
2603241b1d3SJoe Thornber 
2614c7da06fSMikulas Patocka 	return 0;
2623241b1d3SJoe Thornber }
2633241b1d3SJoe Thornber 
2643241b1d3SJoe Thornber /*
265b0988900SJoe Thornber  * We dump as many entries from center as possible into left, then the rest
266b0988900SJoe Thornber  * into right, then rebalance2.  This wastes some cpu, but I want something
267b0988900SJoe Thornber  * simple for now.
2683241b1d3SJoe Thornber  */
269550929faSMikulas Patocka static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent,
270b0988900SJoe Thornber 			       struct child *l, struct child *c, struct child *r,
271550929faSMikulas Patocka 			       struct btree_node *left, struct btree_node *center, struct btree_node *right,
272b0988900SJoe Thornber 			       uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
273b0988900SJoe Thornber {
274b0988900SJoe Thornber 	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
2753241b1d3SJoe Thornber 	unsigned shift = min(max_entries - nr_left, nr_center);
2763241b1d3SJoe Thornber 
2773241b1d3SJoe Thornber 	BUG_ON(nr_left + shift > max_entries);
2783241b1d3SJoe Thornber 	node_copy(left, center, -shift);
2793241b1d3SJoe Thornber 	left->header.nr_entries = cpu_to_le32(nr_left + shift);
2803241b1d3SJoe Thornber 
2813241b1d3SJoe Thornber 	if (shift != nr_center) {
2823241b1d3SJoe Thornber 		shift = nr_center - shift;
283b0988900SJoe Thornber 		BUG_ON((nr_right + shift) > max_entries);
2843241b1d3SJoe Thornber 		node_shift(right, shift);
2853241b1d3SJoe Thornber 		node_copy(center, right, shift);
2863241b1d3SJoe Thornber 		right->header.nr_entries = cpu_to_le32(nr_right + shift);
2873241b1d3SJoe Thornber 	}
2883241b1d3SJoe Thornber 	*key_ptr(parent, r->index) = right->keys[0];
2893241b1d3SJoe Thornber 
2903241b1d3SJoe Thornber 	delete_at(parent, c->index);
2913241b1d3SJoe Thornber 	r->index--;
2923241b1d3SJoe Thornber 
2933241b1d3SJoe Thornber 	dm_tm_dec(info->tm, dm_block_location(c->block));
2943241b1d3SJoe Thornber 	__rebalance2(info, parent, l, r);
2953241b1d3SJoe Thornber }
2963241b1d3SJoe Thornber 
2973241b1d3SJoe Thornber /*
298b0988900SJoe Thornber  * Redistributes entries among 3 sibling nodes.
2993241b1d3SJoe Thornber  */
300550929faSMikulas Patocka static void redistribute3(struct dm_btree_info *info, struct btree_node *parent,
301b0988900SJoe Thornber 			  struct child *l, struct child *c, struct child *r,
302550929faSMikulas Patocka 			  struct btree_node *left, struct btree_node *center, struct btree_node *right,
303b0988900SJoe Thornber 			  uint32_t nr_left, uint32_t nr_center, uint32_t nr_right)
304b0988900SJoe Thornber {
305b0988900SJoe Thornber 	int s;
306b0988900SJoe Thornber 	uint32_t max_entries = le32_to_cpu(left->header.max_entries);
3072871c69eSJoe Thornber 	unsigned total = nr_left + nr_center + nr_right;
3082871c69eSJoe Thornber 	unsigned target_right = total / 3;
3092871c69eSJoe Thornber 	unsigned remainder = (target_right * 3) != total;
3102871c69eSJoe Thornber 	unsigned target_left = target_right + remainder;
3112871c69eSJoe Thornber 
3122871c69eSJoe Thornber 	BUG_ON(target_left > max_entries);
3132871c69eSJoe Thornber 	BUG_ON(target_right > max_entries);
3143241b1d3SJoe Thornber 
315b0988900SJoe Thornber 	if (nr_left < nr_right) {
3162871c69eSJoe Thornber 		s = nr_left - target_left;
3173241b1d3SJoe Thornber 
318b0988900SJoe Thornber 		if (s < 0 && nr_center < -s) {
319b0988900SJoe Thornber 			/* not enough in central node */
3204c7e3093SDennis Yang 			shift(left, center, -nr_center);
3214c7e3093SDennis Yang 			s += nr_center;
322b0988900SJoe Thornber 			shift(left, right, s);
323b0988900SJoe Thornber 			nr_right += s;
324b0988900SJoe Thornber 		} else
325b0988900SJoe Thornber 			shift(left, center, s);
326b0988900SJoe Thornber 
3272871c69eSJoe Thornber 		shift(center, right, target_right - nr_right);
328b0988900SJoe Thornber 
329b0988900SJoe Thornber 	} else {
3302871c69eSJoe Thornber 		s = target_right - nr_right;
331b0988900SJoe Thornber 		if (s > 0 && nr_center < s) {
332b0988900SJoe Thornber 			/* not enough in central node */
333b0988900SJoe Thornber 			shift(center, right, nr_center);
3344c7e3093SDennis Yang 			s -= nr_center;
335b0988900SJoe Thornber 			shift(left, right, s);
336b0988900SJoe Thornber 			nr_left -= s;
337b0988900SJoe Thornber 		} else
338b0988900SJoe Thornber 			shift(center, right, s);
339b0988900SJoe Thornber 
3402871c69eSJoe Thornber 		shift(left, center, nr_left - target_left);
341b0988900SJoe Thornber 	}
342b0988900SJoe Thornber 
3433241b1d3SJoe Thornber 	*key_ptr(parent, c->index) = center->keys[0];
3443241b1d3SJoe Thornber 	*key_ptr(parent, r->index) = right->keys[0];
3453241b1d3SJoe Thornber }
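
/*
 * For example (with invented figures), redistributing a total of 20 entries
 * gives target_right = 6 and target_left = 7 (the one spare entry goes to
 * the left), so the three nodes end up holding 7, 7 and 6 entries
 * respectively.
 */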
3463241b1d3SJoe Thornber 
347550929faSMikulas Patocka static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent,
348b0988900SJoe Thornber 			 struct child *l, struct child *c, struct child *r)
349b0988900SJoe Thornber {
350550929faSMikulas Patocka 	struct btree_node *left = l->n;
351550929faSMikulas Patocka 	struct btree_node *center = c->n;
352550929faSMikulas Patocka 	struct btree_node *right = r->n;
353b0988900SJoe Thornber 
354b0988900SJoe Thornber 	uint32_t nr_left = le32_to_cpu(left->header.nr_entries);
355b0988900SJoe Thornber 	uint32_t nr_center = le32_to_cpu(center->header.nr_entries);
356b0988900SJoe Thornber 	uint32_t nr_right = le32_to_cpu(right->header.nr_entries);
357b0988900SJoe Thornber 
358b0988900SJoe Thornber 	unsigned threshold = merge_threshold(left) * 4 + 1;
359b0988900SJoe Thornber 
360b0988900SJoe Thornber 	BUG_ON(left->header.max_entries != center->header.max_entries);
361b0988900SJoe Thornber 	BUG_ON(center->header.max_entries != right->header.max_entries);
362b0988900SJoe Thornber 
363b0988900SJoe Thornber 	if ((nr_left + nr_center + nr_right) < threshold)
364b0988900SJoe Thornber 		delete_center_node(info, parent, l, c, r, left, center, right,
365b0988900SJoe Thornber 				   nr_left, nr_center, nr_right);
366b0988900SJoe Thornber 	else
367b0988900SJoe Thornber 		redistribute3(info, parent, l, c, r, left, center, right,
368b0988900SJoe Thornber 			      nr_left, nr_center, nr_right);
369b0988900SJoe Thornber }
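
/*
 * As a made-up illustration of the threshold above, take max_entries = 64,
 * so merge_threshold() is 21 and the threshold is 4 * 21 + 1 = 85: three
 * children holding 20 entries each (60 in total) have the centre node
 * emptied and deleted, whereas three children holding 30 each (90 in total)
 * are simply redistributed.
 */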
370b0988900SJoe Thornber 
3713241b1d3SJoe Thornber static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info,
372f046f89aSJoe Thornber 		      struct dm_btree_value_type *vt, unsigned left_index)
3733241b1d3SJoe Thornber {
3743241b1d3SJoe Thornber 	int r;
375550929faSMikulas Patocka 	struct btree_node *parent = dm_block_data(shadow_current(s));
3763241b1d3SJoe Thornber 	struct child left, center, right;
3773241b1d3SJoe Thornber 
3783241b1d3SJoe Thornber 	/*
3793241b1d3SJoe Thornber 	 * FIXME: fill out an array?
3803241b1d3SJoe Thornber 	 */
381f046f89aSJoe Thornber 	r = init_child(info, vt, parent, left_index, &left);
3823241b1d3SJoe Thornber 	if (r)
3833241b1d3SJoe Thornber 		return r;
3843241b1d3SJoe Thornber 
385f046f89aSJoe Thornber 	r = init_child(info, vt, parent, left_index + 1, &center);
3863241b1d3SJoe Thornber 	if (r) {
3873241b1d3SJoe Thornber 		exit_child(info, &left);
3883241b1d3SJoe Thornber 		return r;
3893241b1d3SJoe Thornber 	}
3903241b1d3SJoe Thornber 
391f046f89aSJoe Thornber 	r = init_child(info, vt, parent, left_index + 2, &right);
3923241b1d3SJoe Thornber 	if (r) {
3933241b1d3SJoe Thornber 		exit_child(info, &left);
3943241b1d3SJoe Thornber 		exit_child(info, &center);
3953241b1d3SJoe Thornber 		return r;
3963241b1d3SJoe Thornber 	}
3973241b1d3SJoe Thornber 
3983241b1d3SJoe Thornber 	__rebalance3(info, parent, &left, &center, &right);
3993241b1d3SJoe Thornber 
4004c7da06fSMikulas Patocka 	exit_child(info, &left);
4013241b1d3SJoe Thornber 	exit_child(info, &center);
4023241b1d3SJoe Thornber 	exit_child(info, &right);
4033241b1d3SJoe Thornber 
4043241b1d3SJoe Thornber 	return 0;
4053241b1d3SJoe Thornber }
4063241b1d3SJoe Thornber 
4073241b1d3SJoe Thornber static int rebalance_children(struct shadow_spine *s,
408f046f89aSJoe Thornber 			      struct dm_btree_info *info,
409f046f89aSJoe Thornber 			      struct dm_btree_value_type *vt, uint64_t key)
4103241b1d3SJoe Thornber {
4113241b1d3SJoe Thornber 	int i, r, has_left_sibling, has_right_sibling;
412550929faSMikulas Patocka 	struct btree_node *n;
4133241b1d3SJoe Thornber 
4143241b1d3SJoe Thornber 	n = dm_block_data(shadow_current(s));
4153241b1d3SJoe Thornber 
4163241b1d3SJoe Thornber 	if (le32_to_cpu(n->header.nr_entries) == 1) {
4173241b1d3SJoe Thornber 		struct dm_block *child;
4183241b1d3SJoe Thornber 		dm_block_t b = value64(n, 0);
4193241b1d3SJoe Thornber 
4203241b1d3SJoe Thornber 		r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child);
4213241b1d3SJoe Thornber 		if (r)
4223241b1d3SJoe Thornber 			return r;
4233241b1d3SJoe Thornber 
4243241b1d3SJoe Thornber 		memcpy(n, dm_block_data(child),
4253241b1d3SJoe Thornber 		       dm_bm_block_size(dm_tm_get_bm(info->tm)));
4263241b1d3SJoe Thornber 
4273241b1d3SJoe Thornber 		dm_tm_dec(info->tm, dm_block_location(child));
428*1b8d2789SJoe Thornber 		dm_tm_unlock(info->tm, child);
4293241b1d3SJoe Thornber 		return 0;
4303241b1d3SJoe Thornber 	}
4313241b1d3SJoe Thornber 
4323241b1d3SJoe Thornber 	i = lower_bound(n, key);
4333241b1d3SJoe Thornber 	if (i < 0)
4343241b1d3SJoe Thornber 		return -ENODATA;
4353241b1d3SJoe Thornber 
4363241b1d3SJoe Thornber 	has_left_sibling = i > 0;
4373241b1d3SJoe Thornber 	has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1);
4383241b1d3SJoe Thornber 
4393241b1d3SJoe Thornber 	if (!has_left_sibling)
440f046f89aSJoe Thornber 		r = rebalance2(s, info, vt, i);
4413241b1d3SJoe Thornber 
4423241b1d3SJoe Thornber 	else if (!has_right_sibling)
443f046f89aSJoe Thornber 		r = rebalance2(s, info, vt, i - 1);
4443241b1d3SJoe Thornber 
4453241b1d3SJoe Thornber 	else
446f046f89aSJoe Thornber 		r = rebalance3(s, info, vt, i - 1);
4473241b1d3SJoe Thornber 
4483241b1d3SJoe Thornber 	return r;
4493241b1d3SJoe Thornber }
4503241b1d3SJoe Thornber 
451550929faSMikulas Patocka static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index)
4523241b1d3SJoe Thornber {
4533241b1d3SJoe Thornber 	int i = lower_bound(n, key);
4543241b1d3SJoe Thornber 
4553241b1d3SJoe Thornber 	if ((i < 0) ||
4563241b1d3SJoe Thornber 	    (i >= le32_to_cpu(n->header.nr_entries)) ||
4573241b1d3SJoe Thornber 	    (le64_to_cpu(n->keys[i]) != key))
4583241b1d3SJoe Thornber 		return -ENODATA;
4593241b1d3SJoe Thornber 
4603241b1d3SJoe Thornber 	*index = i;
4613241b1d3SJoe Thornber 
4623241b1d3SJoe Thornber 	return 0;
4633241b1d3SJoe Thornber }
4643241b1d3SJoe Thornber 
4653241b1d3SJoe Thornber /*
4663241b1d3SJoe Thornber  * Prepares for removal from one level of the hierarchy.  The caller must
4673241b1d3SJoe Thornber  * call delete_at() to remove the entry at index.
4683241b1d3SJoe Thornber  */
4693241b1d3SJoe Thornber static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info,
4703241b1d3SJoe Thornber 		      struct dm_btree_value_type *vt, dm_block_t root,
4713241b1d3SJoe Thornber 		      uint64_t key, unsigned *index)
4723241b1d3SJoe Thornber {
4733241b1d3SJoe Thornber 	int i = *index, r;
474550929faSMikulas Patocka 	struct btree_node *n;
4753241b1d3SJoe Thornber 
4763241b1d3SJoe Thornber 	for (;;) {
4773241b1d3SJoe Thornber 		r = shadow_step(s, root, vt);
4783241b1d3SJoe Thornber 		if (r < 0)
4793241b1d3SJoe Thornber 			break;
4803241b1d3SJoe Thornber 
4813241b1d3SJoe Thornber 		/*
4823241b1d3SJoe Thornber 		 * We have to patch up the parent node, ugly, but I don't
4833241b1d3SJoe Thornber 		 * see a way to do this automatically as part of the spine
4843241b1d3SJoe Thornber 		 * op.
4853241b1d3SJoe Thornber 		 */
4863241b1d3SJoe Thornber 		if (shadow_has_parent(s)) {
4873241b1d3SJoe Thornber 			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
488a3aefb39SJoe Thornber 			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
4893241b1d3SJoe Thornber 			       &location, sizeof(__le64));
4903241b1d3SJoe Thornber 		}
4913241b1d3SJoe Thornber 
4923241b1d3SJoe Thornber 		n = dm_block_data(shadow_current(s));
4933241b1d3SJoe Thornber 
4943241b1d3SJoe Thornber 		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
4953241b1d3SJoe Thornber 			return do_leaf(n, key, index);
4963241b1d3SJoe Thornber 
497f046f89aSJoe Thornber 		r = rebalance_children(s, info, vt, key);
4983241b1d3SJoe Thornber 		if (r)
4993241b1d3SJoe Thornber 			break;
5003241b1d3SJoe Thornber 
5013241b1d3SJoe Thornber 		n = dm_block_data(shadow_current(s));
5023241b1d3SJoe Thornber 		if (le32_to_cpu(n->header.flags) & LEAF_NODE)
5033241b1d3SJoe Thornber 			return do_leaf(n, key, index);
5043241b1d3SJoe Thornber 
5053241b1d3SJoe Thornber 		i = lower_bound(n, key);
5063241b1d3SJoe Thornber 
5073241b1d3SJoe Thornber 		/*
5083241b1d3SJoe Thornber 		 * We know the key is present, or else
5093241b1d3SJoe Thornber 		 * rebalance_children would have returned
5103241b1d3SJoe Thornber 		 * -ENODATA
5113241b1d3SJoe Thornber 		 */
5123241b1d3SJoe Thornber 		root = value64(n, i);
5133241b1d3SJoe Thornber 	}
5143241b1d3SJoe Thornber 
5153241b1d3SJoe Thornber 	return r;
5163241b1d3SJoe Thornber }
5173241b1d3SJoe Thornber 
5183241b1d3SJoe Thornber int dm_btree_remove(struct dm_btree_info *info, dm_block_t root,
5193241b1d3SJoe Thornber 		    uint64_t *keys, dm_block_t *new_root)
5203241b1d3SJoe Thornber {
5213241b1d3SJoe Thornber 	unsigned level, last_level = info->levels - 1;
5223241b1d3SJoe Thornber 	int index = 0, r = 0;
5233241b1d3SJoe Thornber 	struct shadow_spine spine;
524550929faSMikulas Patocka 	struct btree_node *n;
525b0dc3c8bSJoe Thornber 	struct dm_btree_value_type le64_vt;
5263241b1d3SJoe Thornber 
527b0dc3c8bSJoe Thornber 	init_le64_type(info->tm, &le64_vt);
5283241b1d3SJoe Thornber 	init_shadow_spine(&spine, info);
5293241b1d3SJoe Thornber 	for (level = 0; level < info->levels; level++) {
5303241b1d3SJoe Thornber 		r = remove_raw(&spine, info,
5313241b1d3SJoe Thornber 			       (level == last_level ?
532b0dc3c8bSJoe Thornber 				&info->value_type : &le64_vt),
5333241b1d3SJoe Thornber 			       root, keys[level], (unsigned *)&index);
5343241b1d3SJoe Thornber 		if (r < 0)
5353241b1d3SJoe Thornber 			break;
5363241b1d3SJoe Thornber 
5373241b1d3SJoe Thornber 		n = dm_block_data(shadow_current(&spine));
5383241b1d3SJoe Thornber 		if (level != last_level) {
5393241b1d3SJoe Thornber 			root = value64(n, index);
5403241b1d3SJoe Thornber 			continue;
5413241b1d3SJoe Thornber 		}
5423241b1d3SJoe Thornber 
5433241b1d3SJoe Thornber 		BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries));
5443241b1d3SJoe Thornber 
5453241b1d3SJoe Thornber 		if (info->value_type.dec)
5463241b1d3SJoe Thornber 			info->value_type.dec(info->value_type.context,
547be500ed7SJoe Thornber 					     value_ptr(n, index), 1);
5483241b1d3SJoe Thornber 
5493241b1d3SJoe Thornber 		delete_at(n, index);
5503241b1d3SJoe Thornber 	}
5513241b1d3SJoe Thornber 
552b6e58b54SHou Tao 	if (!r)
5533241b1d3SJoe Thornber 		*new_root = shadow_root(&spine);
5543241b1d3SJoe Thornber 	exit_shadow_spine(&spine);
5553241b1d3SJoe Thornber 
5563241b1d3SJoe Thornber 	return r;
5573241b1d3SJoe Thornber }
5583241b1d3SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_remove);
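
/*
 * A minimal usage sketch (not taken from a real caller; 'info', 'root' and
 * the key value are assumed to have been set up elsewhere, e.g. via
 * dm_btree_create() and dm_btree_insert(), and the btree is assumed to
 * have a single level):
 *
 *	uint64_t keys[1] = { 1234 };	// one key per btree level
 *	dm_block_t new_root;
 *	int r = dm_btree_remove(info, root, keys, &new_root);
 *
 *	if (r == -ENODATA)
 *		;			// the key wasn't present
 *	else if (!r)
 *		root = new_root;	// carry on with the shadowed root
 */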
5594ec331c3SJoe Thornber 
5604ec331c3SJoe Thornber /*----------------------------------------------------------------*/
5614ec331c3SJoe Thornber 
5624ec331c3SJoe Thornber static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info,
5634ec331c3SJoe Thornber 			  struct dm_btree_value_type *vt, dm_block_t root,
5644ec331c3SJoe Thornber 			  uint64_t key, int *index)
5654ec331c3SJoe Thornber {
5664ec331c3SJoe Thornber 	int i = *index, r;
5674ec331c3SJoe Thornber 	struct btree_node *n;
5684ec331c3SJoe Thornber 
5694ec331c3SJoe Thornber 	for (;;) {
5704ec331c3SJoe Thornber 		r = shadow_step(s, root, vt);
5714ec331c3SJoe Thornber 		if (r < 0)
5724ec331c3SJoe Thornber 			break;
5734ec331c3SJoe Thornber 
5744ec331c3SJoe Thornber 		/*
5754ec331c3SJoe Thornber 		 * We have to patch up the parent node, ugly, but I don't
5764ec331c3SJoe Thornber 		 * see a way to do this automatically as part of the spine
5774ec331c3SJoe Thornber 		 * op.
5784ec331c3SJoe Thornber 		 */
5794ec331c3SJoe Thornber 		if (shadow_has_parent(s)) {
5804ec331c3SJoe Thornber 			__le64 location = cpu_to_le64(dm_block_location(shadow_current(s)));
5814ec331c3SJoe Thornber 			memcpy(value_ptr(dm_block_data(shadow_parent(s)), i),
5824ec331c3SJoe Thornber 			       &location, sizeof(__le64));
5834ec331c3SJoe Thornber 		}
5844ec331c3SJoe Thornber 
5854ec331c3SJoe Thornber 		n = dm_block_data(shadow_current(s));
5864ec331c3SJoe Thornber 
5874ec331c3SJoe Thornber 		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
5884ec331c3SJoe Thornber 			*index = lower_bound(n, key);
5894ec331c3SJoe Thornber 			return 0;
5904ec331c3SJoe Thornber 		}
5914ec331c3SJoe Thornber 
5924ec331c3SJoe Thornber 		r = rebalance_children(s, info, vt, key);
5934ec331c3SJoe Thornber 		if (r)
5944ec331c3SJoe Thornber 			break;
5954ec331c3SJoe Thornber 
5964ec331c3SJoe Thornber 		n = dm_block_data(shadow_current(s));
5974ec331c3SJoe Thornber 		if (le32_to_cpu(n->header.flags) & LEAF_NODE) {
5984ec331c3SJoe Thornber 			*index = lower_bound(n, key);
5994ec331c3SJoe Thornber 			return 0;
6004ec331c3SJoe Thornber 		}
6014ec331c3SJoe Thornber 
6024ec331c3SJoe Thornber 		i = lower_bound(n, key);
6034ec331c3SJoe Thornber 
6044ec331c3SJoe Thornber 		/*
6054ec331c3SJoe Thornber 		 * We know the key is present, or else
6064ec331c3SJoe Thornber 		 * rebalance_children would have returned
6074ec331c3SJoe Thornber 		 * -ENODATA
6084ec331c3SJoe Thornber 		 */
6094ec331c3SJoe Thornber 		root = value64(n, i);
6104ec331c3SJoe Thornber 	}
6114ec331c3SJoe Thornber 
6124ec331c3SJoe Thornber 	return r;
6134ec331c3SJoe Thornber }
6144ec331c3SJoe Thornber 
6154ec331c3SJoe Thornber static int remove_one(struct dm_btree_info *info, dm_block_t root,
6164ec331c3SJoe Thornber 		      uint64_t *keys, uint64_t end_key,
6174ec331c3SJoe Thornber 		      dm_block_t *new_root, unsigned *nr_removed)
6184ec331c3SJoe Thornber {
6194ec331c3SJoe Thornber 	unsigned level, last_level = info->levels - 1;
6204ec331c3SJoe Thornber 	int index = 0, r = 0;
6214ec331c3SJoe Thornber 	struct shadow_spine spine;
6224ec331c3SJoe Thornber 	struct btree_node *n;
623b0dc3c8bSJoe Thornber 	struct dm_btree_value_type le64_vt;
6244ec331c3SJoe Thornber 	uint64_t k;
6254ec331c3SJoe Thornber 
626b0dc3c8bSJoe Thornber 	init_le64_type(info->tm, &le64_vt);
6274ec331c3SJoe Thornber 	init_shadow_spine(&spine, info);
6284ec331c3SJoe Thornber 	for (level = 0; level < last_level; level++) {
629b0dc3c8bSJoe Thornber 		r = remove_raw(&spine, info, &le64_vt,
6304ec331c3SJoe Thornber 			       root, keys[level], (unsigned *) &index);
6314ec331c3SJoe Thornber 		if (r < 0)
6324ec331c3SJoe Thornber 			goto out;
6334ec331c3SJoe Thornber 
6344ec331c3SJoe Thornber 		n = dm_block_data(shadow_current(&spine));
6354ec331c3SJoe Thornber 		root = value64(n, index);
6364ec331c3SJoe Thornber 	}
6374ec331c3SJoe Thornber 
6384ec331c3SJoe Thornber 	r = remove_nearest(&spine, info, &info->value_type,
6394ec331c3SJoe Thornber 			   root, keys[last_level], &index);
6404ec331c3SJoe Thornber 	if (r < 0)
6414ec331c3SJoe Thornber 		goto out;
6424ec331c3SJoe Thornber 
6434ec331c3SJoe Thornber 	n = dm_block_data(shadow_current(&spine));
6444ec331c3SJoe Thornber 
6454ec331c3SJoe Thornber 	if (index < 0)
6464ec331c3SJoe Thornber 		index = 0;
6474ec331c3SJoe Thornber 
6484ec331c3SJoe Thornber 	if (index >= le32_to_cpu(n->header.nr_entries)) {
6494ec331c3SJoe Thornber 		r = -ENODATA;
6504ec331c3SJoe Thornber 		goto out;
6514ec331c3SJoe Thornber 	}
6524ec331c3SJoe Thornber 
6534ec331c3SJoe Thornber 	k = le64_to_cpu(n->keys[index]);
6544ec331c3SJoe Thornber 	if (k >= keys[last_level] && k < end_key) {
6554ec331c3SJoe Thornber 		if (info->value_type.dec)
6564ec331c3SJoe Thornber 			info->value_type.dec(info->value_type.context,
657be500ed7SJoe Thornber 					     value_ptr(n, index), 1);
6584ec331c3SJoe Thornber 
6594ec331c3SJoe Thornber 		delete_at(n, index);
660aa0cd28dSJoe Thornber 		keys[last_level] = k + 1ull;
6614ec331c3SJoe Thornber 
6624ec331c3SJoe Thornber 	} else
6634ec331c3SJoe Thornber 		r = -ENODATA;
6644ec331c3SJoe Thornber 
6654ec331c3SJoe Thornber out:
6664ec331c3SJoe Thornber 	*new_root = shadow_root(&spine);
6674ec331c3SJoe Thornber 	exit_shadow_spine(&spine);
6684ec331c3SJoe Thornber 
6694ec331c3SJoe Thornber 	return r;
6704ec331c3SJoe Thornber }
6714ec331c3SJoe Thornber 
6724ec331c3SJoe Thornber int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root,
6734ec331c3SJoe Thornber 			   uint64_t *first_key, uint64_t end_key,
6744ec331c3SJoe Thornber 			   dm_block_t *new_root, unsigned *nr_removed)
6754ec331c3SJoe Thornber {
6764ec331c3SJoe Thornber 	int r;
6774ec331c3SJoe Thornber 
6784ec331c3SJoe Thornber 	*nr_removed = 0;
6794ec331c3SJoe Thornber 	do {
6804ec331c3SJoe Thornber 		r = remove_one(info, root, first_key, end_key, &root, nr_removed);
6814ec331c3SJoe Thornber 		if (!r)
6824ec331c3SJoe Thornber 			(*nr_removed)++;
6834ec331c3SJoe Thornber 	} while (!r);
6844ec331c3SJoe Thornber 
6854ec331c3SJoe Thornber 	*new_root = root;
6864ec331c3SJoe Thornber 	return r == -ENODATA ? 0 : r;
6874ec331c3SJoe Thornber }
6884ec331c3SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_remove_leaves);
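
/*
 * A usage sketch for the range variant (illustrative only; 'info' and
 * 'root' are assumed to describe a single level btree):
 *
 *	uint64_t begin = 100;		// first key in the range to remove
 *	unsigned nr_removed;
 *	r = dm_btree_remove_leaves(info, root, &begin, 200, &root, &nr_removed);
 *
 * This deletes every leaf entry whose key falls in [100, 200), advancing
 * 'begin' as it goes, and reports the number of entries deleted via
 * nr_removed.  A return value of 0 means the whole range was processed;
 * anything else is an error from the lower layers.
 */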
689