13241b1d3SJoe Thornber /* 23241b1d3SJoe Thornber * Copyright (C) 2011 Red Hat, Inc. 33241b1d3SJoe Thornber * 43241b1d3SJoe Thornber * This file is released under the GPL. 53241b1d3SJoe Thornber */ 63241b1d3SJoe Thornber 73241b1d3SJoe Thornber #include "dm-btree.h" 83241b1d3SJoe Thornber #include "dm-btree-internal.h" 93241b1d3SJoe Thornber #include "dm-transaction-manager.h" 103241b1d3SJoe Thornber 111944ce60SPaul Gortmaker #include <linux/export.h> 123241b1d3SJoe Thornber 133241b1d3SJoe Thornber /* 143241b1d3SJoe Thornber * Removing an entry from a btree 153241b1d3SJoe Thornber * ============================== 163241b1d3SJoe Thornber * 173241b1d3SJoe Thornber * A very important constraint for our btree is that no node, except the 183241b1d3SJoe Thornber * root, may have fewer than a certain number of entries. 193241b1d3SJoe Thornber * (MIN_ENTRIES <= nr_entries <= MAX_ENTRIES). 203241b1d3SJoe Thornber * 213241b1d3SJoe Thornber * Ensuring this is complicated by the way we want to only ever hold the 223241b1d3SJoe Thornber * locks on 2 nodes concurrently, and only change nodes in a top to bottom 233241b1d3SJoe Thornber * fashion. 243241b1d3SJoe Thornber * 253241b1d3SJoe Thornber * Each node may have a left or right sibling. When decending the spine, 263241b1d3SJoe Thornber * if a node contains only MIN_ENTRIES then we try and increase this to at 273241b1d3SJoe Thornber * least MIN_ENTRIES + 1. We do this in the following ways: 283241b1d3SJoe Thornber * 293241b1d3SJoe Thornber * [A] No siblings => this can only happen if the node is the root, in which 303241b1d3SJoe Thornber * case we copy the childs contents over the root. 313241b1d3SJoe Thornber * 323241b1d3SJoe Thornber * [B] No left sibling 333241b1d3SJoe Thornber * ==> rebalance(node, right sibling) 343241b1d3SJoe Thornber * 353241b1d3SJoe Thornber * [C] No right sibling 363241b1d3SJoe Thornber * ==> rebalance(left sibling, node) 373241b1d3SJoe Thornber * 383241b1d3SJoe Thornber * [D] Both siblings, total_entries(left, node, right) <= DEL_THRESHOLD 393241b1d3SJoe Thornber * ==> delete node adding it's contents to left and right 403241b1d3SJoe Thornber * 413241b1d3SJoe Thornber * [E] Both siblings, total_entries(left, node, right) > DEL_THRESHOLD 423241b1d3SJoe Thornber * ==> rebalance(left, node, right) 433241b1d3SJoe Thornber * 443241b1d3SJoe Thornber * After these operations it's possible that the our original node no 453241b1d3SJoe Thornber * longer contains the desired sub tree. For this reason this rebalancing 463241b1d3SJoe Thornber * is performed on the children of the current node. This also avoids 473241b1d3SJoe Thornber * having a special case for the root. 483241b1d3SJoe Thornber * 493241b1d3SJoe Thornber * Once this rebalancing has occurred we can then step into the child node 503241b1d3SJoe Thornber * for internal nodes. Or delete the entry for leaf nodes. 513241b1d3SJoe Thornber */ 523241b1d3SJoe Thornber 533241b1d3SJoe Thornber /* 543241b1d3SJoe Thornber * Some little utilities for moving node data around. 553241b1d3SJoe Thornber */ 56550929faSMikulas Patocka static void node_shift(struct btree_node *n, int shift) 573241b1d3SJoe Thornber { 583241b1d3SJoe Thornber uint32_t nr_entries = le32_to_cpu(n->header.nr_entries); 593241b1d3SJoe Thornber uint32_t value_size = le32_to_cpu(n->header.value_size); 603241b1d3SJoe Thornber 613241b1d3SJoe Thornber if (shift < 0) { 623241b1d3SJoe Thornber shift = -shift; 633241b1d3SJoe Thornber BUG_ON(shift > nr_entries); 64a3aefb39SJoe Thornber BUG_ON((void *) key_ptr(n, shift) >= value_ptr(n, shift)); 653241b1d3SJoe Thornber memmove(key_ptr(n, 0), 663241b1d3SJoe Thornber key_ptr(n, shift), 673241b1d3SJoe Thornber (nr_entries - shift) * sizeof(__le64)); 68a3aefb39SJoe Thornber memmove(value_ptr(n, 0), 69a3aefb39SJoe Thornber value_ptr(n, shift), 703241b1d3SJoe Thornber (nr_entries - shift) * value_size); 713241b1d3SJoe Thornber } else { 723241b1d3SJoe Thornber BUG_ON(nr_entries + shift > le32_to_cpu(n->header.max_entries)); 733241b1d3SJoe Thornber memmove(key_ptr(n, shift), 743241b1d3SJoe Thornber key_ptr(n, 0), 753241b1d3SJoe Thornber nr_entries * sizeof(__le64)); 76a3aefb39SJoe Thornber memmove(value_ptr(n, shift), 77a3aefb39SJoe Thornber value_ptr(n, 0), 783241b1d3SJoe Thornber nr_entries * value_size); 793241b1d3SJoe Thornber } 803241b1d3SJoe Thornber } 813241b1d3SJoe Thornber 82550929faSMikulas Patocka static void node_copy(struct btree_node *left, struct btree_node *right, int shift) 833241b1d3SJoe Thornber { 843241b1d3SJoe Thornber uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 853241b1d3SJoe Thornber uint32_t value_size = le32_to_cpu(left->header.value_size); 863241b1d3SJoe Thornber BUG_ON(value_size != le32_to_cpu(right->header.value_size)); 873241b1d3SJoe Thornber 883241b1d3SJoe Thornber if (shift < 0) { 893241b1d3SJoe Thornber shift = -shift; 903241b1d3SJoe Thornber BUG_ON(nr_left + shift > le32_to_cpu(left->header.max_entries)); 913241b1d3SJoe Thornber memcpy(key_ptr(left, nr_left), 923241b1d3SJoe Thornber key_ptr(right, 0), 933241b1d3SJoe Thornber shift * sizeof(__le64)); 94a3aefb39SJoe Thornber memcpy(value_ptr(left, nr_left), 95a3aefb39SJoe Thornber value_ptr(right, 0), 963241b1d3SJoe Thornber shift * value_size); 973241b1d3SJoe Thornber } else { 983241b1d3SJoe Thornber BUG_ON(shift > le32_to_cpu(right->header.max_entries)); 993241b1d3SJoe Thornber memcpy(key_ptr(right, 0), 1003241b1d3SJoe Thornber key_ptr(left, nr_left - shift), 1013241b1d3SJoe Thornber shift * sizeof(__le64)); 102a3aefb39SJoe Thornber memcpy(value_ptr(right, 0), 103a3aefb39SJoe Thornber value_ptr(left, nr_left - shift), 1043241b1d3SJoe Thornber shift * value_size); 1053241b1d3SJoe Thornber } 1063241b1d3SJoe Thornber } 1073241b1d3SJoe Thornber 1083241b1d3SJoe Thornber /* 1093241b1d3SJoe Thornber * Delete a specific entry from a leaf node. 1103241b1d3SJoe Thornber */ 111550929faSMikulas Patocka static void delete_at(struct btree_node *n, unsigned index) 1123241b1d3SJoe Thornber { 1133241b1d3SJoe Thornber unsigned nr_entries = le32_to_cpu(n->header.nr_entries); 1143241b1d3SJoe Thornber unsigned nr_to_copy = nr_entries - (index + 1); 1153241b1d3SJoe Thornber uint32_t value_size = le32_to_cpu(n->header.value_size); 1163241b1d3SJoe Thornber BUG_ON(index >= nr_entries); 1173241b1d3SJoe Thornber 1183241b1d3SJoe Thornber if (nr_to_copy) { 1193241b1d3SJoe Thornber memmove(key_ptr(n, index), 1203241b1d3SJoe Thornber key_ptr(n, index + 1), 1213241b1d3SJoe Thornber nr_to_copy * sizeof(__le64)); 1223241b1d3SJoe Thornber 123a3aefb39SJoe Thornber memmove(value_ptr(n, index), 124a3aefb39SJoe Thornber value_ptr(n, index + 1), 1253241b1d3SJoe Thornber nr_to_copy * value_size); 1263241b1d3SJoe Thornber } 1273241b1d3SJoe Thornber 1283241b1d3SJoe Thornber n->header.nr_entries = cpu_to_le32(nr_entries - 1); 1293241b1d3SJoe Thornber } 1303241b1d3SJoe Thornber 131550929faSMikulas Patocka static unsigned merge_threshold(struct btree_node *n) 1323241b1d3SJoe Thornber { 133b0988900SJoe Thornber return le32_to_cpu(n->header.max_entries) / 3; 1343241b1d3SJoe Thornber } 1353241b1d3SJoe Thornber 1363241b1d3SJoe Thornber struct child { 1373241b1d3SJoe Thornber unsigned index; 1383241b1d3SJoe Thornber struct dm_block *block; 139550929faSMikulas Patocka struct btree_node *n; 1403241b1d3SJoe Thornber }; 1413241b1d3SJoe Thornber 142f046f89aSJoe Thornber static int init_child(struct dm_btree_info *info, struct dm_btree_value_type *vt, 143f046f89aSJoe Thornber struct btree_node *parent, 1443241b1d3SJoe Thornber unsigned index, struct child *result) 1453241b1d3SJoe Thornber { 1463241b1d3SJoe Thornber int r, inc; 1473241b1d3SJoe Thornber dm_block_t root; 1483241b1d3SJoe Thornber 1493241b1d3SJoe Thornber result->index = index; 1503241b1d3SJoe Thornber root = value64(parent, index); 1513241b1d3SJoe Thornber 1523241b1d3SJoe Thornber r = dm_tm_shadow_block(info->tm, root, &btree_node_validator, 1533241b1d3SJoe Thornber &result->block, &inc); 1543241b1d3SJoe Thornber if (r) 1553241b1d3SJoe Thornber return r; 1563241b1d3SJoe Thornber 1573241b1d3SJoe Thornber result->n = dm_block_data(result->block); 1583241b1d3SJoe Thornber 1593241b1d3SJoe Thornber if (inc) 160f046f89aSJoe Thornber inc_children(info->tm, result->n, vt); 1613241b1d3SJoe Thornber 162a3aefb39SJoe Thornber *((__le64 *) value_ptr(parent, index)) = 1633241b1d3SJoe Thornber cpu_to_le64(dm_block_location(result->block)); 1643241b1d3SJoe Thornber 1653241b1d3SJoe Thornber return 0; 1663241b1d3SJoe Thornber } 1673241b1d3SJoe Thornber 1684c7da06fSMikulas Patocka static void exit_child(struct dm_btree_info *info, struct child *c) 1693241b1d3SJoe Thornber { 1704c7da06fSMikulas Patocka dm_tm_unlock(info->tm, c->block); 1713241b1d3SJoe Thornber } 1723241b1d3SJoe Thornber 173550929faSMikulas Patocka static void shift(struct btree_node *left, struct btree_node *right, int count) 1743241b1d3SJoe Thornber { 175b0988900SJoe Thornber uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 176b0988900SJoe Thornber uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 177b0988900SJoe Thornber uint32_t max_entries = le32_to_cpu(left->header.max_entries); 178b0988900SJoe Thornber uint32_t r_max_entries = le32_to_cpu(right->header.max_entries); 179b0988900SJoe Thornber 180b0988900SJoe Thornber BUG_ON(max_entries != r_max_entries); 181b0988900SJoe Thornber BUG_ON(nr_left - count > max_entries); 182b0988900SJoe Thornber BUG_ON(nr_right + count > max_entries); 183b0988900SJoe Thornber 1843241b1d3SJoe Thornber if (!count) 1853241b1d3SJoe Thornber return; 1863241b1d3SJoe Thornber 1873241b1d3SJoe Thornber if (count > 0) { 1883241b1d3SJoe Thornber node_shift(right, count); 1893241b1d3SJoe Thornber node_copy(left, right, count); 1903241b1d3SJoe Thornber } else { 1913241b1d3SJoe Thornber node_copy(left, right, count); 1923241b1d3SJoe Thornber node_shift(right, count); 1933241b1d3SJoe Thornber } 1943241b1d3SJoe Thornber 195b0988900SJoe Thornber left->header.nr_entries = cpu_to_le32(nr_left - count); 196b0988900SJoe Thornber right->header.nr_entries = cpu_to_le32(nr_right + count); 1973241b1d3SJoe Thornber } 1983241b1d3SJoe Thornber 199550929faSMikulas Patocka static void __rebalance2(struct dm_btree_info *info, struct btree_node *parent, 2003241b1d3SJoe Thornber struct child *l, struct child *r) 2013241b1d3SJoe Thornber { 202550929faSMikulas Patocka struct btree_node *left = l->n; 203550929faSMikulas Patocka struct btree_node *right = r->n; 2043241b1d3SJoe Thornber uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 2053241b1d3SJoe Thornber uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 206474e5595SHou Tao /* 207474e5595SHou Tao * Ensure the number of entries in each child will be greater 208474e5595SHou Tao * than or equal to (max_entries / 3 + 1), so no matter which 209474e5595SHou Tao * child is used for removal, the number will still be not 210474e5595SHou Tao * less than (max_entries / 3). 211474e5595SHou Tao */ 212474e5595SHou Tao unsigned int threshold = 2 * (merge_threshold(left) + 1); 2133241b1d3SJoe Thornber 214b0988900SJoe Thornber if (nr_left + nr_right < threshold) { 2153241b1d3SJoe Thornber /* 2163241b1d3SJoe Thornber * Merge 2173241b1d3SJoe Thornber */ 2183241b1d3SJoe Thornber node_copy(left, right, -nr_right); 2193241b1d3SJoe Thornber left->header.nr_entries = cpu_to_le32(nr_left + nr_right); 2203241b1d3SJoe Thornber delete_at(parent, r->index); 2213241b1d3SJoe Thornber 2223241b1d3SJoe Thornber /* 2233241b1d3SJoe Thornber * We need to decrement the right block, but not it's 2243241b1d3SJoe Thornber * children, since they're still referenced by left. 2253241b1d3SJoe Thornber */ 2263241b1d3SJoe Thornber dm_tm_dec(info->tm, dm_block_location(r->block)); 2273241b1d3SJoe Thornber } else { 2283241b1d3SJoe Thornber /* 2293241b1d3SJoe Thornber * Rebalance. 2303241b1d3SJoe Thornber */ 2313241b1d3SJoe Thornber unsigned target_left = (nr_left + nr_right) / 2; 2323241b1d3SJoe Thornber shift(left, right, nr_left - target_left); 2333241b1d3SJoe Thornber *key_ptr(parent, r->index) = right->keys[0]; 2343241b1d3SJoe Thornber } 2353241b1d3SJoe Thornber } 2363241b1d3SJoe Thornber 2373241b1d3SJoe Thornber static int rebalance2(struct shadow_spine *s, struct dm_btree_info *info, 238f046f89aSJoe Thornber struct dm_btree_value_type *vt, unsigned left_index) 2393241b1d3SJoe Thornber { 2403241b1d3SJoe Thornber int r; 241550929faSMikulas Patocka struct btree_node *parent; 2423241b1d3SJoe Thornber struct child left, right; 2433241b1d3SJoe Thornber 2443241b1d3SJoe Thornber parent = dm_block_data(shadow_current(s)); 2453241b1d3SJoe Thornber 246f046f89aSJoe Thornber r = init_child(info, vt, parent, left_index, &left); 2473241b1d3SJoe Thornber if (r) 2483241b1d3SJoe Thornber return r; 2493241b1d3SJoe Thornber 250f046f89aSJoe Thornber r = init_child(info, vt, parent, left_index + 1, &right); 2513241b1d3SJoe Thornber if (r) { 2523241b1d3SJoe Thornber exit_child(info, &left); 2533241b1d3SJoe Thornber return r; 2543241b1d3SJoe Thornber } 2553241b1d3SJoe Thornber 2563241b1d3SJoe Thornber __rebalance2(info, parent, &left, &right); 2573241b1d3SJoe Thornber 2584c7da06fSMikulas Patocka exit_child(info, &left); 2593241b1d3SJoe Thornber exit_child(info, &right); 2603241b1d3SJoe Thornber 2614c7da06fSMikulas Patocka return 0; 2623241b1d3SJoe Thornber } 2633241b1d3SJoe Thornber 2643241b1d3SJoe Thornber /* 265b0988900SJoe Thornber * We dump as many entries from center as possible into left, then the rest 266b0988900SJoe Thornber * in right, then rebalance2. This wastes some cpu, but I want something 267b0988900SJoe Thornber * simple atm. 2683241b1d3SJoe Thornber */ 269550929faSMikulas Patocka static void delete_center_node(struct dm_btree_info *info, struct btree_node *parent, 270b0988900SJoe Thornber struct child *l, struct child *c, struct child *r, 271550929faSMikulas Patocka struct btree_node *left, struct btree_node *center, struct btree_node *right, 272b0988900SJoe Thornber uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) 273b0988900SJoe Thornber { 274b0988900SJoe Thornber uint32_t max_entries = le32_to_cpu(left->header.max_entries); 2753241b1d3SJoe Thornber unsigned shift = min(max_entries - nr_left, nr_center); 2763241b1d3SJoe Thornber 2773241b1d3SJoe Thornber BUG_ON(nr_left + shift > max_entries); 2783241b1d3SJoe Thornber node_copy(left, center, -shift); 2793241b1d3SJoe Thornber left->header.nr_entries = cpu_to_le32(nr_left + shift); 2803241b1d3SJoe Thornber 2813241b1d3SJoe Thornber if (shift != nr_center) { 2823241b1d3SJoe Thornber shift = nr_center - shift; 283b0988900SJoe Thornber BUG_ON((nr_right + shift) > max_entries); 2843241b1d3SJoe Thornber node_shift(right, shift); 2853241b1d3SJoe Thornber node_copy(center, right, shift); 2863241b1d3SJoe Thornber right->header.nr_entries = cpu_to_le32(nr_right + shift); 2873241b1d3SJoe Thornber } 2883241b1d3SJoe Thornber *key_ptr(parent, r->index) = right->keys[0]; 2893241b1d3SJoe Thornber 2903241b1d3SJoe Thornber delete_at(parent, c->index); 2913241b1d3SJoe Thornber r->index--; 2923241b1d3SJoe Thornber 2933241b1d3SJoe Thornber dm_tm_dec(info->tm, dm_block_location(c->block)); 2943241b1d3SJoe Thornber __rebalance2(info, parent, l, r); 2953241b1d3SJoe Thornber } 2963241b1d3SJoe Thornber 2973241b1d3SJoe Thornber /* 298b0988900SJoe Thornber * Redistributes entries among 3 sibling nodes. 2993241b1d3SJoe Thornber */ 300550929faSMikulas Patocka static void redistribute3(struct dm_btree_info *info, struct btree_node *parent, 301b0988900SJoe Thornber struct child *l, struct child *c, struct child *r, 302550929faSMikulas Patocka struct btree_node *left, struct btree_node *center, struct btree_node *right, 303b0988900SJoe Thornber uint32_t nr_left, uint32_t nr_center, uint32_t nr_right) 304b0988900SJoe Thornber { 305b0988900SJoe Thornber int s; 306b0988900SJoe Thornber uint32_t max_entries = le32_to_cpu(left->header.max_entries); 3072871c69eSJoe Thornber unsigned total = nr_left + nr_center + nr_right; 3082871c69eSJoe Thornber unsigned target_right = total / 3; 3092871c69eSJoe Thornber unsigned remainder = (target_right * 3) != total; 3102871c69eSJoe Thornber unsigned target_left = target_right + remainder; 3112871c69eSJoe Thornber 3122871c69eSJoe Thornber BUG_ON(target_left > max_entries); 3132871c69eSJoe Thornber BUG_ON(target_right > max_entries); 3143241b1d3SJoe Thornber 315b0988900SJoe Thornber if (nr_left < nr_right) { 3162871c69eSJoe Thornber s = nr_left - target_left; 3173241b1d3SJoe Thornber 318b0988900SJoe Thornber if (s < 0 && nr_center < -s) { 319b0988900SJoe Thornber /* not enough in central node */ 3204c7e3093SDennis Yang shift(left, center, -nr_center); 3214c7e3093SDennis Yang s += nr_center; 322b0988900SJoe Thornber shift(left, right, s); 323b0988900SJoe Thornber nr_right += s; 324b0988900SJoe Thornber } else 325b0988900SJoe Thornber shift(left, center, s); 326b0988900SJoe Thornber 3272871c69eSJoe Thornber shift(center, right, target_right - nr_right); 328b0988900SJoe Thornber 329b0988900SJoe Thornber } else { 3302871c69eSJoe Thornber s = target_right - nr_right; 331b0988900SJoe Thornber if (s > 0 && nr_center < s) { 332b0988900SJoe Thornber /* not enough in central node */ 333b0988900SJoe Thornber shift(center, right, nr_center); 3344c7e3093SDennis Yang s -= nr_center; 335b0988900SJoe Thornber shift(left, right, s); 336b0988900SJoe Thornber nr_left -= s; 337b0988900SJoe Thornber } else 338b0988900SJoe Thornber shift(center, right, s); 339b0988900SJoe Thornber 3402871c69eSJoe Thornber shift(left, center, nr_left - target_left); 341b0988900SJoe Thornber } 342b0988900SJoe Thornber 3433241b1d3SJoe Thornber *key_ptr(parent, c->index) = center->keys[0]; 3443241b1d3SJoe Thornber *key_ptr(parent, r->index) = right->keys[0]; 3453241b1d3SJoe Thornber } 3463241b1d3SJoe Thornber 347550929faSMikulas Patocka static void __rebalance3(struct dm_btree_info *info, struct btree_node *parent, 348b0988900SJoe Thornber struct child *l, struct child *c, struct child *r) 349b0988900SJoe Thornber { 350550929faSMikulas Patocka struct btree_node *left = l->n; 351550929faSMikulas Patocka struct btree_node *center = c->n; 352550929faSMikulas Patocka struct btree_node *right = r->n; 353b0988900SJoe Thornber 354b0988900SJoe Thornber uint32_t nr_left = le32_to_cpu(left->header.nr_entries); 355b0988900SJoe Thornber uint32_t nr_center = le32_to_cpu(center->header.nr_entries); 356b0988900SJoe Thornber uint32_t nr_right = le32_to_cpu(right->header.nr_entries); 357b0988900SJoe Thornber 358b0988900SJoe Thornber unsigned threshold = merge_threshold(left) * 4 + 1; 359b0988900SJoe Thornber 360b0988900SJoe Thornber BUG_ON(left->header.max_entries != center->header.max_entries); 361b0988900SJoe Thornber BUG_ON(center->header.max_entries != right->header.max_entries); 362b0988900SJoe Thornber 363b0988900SJoe Thornber if ((nr_left + nr_center + nr_right) < threshold) 364b0988900SJoe Thornber delete_center_node(info, parent, l, c, r, left, center, right, 365b0988900SJoe Thornber nr_left, nr_center, nr_right); 366b0988900SJoe Thornber else 367b0988900SJoe Thornber redistribute3(info, parent, l, c, r, left, center, right, 368b0988900SJoe Thornber nr_left, nr_center, nr_right); 369b0988900SJoe Thornber } 370b0988900SJoe Thornber 3713241b1d3SJoe Thornber static int rebalance3(struct shadow_spine *s, struct dm_btree_info *info, 372f046f89aSJoe Thornber struct dm_btree_value_type *vt, unsigned left_index) 3733241b1d3SJoe Thornber { 3743241b1d3SJoe Thornber int r; 375550929faSMikulas Patocka struct btree_node *parent = dm_block_data(shadow_current(s)); 3763241b1d3SJoe Thornber struct child left, center, right; 3773241b1d3SJoe Thornber 3783241b1d3SJoe Thornber /* 3793241b1d3SJoe Thornber * FIXME: fill out an array? 3803241b1d3SJoe Thornber */ 381f046f89aSJoe Thornber r = init_child(info, vt, parent, left_index, &left); 3823241b1d3SJoe Thornber if (r) 3833241b1d3SJoe Thornber return r; 3843241b1d3SJoe Thornber 385f046f89aSJoe Thornber r = init_child(info, vt, parent, left_index + 1, ¢er); 3863241b1d3SJoe Thornber if (r) { 3873241b1d3SJoe Thornber exit_child(info, &left); 3883241b1d3SJoe Thornber return r; 3893241b1d3SJoe Thornber } 3903241b1d3SJoe Thornber 391f046f89aSJoe Thornber r = init_child(info, vt, parent, left_index + 2, &right); 3923241b1d3SJoe Thornber if (r) { 3933241b1d3SJoe Thornber exit_child(info, &left); 3943241b1d3SJoe Thornber exit_child(info, ¢er); 3953241b1d3SJoe Thornber return r; 3963241b1d3SJoe Thornber } 3973241b1d3SJoe Thornber 3983241b1d3SJoe Thornber __rebalance3(info, parent, &left, ¢er, &right); 3993241b1d3SJoe Thornber 4004c7da06fSMikulas Patocka exit_child(info, &left); 4013241b1d3SJoe Thornber exit_child(info, ¢er); 4023241b1d3SJoe Thornber exit_child(info, &right); 4033241b1d3SJoe Thornber 4043241b1d3SJoe Thornber return 0; 4053241b1d3SJoe Thornber } 4063241b1d3SJoe Thornber 4073241b1d3SJoe Thornber static int rebalance_children(struct shadow_spine *s, 408f046f89aSJoe Thornber struct dm_btree_info *info, 409f046f89aSJoe Thornber struct dm_btree_value_type *vt, uint64_t key) 4103241b1d3SJoe Thornber { 4113241b1d3SJoe Thornber int i, r, has_left_sibling, has_right_sibling; 412550929faSMikulas Patocka struct btree_node *n; 4133241b1d3SJoe Thornber 4143241b1d3SJoe Thornber n = dm_block_data(shadow_current(s)); 4153241b1d3SJoe Thornber 4163241b1d3SJoe Thornber if (le32_to_cpu(n->header.nr_entries) == 1) { 4173241b1d3SJoe Thornber struct dm_block *child; 4183241b1d3SJoe Thornber dm_block_t b = value64(n, 0); 4193241b1d3SJoe Thornber 4203241b1d3SJoe Thornber r = dm_tm_read_lock(info->tm, b, &btree_node_validator, &child); 4213241b1d3SJoe Thornber if (r) 4223241b1d3SJoe Thornber return r; 4233241b1d3SJoe Thornber 4243241b1d3SJoe Thornber memcpy(n, dm_block_data(child), 4253241b1d3SJoe Thornber dm_bm_block_size(dm_tm_get_bm(info->tm))); 4263241b1d3SJoe Thornber 4273241b1d3SJoe Thornber dm_tm_dec(info->tm, dm_block_location(child)); 428*1b8d2789SJoe Thornber dm_tm_unlock(info->tm, child); 4293241b1d3SJoe Thornber return 0; 4303241b1d3SJoe Thornber } 4313241b1d3SJoe Thornber 4323241b1d3SJoe Thornber i = lower_bound(n, key); 4333241b1d3SJoe Thornber if (i < 0) 4343241b1d3SJoe Thornber return -ENODATA; 4353241b1d3SJoe Thornber 4363241b1d3SJoe Thornber has_left_sibling = i > 0; 4373241b1d3SJoe Thornber has_right_sibling = i < (le32_to_cpu(n->header.nr_entries) - 1); 4383241b1d3SJoe Thornber 4393241b1d3SJoe Thornber if (!has_left_sibling) 440f046f89aSJoe Thornber r = rebalance2(s, info, vt, i); 4413241b1d3SJoe Thornber 4423241b1d3SJoe Thornber else if (!has_right_sibling) 443f046f89aSJoe Thornber r = rebalance2(s, info, vt, i - 1); 4443241b1d3SJoe Thornber 4453241b1d3SJoe Thornber else 446f046f89aSJoe Thornber r = rebalance3(s, info, vt, i - 1); 4473241b1d3SJoe Thornber 4483241b1d3SJoe Thornber return r; 4493241b1d3SJoe Thornber } 4503241b1d3SJoe Thornber 451550929faSMikulas Patocka static int do_leaf(struct btree_node *n, uint64_t key, unsigned *index) 4523241b1d3SJoe Thornber { 4533241b1d3SJoe Thornber int i = lower_bound(n, key); 4543241b1d3SJoe Thornber 4553241b1d3SJoe Thornber if ((i < 0) || 4563241b1d3SJoe Thornber (i >= le32_to_cpu(n->header.nr_entries)) || 4573241b1d3SJoe Thornber (le64_to_cpu(n->keys[i]) != key)) 4583241b1d3SJoe Thornber return -ENODATA; 4593241b1d3SJoe Thornber 4603241b1d3SJoe Thornber *index = i; 4613241b1d3SJoe Thornber 4623241b1d3SJoe Thornber return 0; 4633241b1d3SJoe Thornber } 4643241b1d3SJoe Thornber 4653241b1d3SJoe Thornber /* 4663241b1d3SJoe Thornber * Prepares for removal from one level of the hierarchy. The caller must 4673241b1d3SJoe Thornber * call delete_at() to remove the entry at index. 4683241b1d3SJoe Thornber */ 4693241b1d3SJoe Thornber static int remove_raw(struct shadow_spine *s, struct dm_btree_info *info, 4703241b1d3SJoe Thornber struct dm_btree_value_type *vt, dm_block_t root, 4713241b1d3SJoe Thornber uint64_t key, unsigned *index) 4723241b1d3SJoe Thornber { 4733241b1d3SJoe Thornber int i = *index, r; 474550929faSMikulas Patocka struct btree_node *n; 4753241b1d3SJoe Thornber 4763241b1d3SJoe Thornber for (;;) { 4773241b1d3SJoe Thornber r = shadow_step(s, root, vt); 4783241b1d3SJoe Thornber if (r < 0) 4793241b1d3SJoe Thornber break; 4803241b1d3SJoe Thornber 4813241b1d3SJoe Thornber /* 4823241b1d3SJoe Thornber * We have to patch up the parent node, ugly, but I don't 4833241b1d3SJoe Thornber * see a way to do this automatically as part of the spine 4843241b1d3SJoe Thornber * op. 4853241b1d3SJoe Thornber */ 4863241b1d3SJoe Thornber if (shadow_has_parent(s)) { 4873241b1d3SJoe Thornber __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 488a3aefb39SJoe Thornber memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), 4893241b1d3SJoe Thornber &location, sizeof(__le64)); 4903241b1d3SJoe Thornber } 4913241b1d3SJoe Thornber 4923241b1d3SJoe Thornber n = dm_block_data(shadow_current(s)); 4933241b1d3SJoe Thornber 4943241b1d3SJoe Thornber if (le32_to_cpu(n->header.flags) & LEAF_NODE) 4953241b1d3SJoe Thornber return do_leaf(n, key, index); 4963241b1d3SJoe Thornber 497f046f89aSJoe Thornber r = rebalance_children(s, info, vt, key); 4983241b1d3SJoe Thornber if (r) 4993241b1d3SJoe Thornber break; 5003241b1d3SJoe Thornber 5013241b1d3SJoe Thornber n = dm_block_data(shadow_current(s)); 5023241b1d3SJoe Thornber if (le32_to_cpu(n->header.flags) & LEAF_NODE) 5033241b1d3SJoe Thornber return do_leaf(n, key, index); 5043241b1d3SJoe Thornber 5053241b1d3SJoe Thornber i = lower_bound(n, key); 5063241b1d3SJoe Thornber 5073241b1d3SJoe Thornber /* 5083241b1d3SJoe Thornber * We know the key is present, or else 5093241b1d3SJoe Thornber * rebalance_children would have returned 5103241b1d3SJoe Thornber * -ENODATA 5113241b1d3SJoe Thornber */ 5123241b1d3SJoe Thornber root = value64(n, i); 5133241b1d3SJoe Thornber } 5143241b1d3SJoe Thornber 5153241b1d3SJoe Thornber return r; 5163241b1d3SJoe Thornber } 5173241b1d3SJoe Thornber 5183241b1d3SJoe Thornber int dm_btree_remove(struct dm_btree_info *info, dm_block_t root, 5193241b1d3SJoe Thornber uint64_t *keys, dm_block_t *new_root) 5203241b1d3SJoe Thornber { 5213241b1d3SJoe Thornber unsigned level, last_level = info->levels - 1; 5223241b1d3SJoe Thornber int index = 0, r = 0; 5233241b1d3SJoe Thornber struct shadow_spine spine; 524550929faSMikulas Patocka struct btree_node *n; 525b0dc3c8bSJoe Thornber struct dm_btree_value_type le64_vt; 5263241b1d3SJoe Thornber 527b0dc3c8bSJoe Thornber init_le64_type(info->tm, &le64_vt); 5283241b1d3SJoe Thornber init_shadow_spine(&spine, info); 5293241b1d3SJoe Thornber for (level = 0; level < info->levels; level++) { 5303241b1d3SJoe Thornber r = remove_raw(&spine, info, 5313241b1d3SJoe Thornber (level == last_level ? 532b0dc3c8bSJoe Thornber &info->value_type : &le64_vt), 5333241b1d3SJoe Thornber root, keys[level], (unsigned *)&index); 5343241b1d3SJoe Thornber if (r < 0) 5353241b1d3SJoe Thornber break; 5363241b1d3SJoe Thornber 5373241b1d3SJoe Thornber n = dm_block_data(shadow_current(&spine)); 5383241b1d3SJoe Thornber if (level != last_level) { 5393241b1d3SJoe Thornber root = value64(n, index); 5403241b1d3SJoe Thornber continue; 5413241b1d3SJoe Thornber } 5423241b1d3SJoe Thornber 5433241b1d3SJoe Thornber BUG_ON(index < 0 || index >= le32_to_cpu(n->header.nr_entries)); 5443241b1d3SJoe Thornber 5453241b1d3SJoe Thornber if (info->value_type.dec) 5463241b1d3SJoe Thornber info->value_type.dec(info->value_type.context, 547be500ed7SJoe Thornber value_ptr(n, index), 1); 5483241b1d3SJoe Thornber 5493241b1d3SJoe Thornber delete_at(n, index); 5503241b1d3SJoe Thornber } 5513241b1d3SJoe Thornber 552b6e58b54SHou Tao if (!r) 5533241b1d3SJoe Thornber *new_root = shadow_root(&spine); 5543241b1d3SJoe Thornber exit_shadow_spine(&spine); 5553241b1d3SJoe Thornber 5563241b1d3SJoe Thornber return r; 5573241b1d3SJoe Thornber } 5583241b1d3SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_remove); 5594ec331c3SJoe Thornber 5604ec331c3SJoe Thornber /*----------------------------------------------------------------*/ 5614ec331c3SJoe Thornber 5624ec331c3SJoe Thornber static int remove_nearest(struct shadow_spine *s, struct dm_btree_info *info, 5634ec331c3SJoe Thornber struct dm_btree_value_type *vt, dm_block_t root, 5644ec331c3SJoe Thornber uint64_t key, int *index) 5654ec331c3SJoe Thornber { 5664ec331c3SJoe Thornber int i = *index, r; 5674ec331c3SJoe Thornber struct btree_node *n; 5684ec331c3SJoe Thornber 5694ec331c3SJoe Thornber for (;;) { 5704ec331c3SJoe Thornber r = shadow_step(s, root, vt); 5714ec331c3SJoe Thornber if (r < 0) 5724ec331c3SJoe Thornber break; 5734ec331c3SJoe Thornber 5744ec331c3SJoe Thornber /* 5754ec331c3SJoe Thornber * We have to patch up the parent node, ugly, but I don't 5764ec331c3SJoe Thornber * see a way to do this automatically as part of the spine 5774ec331c3SJoe Thornber * op. 5784ec331c3SJoe Thornber */ 5794ec331c3SJoe Thornber if (shadow_has_parent(s)) { 5804ec331c3SJoe Thornber __le64 location = cpu_to_le64(dm_block_location(shadow_current(s))); 5814ec331c3SJoe Thornber memcpy(value_ptr(dm_block_data(shadow_parent(s)), i), 5824ec331c3SJoe Thornber &location, sizeof(__le64)); 5834ec331c3SJoe Thornber } 5844ec331c3SJoe Thornber 5854ec331c3SJoe Thornber n = dm_block_data(shadow_current(s)); 5864ec331c3SJoe Thornber 5874ec331c3SJoe Thornber if (le32_to_cpu(n->header.flags) & LEAF_NODE) { 5884ec331c3SJoe Thornber *index = lower_bound(n, key); 5894ec331c3SJoe Thornber return 0; 5904ec331c3SJoe Thornber } 5914ec331c3SJoe Thornber 5924ec331c3SJoe Thornber r = rebalance_children(s, info, vt, key); 5934ec331c3SJoe Thornber if (r) 5944ec331c3SJoe Thornber break; 5954ec331c3SJoe Thornber 5964ec331c3SJoe Thornber n = dm_block_data(shadow_current(s)); 5974ec331c3SJoe Thornber if (le32_to_cpu(n->header.flags) & LEAF_NODE) { 5984ec331c3SJoe Thornber *index = lower_bound(n, key); 5994ec331c3SJoe Thornber return 0; 6004ec331c3SJoe Thornber } 6014ec331c3SJoe Thornber 6024ec331c3SJoe Thornber i = lower_bound(n, key); 6034ec331c3SJoe Thornber 6044ec331c3SJoe Thornber /* 6054ec331c3SJoe Thornber * We know the key is present, or else 6064ec331c3SJoe Thornber * rebalance_children would have returned 6074ec331c3SJoe Thornber * -ENODATA 6084ec331c3SJoe Thornber */ 6094ec331c3SJoe Thornber root = value64(n, i); 6104ec331c3SJoe Thornber } 6114ec331c3SJoe Thornber 6124ec331c3SJoe Thornber return r; 6134ec331c3SJoe Thornber } 6144ec331c3SJoe Thornber 6154ec331c3SJoe Thornber static int remove_one(struct dm_btree_info *info, dm_block_t root, 6164ec331c3SJoe Thornber uint64_t *keys, uint64_t end_key, 6174ec331c3SJoe Thornber dm_block_t *new_root, unsigned *nr_removed) 6184ec331c3SJoe Thornber { 6194ec331c3SJoe Thornber unsigned level, last_level = info->levels - 1; 6204ec331c3SJoe Thornber int index = 0, r = 0; 6214ec331c3SJoe Thornber struct shadow_spine spine; 6224ec331c3SJoe Thornber struct btree_node *n; 623b0dc3c8bSJoe Thornber struct dm_btree_value_type le64_vt; 6244ec331c3SJoe Thornber uint64_t k; 6254ec331c3SJoe Thornber 626b0dc3c8bSJoe Thornber init_le64_type(info->tm, &le64_vt); 6274ec331c3SJoe Thornber init_shadow_spine(&spine, info); 6284ec331c3SJoe Thornber for (level = 0; level < last_level; level++) { 629b0dc3c8bSJoe Thornber r = remove_raw(&spine, info, &le64_vt, 6304ec331c3SJoe Thornber root, keys[level], (unsigned *) &index); 6314ec331c3SJoe Thornber if (r < 0) 6324ec331c3SJoe Thornber goto out; 6334ec331c3SJoe Thornber 6344ec331c3SJoe Thornber n = dm_block_data(shadow_current(&spine)); 6354ec331c3SJoe Thornber root = value64(n, index); 6364ec331c3SJoe Thornber } 6374ec331c3SJoe Thornber 6384ec331c3SJoe Thornber r = remove_nearest(&spine, info, &info->value_type, 6394ec331c3SJoe Thornber root, keys[last_level], &index); 6404ec331c3SJoe Thornber if (r < 0) 6414ec331c3SJoe Thornber goto out; 6424ec331c3SJoe Thornber 6434ec331c3SJoe Thornber n = dm_block_data(shadow_current(&spine)); 6444ec331c3SJoe Thornber 6454ec331c3SJoe Thornber if (index < 0) 6464ec331c3SJoe Thornber index = 0; 6474ec331c3SJoe Thornber 6484ec331c3SJoe Thornber if (index >= le32_to_cpu(n->header.nr_entries)) { 6494ec331c3SJoe Thornber r = -ENODATA; 6504ec331c3SJoe Thornber goto out; 6514ec331c3SJoe Thornber } 6524ec331c3SJoe Thornber 6534ec331c3SJoe Thornber k = le64_to_cpu(n->keys[index]); 6544ec331c3SJoe Thornber if (k >= keys[last_level] && k < end_key) { 6554ec331c3SJoe Thornber if (info->value_type.dec) 6564ec331c3SJoe Thornber info->value_type.dec(info->value_type.context, 657be500ed7SJoe Thornber value_ptr(n, index), 1); 6584ec331c3SJoe Thornber 6594ec331c3SJoe Thornber delete_at(n, index); 660aa0cd28dSJoe Thornber keys[last_level] = k + 1ull; 6614ec331c3SJoe Thornber 6624ec331c3SJoe Thornber } else 6634ec331c3SJoe Thornber r = -ENODATA; 6644ec331c3SJoe Thornber 6654ec331c3SJoe Thornber out: 6664ec331c3SJoe Thornber *new_root = shadow_root(&spine); 6674ec331c3SJoe Thornber exit_shadow_spine(&spine); 6684ec331c3SJoe Thornber 6694ec331c3SJoe Thornber return r; 6704ec331c3SJoe Thornber } 6714ec331c3SJoe Thornber 6724ec331c3SJoe Thornber int dm_btree_remove_leaves(struct dm_btree_info *info, dm_block_t root, 6734ec331c3SJoe Thornber uint64_t *first_key, uint64_t end_key, 6744ec331c3SJoe Thornber dm_block_t *new_root, unsigned *nr_removed) 6754ec331c3SJoe Thornber { 6764ec331c3SJoe Thornber int r; 6774ec331c3SJoe Thornber 6784ec331c3SJoe Thornber *nr_removed = 0; 6794ec331c3SJoe Thornber do { 6804ec331c3SJoe Thornber r = remove_one(info, root, first_key, end_key, &root, nr_removed); 6814ec331c3SJoe Thornber if (!r) 6824ec331c3SJoe Thornber (*nr_removed)++; 6834ec331c3SJoe Thornber } while (!r); 6844ec331c3SJoe Thornber 6854ec331c3SJoe Thornber *new_root = root; 6864ec331c3SJoe Thornber return r == -ENODATA ? 0 : r; 6874ec331c3SJoe Thornber } 6884ec331c3SJoe Thornber EXPORT_SYMBOL_GPL(dm_btree_remove_leaves); 689