/*
 * Copyright 2000 by Hans Reiser, licensing governed by reiserfs/README
 */

#include <linux/time.h>
#include <linux/slab.h>
#include <linux/string.h>
#include "reiserfs.h"
#include <linux/buffer_head.h>

/*
 * To make any changes in the tree we find a node that contains the item
 * to be changed/deleted, or the position in the node a new item is to be
 * inserted into. We call this node S. To do balancing we need to decide
 * what we will shift to the left/right neighbor, or to a new node, where
 * the new item will be, etc. To make this analysis simpler we build a
 * virtual node. The virtual node is an array of items that will replace
 * the items of node S. (For instance, if we are going to delete an item,
 * the virtual node does not contain it.) The virtual node keeps
 * information about item sizes and types, mergeability of the first and
 * last items, and sizes of all entries in a directory item. We use this
 * array of items when calculating what we can shift to neighbors and how
 * many nodes we have to have if we do no shifting, if we shift to the
 * left/right neighbor, or to both.
 */

/*
 * Takes item number in virtual node, returns number of item
 * that it has in source buffer
 */
static inline int old_item_num(int new_num, int affected_item_num, int mode)
{
	if (mode == M_PASTE || mode == M_CUT || new_num < affected_item_num)
		return new_num;

	if (mode == M_INSERT) {

		RFALSE(new_num == 0,
		       "vs-8005: for INSERT mode and item number of inserted item");

		return new_num - 1;
	}

	RFALSE(mode != M_DELETE,
	       "vs-8010: old_item_num: mode must be M_DELETE (mode = '%c')",
	       mode);
	/* delete mode */
	return new_num + 1;
}
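/*
 * Illustrative mapping (example numbers, not from the original source):
 * with mode == M_INSERT and affected_item_num == 2, virtual items 0 and 1
 * map to source items 0 and 1, virtual item 2 is the new item (it has no
 * source), and virtual item 3 maps to source item 2 (new_num - 1).  For
 * M_DELETE with affected_item_num == 2, virtual item 2 maps to source
 * item 3 (new_num + 1), since source item 2 is being removed.
 */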
static void create_virtual_node(struct tree_balance *tb, int h)
{
	struct item_head *ih;
	struct virtual_node *vn = tb->tb_vn;
	int new_num;
	struct buffer_head *Sh;	/* this comes from tb->S[h] */

	Sh = PATH_H_PBUFFER(tb->tb_path, h);

	/* size of changed node */
	vn->vn_size =
	    MAX_CHILD_SIZE(Sh) - B_FREE_SPACE(Sh) + tb->insert_size[h];

	/* for internal nodes the array of virtual items is not created */
	if (h) {
		vn->vn_nr_item = (vn->vn_size - DC_SIZE) / (DC_SIZE + KEY_SIZE);
		return;
	}

	/* number of items in virtual node */
	vn->vn_nr_item =
	    B_NR_ITEMS(Sh) + ((vn->vn_mode == M_INSERT) ? 1 : 0) -
	    ((vn->vn_mode == M_DELETE) ? 1 : 0);

	/* first virtual item */
	vn->vn_vi = (struct virtual_item *)(tb->tb_vn + 1);
	memset(vn->vn_vi, 0, vn->vn_nr_item * sizeof(struct virtual_item));
	vn->vn_free_ptr += vn->vn_nr_item * sizeof(struct virtual_item);

	/* first item in the node */
	ih = item_head(Sh, 0);

	/* define the mergeability for 0-th item (if it is not being deleted) */
	if (op_is_left_mergeable(&ih->ih_key, Sh->b_size)
	    && (vn->vn_mode != M_DELETE || vn->vn_affected_item_num))
		vn->vn_vi[0].vi_type |= VI_TYPE_LEFT_MERGEABLE;

	/*
	 * go through all items that remain in the virtual
	 * node (except for the new (inserted) one)
	 */
	for (new_num = 0; new_num < vn->vn_nr_item; new_num++) {
		int j;
		struct virtual_item *vi = vn->vn_vi + new_num;
		int is_affected =
		    ((new_num != vn->vn_affected_item_num) ? 0 : 1);

		if (is_affected && vn->vn_mode == M_INSERT)
			continue;

		/* get item number in source node */
		j = old_item_num(new_num, vn->vn_affected_item_num,
				 vn->vn_mode);

		vi->vi_item_len += ih_item_len(ih + j) + IH_SIZE;
		vi->vi_ih = ih + j;
		vi->vi_item = ih_item_body(Sh, ih + j);
		vi->vi_uarea = vn->vn_free_ptr;

		/*
		 * FIXME: there is no check that item operation did not
		 * consume too much memory
		 */
		vn->vn_free_ptr +=
		    op_create_vi(vn, vi, is_affected, tb->insert_size[0]);
		if (tb->vn_buf + tb->vn_buf_size < vn->vn_free_ptr)
			reiserfs_panic(tb->tb_sb, "vs-8030",
				       "virtual node space consumed");

		if (!is_affected)
			/* this is not being changed */
			continue;

		if (vn->vn_mode == M_PASTE || vn->vn_mode == M_CUT) {
			vn->vn_vi[new_num].vi_item_len += tb->insert_size[0];
			/* pointer to data which is going to be pasted */
			vi->vi_new_data = vn->vn_data;
		}
	}

	/* virtual inserted item is not defined yet */
	if (vn->vn_mode == M_INSERT) {
		struct virtual_item *vi = vn->vn_vi + vn->vn_affected_item_num;

		RFALSE(vn->vn_ins_ih == NULL,
		       "vs-8040: item header of inserted item is not specified");
		vi->vi_item_len = tb->insert_size[0];
		vi->vi_ih = vn->vn_ins_ih;
		vi->vi_item = vn->vn_data;
		vi->vi_uarea = vn->vn_free_ptr;

		op_create_vi(vn, vi, 0 /*not pasted or cut */ ,
			     tb->insert_size[0]);
	}

	/*
	 * set right merge flag: we take the right delimiting key and
	 * check whether it is a mergeable item
	 */
	if (tb->CFR[0]) {
		struct reiserfs_key *key;

		key = internal_key(tb->CFR[0], tb->rkey[0]);
		if (op_is_left_mergeable(key, Sh->b_size)
		    && (vn->vn_mode != M_DELETE
			|| vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1))
			vn->vn_vi[vn->vn_nr_item - 1].vi_type |=
			    VI_TYPE_RIGHT_MERGEABLE;

#ifdef CONFIG_REISERFS_CHECK
		if (op_is_left_mergeable(key, Sh->b_size) &&
		    !(vn->vn_mode != M_DELETE
		      || vn->vn_affected_item_num != B_NR_ITEMS(Sh) - 1)) {
			/*
			 * we delete the last item and it could be merged
			 * with the right neighbor's first item
			 */
			if (!
			    (B_NR_ITEMS(Sh) == 1
			     && is_direntry_le_ih(item_head(Sh, 0))
			     && ih_entry_count(item_head(Sh, 0)) == 1)) {
				/*
				 * node contains more than 1 item, or item
				 * is not a directory item, or this item
				 * contains more than 1 entry
				 */
				print_block(Sh, 0, -1, -1);
				reiserfs_panic(tb->tb_sb, "vs-8045",
					       "rdkey %k, affected item==%d "
					       "(mode==%c) Must be %c",
					       key, vn->vn_affected_item_num,
					       vn->vn_mode, M_DELETE);
			}
		}
#endif

	}
}
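/*
 * A sketch of the vn_buf layout built above, assuming a leaf (h == 0):
 * the virtual_node header sits at the start of tb->vn_buf, the array of
 * vn_nr_item virtual_item structs follows immediately (vn_vi), and
 * vn_free_ptr advances past the array so op_create_vi() can append
 * per-item data (e.g. directory entry sizes) behind it.  The vs-8030
 * panic above fires if that append runs past tb->vn_buf + tb->vn_buf_size.
 */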
/*
 * Using the virtual node, check how many items can be
 * shifted to the left neighbor
 */
static void check_left(struct tree_balance *tb, int h, int cur_free)
{
	int i;
	struct virtual_node *vn = tb->tb_vn;
	struct virtual_item *vi;
	int d_size, ih_size;

	RFALSE(cur_free < 0, "vs-8050: cur_free (%d) < 0", cur_free);

	/* internal level */
	if (h > 0) {
		tb->lnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
		return;
	}

	/* leaf level */

	if (!cur_free || !vn->vn_nr_item) {
		/* no free space or nothing to move */
		tb->lnum[h] = 0;
		tb->lbytes = -1;
		return;
	}

	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
	       "vs-8055: parent does not exist or invalid");

	vi = vn->vn_vi;
	if ((unsigned int)cur_free >=
	    (vn->vn_size -
	     ((vi->vi_type & VI_TYPE_LEFT_MERGEABLE) ? IH_SIZE : 0))) {
		/* all contents of S[0] fit into L[0] */

		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
		       "vs-8055: invalid mode or balance condition failed");

		tb->lnum[0] = vn->vn_nr_item;
		tb->lbytes = -1;
		return;
	}

	d_size = 0, ih_size = IH_SIZE;

	/* first item may be merged with the last item in the left neighbor */
	if (vi->vi_type & VI_TYPE_LEFT_MERGEABLE)
		d_size = -((int)IH_SIZE), ih_size = 0;

	tb->lnum[0] = 0;
	for (i = 0; i < vn->vn_nr_item;
	     i++, ih_size = IH_SIZE, d_size = 0, vi++) {
		d_size += vi->vi_item_len;
		if (cur_free >= d_size) {
			/* the item can be shifted entirely */
			cur_free -= d_size;
			tb->lnum[0]++;
			continue;
		}

		/* the item cannot be shifted entirely, try to split it */
		/*
		 * check whether L[0] can hold ih and at least one byte
		 * of the item body
		 */

		/* cannot shift even a part of the current item */
		if (cur_free <= ih_size) {
			tb->lbytes = -1;
			return;
		}
		cur_free -= ih_size;

		tb->lbytes = op_check_left(vi, cur_free, 0, 0);
		if (tb->lbytes != -1)
			/* count partially shifted item */
			tb->lnum[0]++;

		break;
	}

	return;
}
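/*
 * Worked example for check_left() (illustrative numbers only): with
 * cur_free = 100 and virtual items of length 60 and 80, the first item
 * fits entirely (cur_free becomes 40, lnum[0] = 1); the second does not,
 * so after reserving IH_SIZE for its header, op_check_left() is asked
 * how many of the remaining units fit, and its answer becomes
 * tb->lbytes (with lnum[0] also counting the partially shifted item).
 */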
/*
 * Using the virtual node, check how many items can be
 * shifted to the right neighbor
 */
static void check_right(struct tree_balance *tb, int h, int cur_free)
{
	int i;
	struct virtual_node *vn = tb->tb_vn;
	struct virtual_item *vi;
	int d_size, ih_size;

	RFALSE(cur_free < 0, "vs-8070: cur_free < 0");

	/* internal level */
	if (h > 0) {
		tb->rnum[h] = cur_free / (DC_SIZE + KEY_SIZE);
		return;
	}

	/* leaf level */

	if (!cur_free || !vn->vn_nr_item) {
		/* no free space */
		tb->rnum[h] = 0;
		tb->rbytes = -1;
		return;
	}

	RFALSE(!PATH_H_PPARENT(tb->tb_path, 0),
	       "vs-8075: parent does not exist or invalid");

	vi = vn->vn_vi + vn->vn_nr_item - 1;
	if ((unsigned int)cur_free >=
	    (vn->vn_size -
	     ((vi->vi_type & VI_TYPE_RIGHT_MERGEABLE) ? IH_SIZE : 0))) {
		/* all contents of S[0] fit into R[0] */

		RFALSE(vn->vn_mode == M_INSERT || vn->vn_mode == M_PASTE,
		       "vs-8080: invalid mode or balance condition failed");

		tb->rnum[h] = vn->vn_nr_item;
		tb->rbytes = -1;
		return;
	}

	d_size = 0, ih_size = IH_SIZE;

	/* last item may be merged with the first item in the right neighbor */
	if (vi->vi_type & VI_TYPE_RIGHT_MERGEABLE)
		d_size = -(int)IH_SIZE, ih_size = 0;

	tb->rnum[0] = 0;
	for (i = vn->vn_nr_item - 1; i >= 0;
	     i--, d_size = 0, ih_size = IH_SIZE, vi--) {
		d_size += vi->vi_item_len;
		if (cur_free >= d_size) {
			/* the item can be shifted entirely */
			cur_free -= d_size;
			tb->rnum[0]++;
			continue;
		}

		/*
		 * check whether R[0] can hold ih and at least one
		 * byte of the item body
		 */

		/* cannot shift even a part of the current item */
		if (cur_free <= ih_size) {
			tb->rbytes = -1;
			return;
		}

		/*
		 * R[0] can hold the header of the item and at least
		 * one byte of its body
		 */
		cur_free -= ih_size;	/* cur_free is still > 0 */

		tb->rbytes = op_check_right(vi, cur_free);
		if (tb->rbytes != -1)
			/* count partially shifted item */
			tb->rnum[0]++;

		break;
	}

	return;
}
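/*
 * Note on the lbytes/rbytes convention used by check_left() and
 * check_right() above: -1 means "no partial item is shifted"; any other
 * value counts item units that cross the boundary - bytes for direct
 * and indirect items, directory entries for directory items (that is
 * what the item-specific op_check_* methods report).
 */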
/*
 * from       - number of items, which are shifted to left neighbor entirely
 * to         - number of items, which are shifted to right neighbor entirely
 * from_bytes - number of bytes of boundary item (or directory entries)
 *              which are shifted to left neighbor
 * to_bytes   - number of bytes of boundary item (or directory entries)
 *              which are shifted to right neighbor
 */
static int get_num_ver(int mode, struct tree_balance *tb, int h,
		       int from, int from_bytes,
		       int to, int to_bytes, short *snum012, int flow)
{
	int i;
	int cur_free;
	int units;
	struct virtual_node *vn = tb->tb_vn;
	int total_node_size, max_node_size, current_item_size;
	int needed_nodes;

	/* position of item we start filling node from */
	int start_item;

	/* position of item we finish filling node by */
	int end_item;

	/*
	 * number of first bytes (entries for directory) of start_item-th item
	 * we do not include into node that is being filled
	 */
	int start_bytes;

	/*
	 * number of last bytes (entries for directory) of end_item-th item
	 * we do not include into node that is being filled
	 */
	int end_bytes;

	/*
	 * these are positions in virtual item of items, that are split
	 * between S[0] and S1new and S1new and S2new
	 */
	int split_item_positions[2];

	split_item_positions[0] = -1;
	split_item_positions[1] = -1;

	/*
	 * We only create additional nodes if we are in insert or paste mode
	 * or we are in replace mode at the internal level.  If h is 0 and
	 * the mode is M_REPLACE then in fix_nodes we change the mode to
	 * paste or insert before we get here in the code.
	 */
	RFALSE(tb->insert_size[h] < 0 || (mode != M_INSERT && mode != M_PASTE),
	       "vs-8100: insert_size < 0 in overflow");

	max_node_size = MAX_CHILD_SIZE(PATH_H_PBUFFER(tb->tb_path, h));

	/*
	 * snum012 [0-2] - number of items that go to S[0], the first new
	 * node and the second new node
	 */
	snum012[3] = -1;	/* s1bytes */
	snum012[4] = -1;	/* s2bytes */

	/* internal level */
	if (h > 0) {
		i = ((to - from) * (KEY_SIZE + DC_SIZE) + DC_SIZE);
		if (i == max_node_size)
			return 1;
		return (i / max_node_size + 1);
	}

	/* leaf level */
	needed_nodes = 1;
	total_node_size = 0;
	cur_free = max_node_size;

	/* start from 'from'-th item */
	start_item = from;
	/* skip its first 'start_bytes' units */
	start_bytes = ((from_bytes != -1) ? from_bytes : 0);

	/* last included item is the 'end_item'-th one */
	end_item = vn->vn_nr_item - to - 1;
	/* do not count last 'end_bytes' units of 'end_item'-th item */
	end_bytes = (to_bytes != -1) ? to_bytes : 0;

	/*
	 * go through all items beginning from the start_item-th item
	 * and ending by the end_item-th item.  Do not count the first
	 * 'start_bytes' units of the 'start_item'-th item and the last
	 * 'end_bytes' of the 'end_item'-th item
	 */
	for (i = start_item; i <= end_item; i++) {
		struct virtual_item *vi = vn->vn_vi + i;
		int skip_from_end = ((i == end_item) ? end_bytes : 0);

		RFALSE(needed_nodes > 3, "vs-8105: too many nodes are needed");

		/* get size of current item */
		current_item_size = vi->vi_item_len;

		/*
		 * do not take into account the head part (from_bytes)
		 * of the from-th item
		 */
		current_item_size -=
		    op_part_size(vi, 0 /*from start */ , start_bytes);

		/* do not take into account the tail part of the last item */
		current_item_size -=
		    op_part_size(vi, 1 /*from end */ , skip_from_end);

		/* if item fits into current node entirely */
		if (total_node_size + current_item_size <= max_node_size) {
			snum012[needed_nodes - 1]++;
			total_node_size += current_item_size;
			start_bytes = 0;
			continue;
		}

		/*
		 * virtual item length is longer than the max size of an
		 * item in a node.  This is impossible for a direct item
		 */
		if (current_item_size > max_node_size) {
			RFALSE(is_direct_le_ih(vi->vi_ih),
			       "vs-8110: "
			       "direct item length is %d. It can not be longer than %d",
			       current_item_size, max_node_size);
			/* we will try to split it */
			flow = 1;
		}

		/* as we do not split items, take new node and continue */
		if (!flow) {
			needed_nodes++;
			i--;
			total_node_size = 0;
			continue;
		}

		/*
		 * calculate number of item units which fit into node being
		 * filled
		 */
		{
			int free_space;

			free_space = max_node_size - total_node_size - IH_SIZE;
			units =
			    op_check_left(vi, free_space, start_bytes,
					  skip_from_end);
			/*
			 * nothing fits into current node, take new
			 * node and continue
			 */
			if (units == -1) {
				needed_nodes++, i--, total_node_size = 0;
				continue;
			}
		}

		/* something fits into the current node */
		start_bytes += units;
		snum012[needed_nodes - 1 + 3] = units;

		if (needed_nodes > 2)
			reiserfs_warning(tb->tb_sb, "vs-8111",
					 "split_item_position is out of range");
		snum012[needed_nodes - 1]++;
		split_item_positions[needed_nodes - 1] = i;
		needed_nodes++;
		/* continue from the same item with start_bytes != -1 */
		start_item = i;
		i--;
		total_node_size = 0;
	}

	/*
	 * snum012[4] (if it is not -1) contains the number of units which
	 * are to be in S1new, snum012[3] - the number to be in S0.  They
	 * are supposed to be S1bytes and S2bytes correspondingly, so
	 * recalculate
	 */
	if (snum012[4] > 0) {
		int split_item_num;
		int bytes_to_r, bytes_to_l;
		int bytes_to_S1new;

		split_item_num = split_item_positions[1];
		bytes_to_l =
		    ((from == split_item_num
		      && from_bytes != -1) ? from_bytes : 0);
		bytes_to_r =
		    ((end_item == split_item_num
		      && end_bytes != -1) ? end_bytes : 0);
		bytes_to_S1new =
		    ((split_item_positions[0] ==
		      split_item_positions[1]) ? snum012[3] : 0);

		/* s2bytes */
		snum012[4] =
		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[4] -
		    bytes_to_r - bytes_to_l - bytes_to_S1new;

		if (vn->vn_vi[split_item_num].vi_index != TYPE_DIRENTRY &&
		    vn->vn_vi[split_item_num].vi_index != TYPE_INDIRECT)
			reiserfs_warning(tb->tb_sb, "vs-8115",
					 "not directory or indirect item");
	}

	/* now we know S2bytes, calculate S1bytes */
	if (snum012[3] > 0) {
		int split_item_num;
		int bytes_to_r, bytes_to_l;
		int bytes_to_S2new;

		split_item_num = split_item_positions[0];
		bytes_to_l =
		    ((from == split_item_num
		      && from_bytes != -1) ? from_bytes : 0);
		bytes_to_r =
		    ((end_item == split_item_num
		      && end_bytes != -1) ? end_bytes : 0);
		bytes_to_S2new =
		    ((split_item_positions[0] == split_item_positions[1]
		      && snum012[4] != -1) ? snum012[4] : 0);

		/* s1bytes */
		snum012[3] =
		    op_unit_num(&vn->vn_vi[split_item_num]) - snum012[3] -
		    bytes_to_r - bytes_to_l - bytes_to_S2new;
	}

	return needed_nodes;
}
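/*
 * Layout sketch of the snum012 array as used by ip_check_balance() below:
 * it holds eight 5-element groups, one per shifting variant (see the
 * NOTHING/LEFT/RIGHT/LR_SHIFT_{NO_FLOW,FLOW} offsets defined there, which
 * is why the array is short snum012[40]).  Within each group, elements
 * [0..2] are the item counts for S[0] and the two possible new nodes,
 * [3] is s1bytes and [4] is s2bytes, -1 meaning "no split item".
 */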
/*
 * Set parameters for balancing.
 * Writes the results of the balancing analysis into structure tb, where
 * they will later be used by the functions that actually do the balancing.
 * Parameters:
 *	tb	tree_balance structure;
 *	h	current level of the node;
 *	lnum	number of items from S[h] that must be shifted to L[h];
 *	rnum	number of items from S[h] that must be shifted to R[h];
 *	blk_num	number of blocks that S[h] will be split into;
 *	s012	number of items that fall into the split nodes.
 *	lbytes	number of bytes which flow to the left neighbor from the
 *		item that is not shifted entirely
 *	rbytes	number of bytes which flow to the right neighbor from the
 *		item that is not shifted entirely
 *	s1bytes	number of bytes which flow to the first new node when
 *		S[0] splits (this number is contained in the s012 array)
 */

static void set_parameters(struct tree_balance *tb, int h, int lnum,
			   int rnum, int blk_num, short *s012, int lb, int rb)
{

	tb->lnum[h] = lnum;
	tb->rnum[h] = rnum;
	tb->blknum[h] = blk_num;

	/* only for leaf level */
	if (h == 0) {
		if (s012 != NULL) {
			tb->s0num = *s012++;
			tb->snum[0] = *s012++;
			tb->snum[1] = *s012++;
			tb->sbytes[0] = *s012++;
			tb->sbytes[1] = *s012;
		}
		tb->lbytes = lb;
		tb->rbytes = rb;
	}
	PROC_INFO_ADD(tb->tb_sb, lnum[h], lnum);
	PROC_INFO_ADD(tb->tb_sb, rnum[h], rnum);

	PROC_INFO_ADD(tb->tb_sb, lbytes[h], lb);
	PROC_INFO_ADD(tb->tb_sb, rbytes[h], rb);
}
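/*
 * Typical calls, as used throughout this file: the no-op result is
 * set_parameters(tb, h, 0, 0, 1, NULL, -1, -1) - shift nothing and keep
 * S[h] as a single node - while, e.g., set_parameters(tb, 0, -1, 0, 0,
 * NULL, -1, -1) records "merge S[0] entirely into its left neighbor".
 */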
/*
 * check if node disappears if we shift tb->lnum[0] items to left
 * neighbor and tb->rnum[0] to the right one.
 */
static int is_leaf_removable(struct tree_balance *tb)
{
	struct virtual_node *vn = tb->tb_vn;
	int to_left, to_right;
	int size;
	int remain_items;

	/*
	 * number of items that will be shifted to left (right) neighbor
	 * entirely
	 */
	to_left = tb->lnum[0] - ((tb->lbytes != -1) ? 1 : 0);
	to_right = tb->rnum[0] - ((tb->rbytes != -1) ? 1 : 0);
	remain_items = vn->vn_nr_item;

	/* how many items remain in S[0] after shifting to neighbors */
	remain_items -= (to_left + to_right);

	/* all content of node can be shifted to neighbors */
	if (remain_items < 1) {
		set_parameters(tb, 0, to_left, vn->vn_nr_item - to_left, 0,
			       NULL, -1, -1);
		return 1;
	}

	/* S[0] is not removable */
	if (remain_items > 1 || tb->lbytes == -1 || tb->rbytes == -1)
		return 0;

	/* check whether we can divide 1 remaining item between neighbors */

	/* get size of remaining item (in item units) */
	size = op_unit_num(&vn->vn_vi[to_left]);

	if (tb->lbytes + tb->rbytes >= size) {
		set_parameters(tb, 0, to_left + 1, to_right + 1, 0, NULL,
			       tb->lbytes, -1);
		return 1;
	}

	return 0;
}

/* check whether L, S, R can be joined in one node */
static int are_leaves_removable(struct tree_balance *tb, int lfree, int rfree)
{
	struct virtual_node *vn = tb->tb_vn;
	int ih_size;
	struct buffer_head *S0;

	S0 = PATH_H_PBUFFER(tb->tb_path, 0);

	ih_size = 0;
	if (vn->vn_nr_item) {
		if (vn->vn_vi[0].vi_type & VI_TYPE_LEFT_MERGEABLE)
			ih_size += IH_SIZE;

		if (vn->vn_vi[vn->vn_nr_item - 1].vi_type &
		    VI_TYPE_RIGHT_MERGEABLE)
			ih_size += IH_SIZE;
	} else {
		/* there was only one item and it will be deleted */
		struct item_head *ih;

		RFALSE(B_NR_ITEMS(S0) != 1,
		       "vs-8125: item number must be 1: it is %d",
		       B_NR_ITEMS(S0));

		ih = item_head(S0, 0);
		if (tb->CFR[0]
		    && !comp_short_le_keys(&ih->ih_key,
					   internal_key(tb->CFR[0],
							tb->rkey[0])))
			/*
			 * The directory must be in a correct state here:
			 * that is, somewhere to the left the first
			 * directory item must exist.  But the item being
			 * deleted can not be that first one, because its
			 * right neighbor is an item of the same directory
			 * (and the first item always gets deleted last).
			 * So the neighbors of the deleted item can be
			 * merged, and we can save ih_size
			 */
			if (is_direntry_le_ih(ih)) {
				ih_size = IH_SIZE;

				/*
				 * we might check that left neighbor exists
				 * and is of the same directory
				 */
				RFALSE(le_ih_k_offset(ih) == DOT_OFFSET,
				       "vs-8130: first directory item can not be removed while the directory is not empty");
			}

	}

	if (MAX_CHILD_SIZE(S0) + vn->vn_size <= rfree + lfree + ih_size) {
		set_parameters(tb, 0, -1, -1, -1, NULL, -1, -1);
		PROC_INFO_INC(tb->tb_sb, leaves_removable);
		return 1;
	}
	return 0;

}
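/*
 * Illustrative walk-through of is_leaf_removable() (made-up numbers): if
 * vn_nr_item == 5, lnum[0] == 3 with lbytes != -1 and rnum[0] == 3 with
 * rbytes != -1, then to_left == 2 and to_right == 2, one item remains,
 * and S[0] is removable only if lbytes + rbytes can cover all units of
 * that boundary item (the op_unit_num() check above).
 */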
/* when we do not split item, lnum and rnum are numbers of entire items */
#define SET_PAR_SHIFT_LEFT \
if (h)\
{\
	int to_l;\
	\
	to_l = (MAX_NR_KEY(Sh)+1 - lpar + vn->vn_nr_item + 1) / 2 -\
	       (MAX_NR_KEY(Sh) + 1 - lpar);\
	\
	set_parameters (tb, h, to_l, 0, lnver, NULL, -1, -1);\
}\
else \
{\
	if (lset==LEFT_SHIFT_FLOW)\
		set_parameters (tb, h, lpar, 0, lnver, snum012+lset,\
				tb->lbytes, -1);\
	else\
		set_parameters (tb, h, lpar - (tb->lbytes!=-1), 0, lnver, snum012+lset,\
				-1, -1);\
}

#define SET_PAR_SHIFT_RIGHT \
if (h)\
{\
	int to_r;\
	\
	to_r = (MAX_NR_KEY(Sh)+1 - rpar + vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 - rpar);\
	\
	set_parameters (tb, h, 0, to_r, rnver, NULL, -1, -1);\
}\
else \
{\
	if (rset==RIGHT_SHIFT_FLOW)\
		set_parameters (tb, h, 0, rpar, rnver, snum012+rset,\
				-1, tb->rbytes);\
	else\
		set_parameters (tb, h, 0, rpar - (tb->rbytes!=-1), rnver, snum012+rset,\
				-1, -1);\
}

static void free_buffers_in_tb(struct tree_balance *tb)
{
	int i;

	pathrelse(tb->tb_path);

	for (i = 0; i < MAX_HEIGHT; i++) {
		brelse(tb->L[i]);
		brelse(tb->R[i]);
		brelse(tb->FL[i]);
		brelse(tb->FR[i]);
		brelse(tb->CFL[i]);
		brelse(tb->CFR[i]);

		tb->L[i] = NULL;
		tb->R[i] = NULL;
		tb->FL[i] = NULL;
		tb->FR[i] = NULL;
		tb->CFL[i] = NULL;
		tb->CFR[i] = NULL;
	}
}

/*
 * Get new buffers for storing new nodes that are created while balancing.
 * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
 *	        CARRY_ON - schedule didn't occur while the function worked;
 *	        NO_DISK_SPACE - no disk space.
 */
/* The function is NOT SCHEDULE-SAFE! */
static int get_empty_nodes(struct tree_balance *tb, int h)
{
	struct buffer_head *new_bh, *Sh = PATH_H_PBUFFER(tb->tb_path, h);
	b_blocknr_t *blocknr, blocknrs[MAX_AMOUNT_NEEDED] = { 0, };
	int counter, number_of_freeblk;
	int amount_needed;	/* number of needed empty blocks */
	int retval = CARRY_ON;
	struct super_block *sb = tb->tb_sb;

	/*
	 * number_of_freeblk is the number of empty blocks which have been
	 * acquired for use by the balancing algorithm minus the number of
	 * empty blocks used in the previous levels of the analysis.
	 * number_of_freeblk = tb->cur_blknum can be non-zero if a schedule
	 * occurs after empty blocks are acquired, and the balancing
	 * analysis is then restarted.  amount_needed is the number needed
	 * by this level (h) of the balancing analysis.
	 *
	 * Note that for systems with many processes writing, it would be
	 * more layout optimal to calculate the total number needed by all
	 * levels and then to run reiserfs_new_blocks to get all of them at
	 * once.
	 */

	/*
	 * Initialize number_of_freeblk to the amount acquired prior to the
	 * restart of the analysis or 0 if not restarted, then subtract the
	 * amount needed by all of the levels of the tree below h.
	 */
	/* blknum includes S[h], so we subtract 1 in this calculation */
	for (counter = 0, number_of_freeblk = tb->cur_blknum;
	     counter < h; counter++)
		number_of_freeblk -=
		    (tb->blknum[counter]) ? (tb->blknum[counter] - 1) : 0;

	/* Allocate missing empty blocks. */
	/* if Sh == 0 then we are getting a new root */
	amount_needed = (Sh) ? (tb->blknum[h] - 1) : 1;
	/*
	 * Amount_needed = the amount that we need more than the
	 * amount that we have.
	 */
	if (amount_needed > number_of_freeblk)
		amount_needed -= number_of_freeblk;
	else	/* If we have enough already then there is nothing to do. */
		return CARRY_ON;

	/*
	 * No need to check quota - quota is not charged for blocks used
	 * for formatted nodes
	 */
	if (reiserfs_new_form_blocknrs(tb, blocknrs,
				       amount_needed) == NO_DISK_SPACE)
		return NO_DISK_SPACE;

	/* for each blocknumber we just got, get a buffer and stick it on FEB */
	for (blocknr = blocknrs, counter = 0;
	     counter < amount_needed; blocknr++, counter++) {

		RFALSE(!*blocknr,
		       "PAP-8135: reiserfs_new_blocknrs failed when got new blocks");

		new_bh = sb_getblk(sb, *blocknr);
		RFALSE(buffer_dirty(new_bh) ||
		       buffer_journaled(new_bh) ||
		       buffer_journal_dirty(new_bh),
		       "PAP-8140: journaled or dirty buffer %b for the new block",
		       new_bh);

		/* Put empty buffers into the array. */
		RFALSE(tb->FEB[tb->cur_blknum],
		       "PAP-8141: busy slot for new buffer");

		set_buffer_journal_new(new_bh);
		tb->FEB[tb->cur_blknum++] = new_bh;
	}

	if (retval == CARRY_ON && FILESYSTEM_CHANGED_TB(tb))
		retval = REPEAT_SEARCH;

	return retval;
}
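/*
 * Example of the accounting in get_empty_nodes() (illustrative numbers):
 * if tb->blknum[0] == 3 for an existing leaf, this level needs
 * blknum[0] - 1 == 2 fresh blocks (S[h] itself is reused); if a previous
 * pass already stockpiled one block in tb->cur_blknum, only one more is
 * requested from reiserfs_new_form_blocknrs().
 */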
/*
 * Get free space of the left neighbor, which is stored in the parent
 * node of the left neighbor.
 */
static int get_lfree(struct tree_balance *tb, int h)
{
	struct buffer_head *l, *f;
	int order;

	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
	    (l = tb->FL[h]) == NULL)
		return 0;

	if (f == l)
		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) - 1;
	else {
		order = B_NR_ITEMS(l);
		f = l;
	}

	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));
}

/*
 * Get free space of the right neighbor,
 * which is stored in the parent node of the right neighbor.
 */
static int get_rfree(struct tree_balance *tb, int h)
{
	struct buffer_head *r, *f;
	int order;

	if ((f = PATH_H_PPARENT(tb->tb_path, h)) == NULL ||
	    (r = tb->FR[h]) == NULL)
		return 0;

	if (f == r)
		order = PATH_H_B_ITEM_ORDER(tb->tb_path, h) + 1;
	else {
		order = 0;
		f = r;
	}

	return (MAX_CHILD_SIZE(f) - dc_size(B_N_CHILD(f, order)));

}
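/*
 * Both helpers above derive a neighbor's free space without touching the
 * neighbor itself: the parent's disk_child entry records the child's used
 * size, so free space is MAX_CHILD_SIZE(f) minus dc_size() of the
 * corresponding child pointer in the (possibly far) parent f.
 */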
/* Check whether left neighbor is in memory. */
static int is_left_neighbor_in_cache(struct tree_balance *tb, int h)
{
	struct buffer_head *father, *left;
	struct super_block *sb = tb->tb_sb;
	b_blocknr_t left_neighbor_blocknr;
	int left_neighbor_position;

	/* Father of the left neighbor does not exist. */
	if (!tb->FL[h])
		return 0;

	/* Calculate father of the node to be balanced. */
	father = PATH_H_PBUFFER(tb->tb_path, h + 1);

	RFALSE(!father ||
	       !B_IS_IN_TREE(father) ||
	       !B_IS_IN_TREE(tb->FL[h]) ||
	       !buffer_uptodate(father) ||
	       !buffer_uptodate(tb->FL[h]),
	       "vs-8165: F[h] (%b) or FL[h] (%b) is invalid",
	       father, tb->FL[h]);

	/*
	 * Get position of the pointer to the left neighbor
	 * into the left father.
	 */
	left_neighbor_position = (father == tb->FL[h]) ?
	    tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
	/* Get left neighbor block number. */
	left_neighbor_blocknr =
	    B_N_CHILD_NUM(tb->FL[h], left_neighbor_position);
	/* Look for the left neighbor in the cache. */
	if ((left = sb_find_get_block(sb, left_neighbor_blocknr))) {

		RFALSE(buffer_uptodate(left) && !B_IS_IN_TREE(left),
		       "vs-8170: left neighbor (%b %z) is not in the tree",
		       left, left);
		put_bh(left);
		return 1;
	}

	return 0;
}

#define LEFT_PARENTS  'l'
#define RIGHT_PARENTS 'r'

static void decrement_key(struct cpu_key *key)
{
	/* call item specific function for this key */
	item_ops[cpu_key_k_type(key)]->decrement_key(key);
}

/*
 * Calculate far left/right parent of the left/right neighbor of the
 * current node, that is calculate the left/right (FL[h]/FR[h]) neighbor
 * of the parent F[h].
 * Calculate left/right common parent of the current node and L[h]/R[h].
 * Calculate left/right delimiting key position.
 * Returns:	REPEAT_SEARCH - path in the tree is not correct
 *		SCHEDULE_OCCURRED - schedule occurred while the function worked
 *		CARRY_ON - schedule didn't occur while the function
 *			   worked
 */
static int get_far_parent(struct tree_balance *tb,
			  int h,
			  struct buffer_head **pfather,
			  struct buffer_head **pcom_father, char c_lr_par)
{
	struct buffer_head *parent;
	INITIALIZE_PATH(s_path_to_neighbor_father);
	struct treepath *path = tb->tb_path;
	struct cpu_key s_lr_father_key;
	int counter,
	    position = INT_MAX,
	    first_last_position = 0,
	    path_offset = PATH_H_PATH_OFFSET(path, h);

	/*
	 * Starting from F[h] go upwards in the tree, and look for the common
	 * ancestor of F[h], and its neighbor l/r, that should be obtained.
	 */

	counter = path_offset;

	RFALSE(counter < FIRST_PATH_ELEMENT_OFFSET,
	       "PAP-8180: invalid path length");

	for (; counter > FIRST_PATH_ELEMENT_OFFSET; counter--) {
		/*
		 * Check whether parent of the current buffer in the path
		 * is really parent in the tree.
		 */
		if (!B_IS_IN_TREE
		    (parent = PATH_OFFSET_PBUFFER(path, counter - 1)))
			return REPEAT_SEARCH;

		/* Check whether position in the parent is correct. */
		if ((position =
		     PATH_OFFSET_POSITION(path,
					  counter - 1)) >
		    B_NR_ITEMS(parent))
			return REPEAT_SEARCH;

		/*
		 * Check whether parent at the path really points
		 * to the child.
		 */
		if (B_N_CHILD_NUM(parent, position) !=
		    PATH_OFFSET_PBUFFER(path, counter)->b_blocknr)
			return REPEAT_SEARCH;

		/*
		 * Return delimiting key if position in the parent is not
		 * equal to first/last one.
		 */
		if (c_lr_par == RIGHT_PARENTS)
			first_last_position = B_NR_ITEMS(parent);
		if (position != first_last_position) {
			*pcom_father = parent;
			get_bh(*pcom_father);
			/*(*pcom_father = parent)->b_count++; */
			break;
		}
	}

	/* if we are in the root of the tree, then there is no common father */
	if (counter == FIRST_PATH_ELEMENT_OFFSET) {
		/*
		 * Check whether first buffer in the path is the
		 * root of the tree.
		 */
		if (PATH_OFFSET_PBUFFER
		    (tb->tb_path,
		     FIRST_PATH_ELEMENT_OFFSET)->b_blocknr ==
		    SB_ROOT_BLOCK(tb->tb_sb)) {
			*pfather = *pcom_father = NULL;
			return CARRY_ON;
		}
		return REPEAT_SEARCH;
	}

	RFALSE(B_LEVEL(*pcom_father) <= DISK_LEAF_NODE_LEVEL,
	       "PAP-8185: (%b %z) level too small",
	       *pcom_father, *pcom_father);

	/* Check whether the common parent is locked. */

	if (buffer_locked(*pcom_father)) {

		/* Release the write lock while the buffer is busy */
		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
		__wait_on_buffer(*pcom_father);
		reiserfs_write_lock_nested(tb->tb_sb, depth);
		if (FILESYSTEM_CHANGED_TB(tb)) {
			brelse(*pcom_father);
			return REPEAT_SEARCH;
		}
	}

	/*
	 * So, we got the common parent of the current node and its
	 * left/right neighbor.  Now we are getting the parent of the
	 * left/right neighbor.
	 */

	/* Form key to get parent of the left/right neighbor. */
	le_key2cpu_key(&s_lr_father_key,
		       internal_key(*pcom_father,
				    (c_lr_par ==
				     LEFT_PARENTS) ? (tb->lkey[h - 1] =
						      position -
						      1) : (tb->rkey[h -
								     1] =
							    position)));

	if (c_lr_par == LEFT_PARENTS)
		decrement_key(&s_lr_father_key);

	if (search_by_key
	    (tb->tb_sb, &s_lr_father_key, &s_path_to_neighbor_father,
	     h + 1) == IO_ERROR)
		/* path is released */
		return IO_ERROR;

	if (FILESYSTEM_CHANGED_TB(tb)) {
		pathrelse(&s_path_to_neighbor_father);
		brelse(*pcom_father);
		return REPEAT_SEARCH;
	}

	*pfather = PATH_PLAST_BUFFER(&s_path_to_neighbor_father);

	RFALSE(B_LEVEL(*pfather) != h + 1,
	       "PAP-8190: (%b %z) level too small", *pfather, *pfather);
	RFALSE(s_path_to_neighbor_father.path_length <
	       FIRST_PATH_ELEMENT_OFFSET, "PAP-8192: path length is too small");

	s_path_to_neighbor_father.path_length--;
	pathrelse(&s_path_to_neighbor_father);
	return CARRY_ON;
}
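/*
 * A hypothetical path shape for get_far_parent(): if S[h] is the leftmost
 * child of F[h], the left neighbor L[h] hangs under some other parent, so
 * the loop above walks up until it finds a position that is not the first
 * in its node, takes that ancestor as the common parent CFL[h], and then
 * searches back down with the decremented delimiting key to find FL[h]
 * one level above h.
 */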
/*
 * Get parents of neighbors of node in the path (S[path_offset]) and
 * common parents of S[path_offset] and L[path_offset]/R[path_offset]:
 * F[path_offset], FL[path_offset], FR[path_offset], CFL[path_offset],
 * CFR[path_offset].
 * Calculate numbers of left and right delimiting keys position:
 * lkey[path_offset], rkey[path_offset].
 * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked
 *		CARRY_ON - schedule didn't occur while the function worked
 */
static int get_parents(struct tree_balance *tb, int h)
{
	struct treepath *path = tb->tb_path;
	int position,
	    ret,
	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);
	struct buffer_head *curf, *curcf;

	/* Current node is the root of the tree or will be root of the tree */
	if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {
		/*
		 * The root can not have parents.
		 * Release nodes which previously were obtained as
		 * parents of the current node neighbors.
		 */
		brelse(tb->FL[h]);
		brelse(tb->CFL[h]);
		brelse(tb->FR[h]);
		brelse(tb->CFR[h]);
		tb->FL[h] = NULL;
		tb->CFL[h] = NULL;
		tb->FR[h] = NULL;
		tb->CFR[h] = NULL;
		return CARRY_ON;
	}

	/* Get parent FL[path_offset] of L[path_offset]. */
	position = PATH_OFFSET_POSITION(path, path_offset - 1);
	if (position) {
		/* Current node is not the first child of its parent. */
		curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
		curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
		get_bh(curf);
		get_bh(curf);
		tb->lkey[h] = position - 1;
	} else {
		/*
		 * Calculate current parent of L[path_offset], which is the
		 * left neighbor of the current node.  Calculate current
		 * common parent of L[path_offset] and the current node.
		 * Note that CFL[path_offset] is not equal to FL[path_offset]
		 * and CFL[path_offset] is not equal to F[path_offset].
		 * Calculate lkey[path_offset].
		 */
		if ((ret = get_far_parent(tb, h + 1, &curf,
					  &curcf,
					  LEFT_PARENTS)) != CARRY_ON)
			return ret;
	}

	brelse(tb->FL[h]);
	tb->FL[h] = curf;	/* New initialization of FL[h]. */
	brelse(tb->CFL[h]);
	tb->CFL[h] = curcf;	/* New initialization of CFL[h]. */

	RFALSE((curf && !B_IS_IN_TREE(curf)) ||
	       (curcf && !B_IS_IN_TREE(curcf)),
	       "PAP-8195: FL (%b) or CFL (%b) is invalid", curf, curcf);

	/* Get parent FR[h] of R[h]. */

	/* Current node is the last child of F[h].  FR[h] != F[h]. */
	if (position == B_NR_ITEMS(PATH_H_PBUFFER(path, h + 1))) {
		/*
		 * Calculate current parent of R[h], which is the right
		 * neighbor of F[h].  Calculate current common parent of
		 * R[h] and current node.  Note that CFR[h] is not equal
		 * to FR[path_offset] and CFR[h] is not equal to F[h].
		 */
		if ((ret =
		     get_far_parent(tb, h + 1, &curf, &curcf,
				    RIGHT_PARENTS)) != CARRY_ON)
			return ret;
	} else {
		/* Current node is not the last child of its parent F[h]. */
		curf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
		curcf = PATH_OFFSET_PBUFFER(path, path_offset - 1);
		get_bh(curf);
		get_bh(curf);
		tb->rkey[h] = position;
	}

	brelse(tb->FR[h]);
	/* New initialization of FR[path_offset]. */
	tb->FR[h] = curf;

	brelse(tb->CFR[h]);
	/* New initialization of CFR[path_offset]. */
	tb->CFR[h] = curcf;

	RFALSE((curf && !B_IS_IN_TREE(curf)) ||
	       (curcf && !B_IS_IN_TREE(curcf)),
	       "PAP-8205: FR (%b) or CFR (%b) is invalid", curf, curcf);

	return CARRY_ON;
}
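/*
 * Note on the duplicated get_bh(curf) calls above (an observation, not a
 * behavior change): in both branches curf and curcf alias the same parent
 * buffer, so two references are taken - one is later dropped via
 * brelse() on FL[h]/FR[h] and the other on CFL[h]/CFR[h].
 */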
/*
 * it is possible to remove a node as a result of shifting to
 * neighbors even when we insert or paste an item.
 */
static inline int can_node_be_removed(int mode, int lfree, int sfree, int rfree,
				      struct tree_balance *tb, int h)
{
	struct buffer_head *Sh = PATH_H_PBUFFER(tb->tb_path, h);
	int levbytes = tb->insert_size[h];
	struct item_head *ih;
	struct reiserfs_key *r_key = NULL;

	ih = item_head(Sh, 0);
	if (tb->CFR[h])
		r_key = internal_key(tb->CFR[h], tb->rkey[h]);

	if (lfree + rfree + sfree < MAX_CHILD_SIZE(Sh) + levbytes
	    /* shifting may merge items which might save space */
	    -
	    ((!h
	      && op_is_left_mergeable(&ih->ih_key, Sh->b_size)) ? IH_SIZE : 0)
	    -
	    ((!h && r_key
	      && op_is_left_mergeable(r_key, Sh->b_size)) ? IH_SIZE : 0)
	    + ((h) ? KEY_SIZE : 0)) {
		/* node can not be removed */
		if (sfree >= levbytes) {
			/* new item fits into node S[h] without any shifting */
			if (!h)
				tb->s0num =
				    B_NR_ITEMS(Sh) +
				    ((mode == M_INSERT) ? 1 : 0);
			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
			return NO_BALANCING_NEEDED;
		}
	}
	PROC_INFO_INC(tb->tb_sb, can_node_be_removed[h]);
	return !NO_BALANCING_NEEDED;
}
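/*
 * Reading of the inequality in can_node_be_removed(), with made-up
 * numbers: on a leaf, if L, S and R together offer lfree + rfree +
 * sfree = 3000 free bytes while S would carry MAX_CHILD_SIZE(Sh) +
 * levbytes - (merge savings) = 3200 bytes, the three nodes cannot be
 * joined, so only the cheap "does the change simply fit in S?" check is
 * attempted before falling through to full balancing analysis.
 */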
/*
 * Check whether current node S[h] is balanced when increasing its size by
 * Inserting or Pasting.
 * Calculate parameters for balancing for current level h.
 * Parameters:
 *	tb	tree_balance structure;
 *	h	current level of the node;
 *	inum	item number in S[h];
 *	mode	i - insert, p - paste;
 * Returns:	1 - schedule occurred;
 *	        0 - balancing for higher levels needed;
 *	       -1 - no balancing for higher levels needed;
 *	       -2 - no disk space.
 */
/* ip means Inserting or Pasting */
static int ip_check_balance(struct tree_balance *tb, int h)
{
	struct virtual_node *vn = tb->tb_vn;
	/*
	 * Number of bytes that must be inserted into (value is negative
	 * if bytes are deleted) buffer which contains node being balanced.
	 * The mnemonic is that the attempted change in node space used
	 * is levbytes bytes.
	 */
	int levbytes;
	int ret;

	int lfree, sfree, rfree /* free space in L, S and R */ ;

	/*
	 * nver is short for number of vertices, and lnver is the number if
	 * we shift to the left, rnver is the number if we shift to the
	 * right, and lrnver is the number if we shift in both directions.
	 * The goal is to minimize first the number of vertices, and second,
	 * the number of vertices whose contents are changed by shifting,
	 * and third the number of uncached vertices whose contents are
	 * changed by shifting and must be read from disk.
	 */
	int nver, lnver, rnver, lrnver;

	/*
	 * used at leaf level only, S0 = S[0] is the node being balanced,
	 * sInum [ I = 0,1,2 ] is the number of items that will
	 * remain in node SI after balancing.  S1 and S2 are new
	 * nodes that might be created.
	 */

	/*
	 * we perform 8 calls to get_num_ver().  For each call we
	 * calculate five parameters, where the 4th parameter is s1bytes
	 * and the 5th is s2bytes
	 *
	 * s0num, s1num, s2num for 8 cases
	 * 0,1 - do not shift and do not shift but bottle
	 * 2   - shift only whole item to left
	 * 3   - shift to left and bottle as much as possible
	 * 4,5 - shift to right (whole items and as much as possible)
	 * 6,7 - shift to both directions (whole items and as much as possible)
	 */
	short snum012[40] = { 0, };

	/* Sh is the node whose balance is currently being checked */
	struct buffer_head *Sh;

	Sh = PATH_H_PBUFFER(tb->tb_path, h);
	levbytes = tb->insert_size[h];

	/* Calculate balance parameters for creating new root. */
	if (!Sh) {
		if (!h)
			reiserfs_panic(tb->tb_sb, "vs-8210",
				       "S[0] can not be 0");
		switch (ret = get_empty_nodes(tb, h)) {
		/* no balancing for higher levels needed */
		case CARRY_ON:
			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
			return NO_BALANCING_NEEDED;

		case NO_DISK_SPACE:
		case REPEAT_SEARCH:
			return ret;
		default:
			reiserfs_panic(tb->tb_sb, "vs-8215", "incorrect "
				       "return value of get_empty_nodes");
		}
	}

	/* get parents of S[h] neighbors. */
	ret = get_parents(tb, h);
	if (ret != CARRY_ON)
		return ret;

	sfree = B_FREE_SPACE(Sh);

	/* get free space of neighbors */
	rfree = get_rfree(tb, h);
	lfree = get_lfree(tb, h);

	/* and new item fits into node S[h] without any shifting */
	if (can_node_be_removed(vn->vn_mode, lfree, sfree, rfree, tb, h) ==
	    NO_BALANCING_NEEDED)
		return NO_BALANCING_NEEDED;

	create_virtual_node(tb, h);
	/*
	 * determine maximal number of items we can shift to the left
	 * neighbor (in tb structure) and the maximal number of bytes
	 * that can flow to the left neighbor from the left most liquid
	 * item that cannot be shifted from S[0] entirely (returned value)
	 */
	check_left(tb, h, lfree);

	/*
	 * determine maximal number of items we can shift to the right
	 * neighbor (in tb structure) and the maximal number of bytes
	 * that can flow to the right neighbor from the right most liquid
	 * item that cannot be shifted from S[0] entirely (returned value)
	 */
	check_right(tb, h, rfree);

	/*
	 * all contents of internal node S[h] can be moved into its
	 * neighbors, S[h] will be removed after balancing
	 */
	if (h && (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1)) {
		int to_r;

		/*
		 * Since we are working on internal nodes, and our internal
		 * nodes have fixed size entries, then we can balance by the
		 * number of items rather than the space they consume.  In
		 * this routine we set the left node equal to the right node,
		 * allowing a difference of less than or equal to 1 child
		 * pointer.
		 */
		to_r =
		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
						tb->rnum[h]);
		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
			       -1, -1);
		return CARRY_ON;
	}

	/*
	 * this checks the balance condition, that any two neighboring nodes
	 * can not fit in one node
	 */
	RFALSE(h &&
	       (tb->lnum[h] >= vn->vn_nr_item + 1 ||
		tb->rnum[h] >= vn->vn_nr_item + 1),
	       "vs-8220: tree is not balanced on internal level");
	RFALSE(!h && ((tb->lnum[h] >= vn->vn_nr_item && (tb->lbytes == -1)) ||
		      (tb->rnum[h] >= vn->vn_nr_item && (tb->rbytes == -1))),
	       "vs-8225: tree is not balanced on leaf level");

	/*
	 * all contents of S[0] can be moved into its neighbors,
	 * S[0] will be removed after balancing.
	 */
	if (!h && is_leaf_removable(tb))
		return CARRY_ON;
	/*
	 * why do we perform this check here rather than earlier??
	 * Answer: we can win 1 node in some cases above.  Moreover we
	 * checked it above, when we checked that S[0] is not removable
	 * in principle
	 */

	/* new item fits into node S[h] without any shifting */
	if (sfree >= levbytes) {
		if (!h)
			tb->s0num = vn->vn_nr_item;
		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
		return NO_BALANCING_NEEDED;
	}

	{
		int lpar, rpar, nset, lset, rset, lrset;
		/* regular overflowing of the node */

		/*
		 * get_num_ver works in 2 modes (FLOW & NO_FLOW)
		 * lpar, rpar - number of items we can shift to left/right
		 *              neighbor (including splitting item)
		 * nset, lset, rset, lrset - shows whether flowing items
		 *                           give better packing
		 */
#define FLOW 1
#define NO_FLOW 0		/* do no splitting */

		/* we choose one of the following */
#define NOTHING_SHIFT_NO_FLOW	0
#define NOTHING_SHIFT_FLOW	5
#define LEFT_SHIFT_NO_FLOW	10
#define LEFT_SHIFT_FLOW		15
#define RIGHT_SHIFT_NO_FLOW	20
#define RIGHT_SHIFT_FLOW	25
#define LR_SHIFT_NO_FLOW	30
#define LR_SHIFT_FLOW		35

		lpar = tb->lnum[h];
		rpar = tb->rnum[h];

		/*
		 * calculate number of blocks S[h] must be split into when
		 * nothing is shifted to the neighbors, as well as number of
		 * items in each part of the split node (s012 numbers),
		 * and number of bytes (s1bytes) of the shared drop which
		 * flow to S1 if any
		 */
		nset = NOTHING_SHIFT_NO_FLOW;
		nver = get_num_ver(vn->vn_mode, tb, h,
				   0, -1, h ? vn->vn_nr_item : 0, -1,
				   snum012, NO_FLOW);

		if (!h) {
			int nver1;

			/*
			 * note, that in this case we try to bottle
			 * between S[0] and S1 (S1 - the first new node)
			 */
			nver1 = get_num_ver(vn->vn_mode, tb, h,
					    0, -1, 0, -1,
					    snum012 + NOTHING_SHIFT_FLOW, FLOW);
			if (nver > nver1)
				nset = NOTHING_SHIFT_FLOW, nver = nver1;
		}

		/*
		 * calculate number of blocks S[h] must be split into when
		 * l_shift_num first items and l_shift_bytes of the right
		 * most liquid item to be shifted are shifted to the left
		 * neighbor, as well as number of items in each part of the
		 * split node (s012 numbers), and number of bytes
		 * (s1bytes) of the shared drop which flow to S1 if any
		 */
		lset = LEFT_SHIFT_NO_FLOW;
		lnver = get_num_ver(vn->vn_mode, tb, h,
				    lpar - ((h || tb->lbytes == -1) ? 0 : 1),
				    -1, h ? vn->vn_nr_item : 0, -1,
				    snum012 + LEFT_SHIFT_NO_FLOW, NO_FLOW);
		if (!h) {
			int lnver1;

			lnver1 = get_num_ver(vn->vn_mode, tb, h,
					     lpar -
					     ((tb->lbytes != -1) ? 1 : 0),
					     tb->lbytes, 0, -1,
					     snum012 + LEFT_SHIFT_FLOW, FLOW);
			if (lnver > lnver1)
				lset = LEFT_SHIFT_FLOW, lnver = lnver1;
		}

		/*
		 * calculate number of blocks S[h] must be split into when
		 * r_shift_num first items and r_shift_bytes of the left most
		 * liquid item to be shifted are shifted to the right neighbor,
		 * as well as number of items in each part of the split
		 * node (s012 numbers), and number of bytes (s1bytes) of the
		 * shared drop which flow to S1 if any
		 */
		rset = RIGHT_SHIFT_NO_FLOW;
		rnver = get_num_ver(vn->vn_mode, tb, h,
				    0, -1,
				    h ? (vn->vn_nr_item - rpar) : (rpar -
								   ((tb->
								     rbytes !=
								     -1) ? 1 :
								    0)), -1,
				    snum012 + RIGHT_SHIFT_NO_FLOW, NO_FLOW);
		if (!h) {
			int rnver1;

			rnver1 = get_num_ver(vn->vn_mode, tb, h,
					     0, -1,
					     (rpar -
					      ((tb->rbytes != -1) ? 1 : 0)),
					     tb->rbytes,
					     snum012 + RIGHT_SHIFT_FLOW, FLOW);

			if (rnver > rnver1)
				rset = RIGHT_SHIFT_FLOW, rnver = rnver1;
		}
		/*
		 * calculate number of blocks S[h] must be split into when
		 * items are shifted in both directions, as well as number
		 * of items in each part of the split node (s012 numbers),
		 * and number of bytes (s1bytes) of the shared drop which
		 * flow to S1 if any
		 */
		lrset = LR_SHIFT_NO_FLOW;
		lrnver = get_num_ver(vn->vn_mode, tb, h,
				     lpar - ((h || tb->lbytes == -1) ? 0 : 1),
				     -1,
				     h ? (vn->vn_nr_item - rpar) : (rpar -
								    ((tb->
								      rbytes !=
								      -1) ? 1 :
								     0)), -1,
				     snum012 + LR_SHIFT_NO_FLOW, NO_FLOW);
		if (!h) {
			int lrnver1;

			lrnver1 = get_num_ver(vn->vn_mode, tb, h,
					      lpar -
					      ((tb->lbytes != -1) ? 1 : 0),
					      tb->lbytes,
					      (rpar -
					       ((tb->rbytes != -1) ? 1 : 0)),
					      tb->rbytes,
					      snum012 + LR_SHIFT_FLOW, FLOW);
			if (lrnver > lrnver1)
				lrset = LR_SHIFT_FLOW, lrnver = lrnver1;
		}

		/*
		 * Our general shifting strategy is:
		 * 1) to minimize the number of new nodes;
		 * 2) to minimize the number of neighbors involved in shifting;
		 * 3) to minimize the number of disk reads;
		 */

		/* we can win TWO or ONE nodes by shifting in both directions */
		if (lrnver < lnver && lrnver < rnver) {
			RFALSE(h &&
			       (tb->lnum[h] != 1 ||
				tb->rnum[h] != 1 ||
				lrnver != 1 || rnver != 2 || lnver != 2
				|| h != 1), "vs-8230: bad h");
			if (lrset == LR_SHIFT_FLOW)
				set_parameters(tb, h, tb->lnum[h], tb->rnum[h],
					       lrnver, snum012 + lrset,
					       tb->lbytes, tb->rbytes);
			else
				set_parameters(tb, h,
					       tb->lnum[h] -
					       ((tb->lbytes == -1) ? 0 : 1),
					       tb->rnum[h] -
					       ((tb->rbytes == -1) ? 0 : 1),
					       lrnver, snum012 + lrset, -1, -1);

			return CARRY_ON;
		}

		/*
		 * if shifting doesn't lead to better packing
		 * then don't shift
		 */
		if (nver == lrnver) {
			set_parameters(tb, h, 0, 0, nver, snum012 + nset, -1,
				       -1);
			return CARRY_ON;
		}

		/*
		 * now we know that for better packing shifting in only one
		 * direction either to the left or to the right is required
		 */

		/*
		 * if shifting to the left is better than
		 * shifting to the right
		 */
		if (lnver < rnver) {
			SET_PAR_SHIFT_LEFT;
			return CARRY_ON;
		}

		/*
		 * if shifting to the right is better than
		 * shifting to the left
		 */
		if (lnver > rnver) {
			SET_PAR_SHIFT_RIGHT;
			return CARRY_ON;
		}

		/*
		 * now shifting in either direction gives the same number
		 * of nodes and we can make use of the cached neighbors
		 */
		if (is_left_neighbor_in_cache(tb, h)) {
			SET_PAR_SHIFT_LEFT;
			return CARRY_ON;
		}

		/*
		 * shift to the right regardless of whether the right
		 * neighbor is in cache or not
		 */
		SET_PAR_SHIFT_RIGHT;
		return CARRY_ON;
	}
}
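/*
 * Example of the node-count comparison ip_check_balance() just performed
 * (figures are illustrative): if nver == 3, lnver == 2, rnver == 2 and
 * lrnver == 1, shifting in both directions wins two nodes and is chosen;
 * if instead lrnver == nver, nothing is shifted at all, per strategy
 * points 1) and 2) above.
 */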
/*
 * Check whether current node S[h] is balanced when Decreasing its size by
 * Deleting or Cutting for INTERNAL node of S+tree.
 * Calculate parameters for balancing for current level h.
 * Parameters:
 *	tb	tree_balance structure;
 *	h	current level of the node;
 *	inum	item number in S[h];
 *	mode	d - delete, c - cut;
 * Returns:	1 - schedule occurred;
 *	        0 - balancing for higher levels needed;
 *	       -1 - no balancing for higher levels needed;
 *	       -2 - no disk space.
 *
 * Note: Items of internal nodes have fixed size, so the balance condition
 * for the internal part of the S+tree is as for B-trees.
 */
static int dc_check_balance_internal(struct tree_balance *tb, int h)
{
	struct virtual_node *vn = tb->tb_vn;

	/*
	 * Sh is the node whose balance is currently being checked,
	 * and Fh is its father.
	 */
	struct buffer_head *Sh, *Fh;
	int maxsize, ret;
	int lfree, rfree /* free space in L and R */ ;

	Sh = PATH_H_PBUFFER(tb->tb_path, h);
	Fh = PATH_H_PPARENT(tb->tb_path, h);

	maxsize = MAX_CHILD_SIZE(Sh);

	/*
	 * using tb->insert_size[h], which is negative in this case,
	 * create_virtual_node calculates:
	 * new_nr_item = number of items node would have if operation is
	 * performed without balancing.
	 */
	create_virtual_node(tb, h);

	if (!Fh) {		/* S[h] is the root. */
		/* no balancing for higher levels needed */
		if (vn->vn_nr_item > 0) {
			set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
			return NO_BALANCING_NEEDED;
		}
		/*
		 * new_nr_item == 0.
		 * Current root will be deleted resulting in
		 * decrementing the tree height.
		 */
		set_parameters(tb, h, 0, 0, 0, NULL, -1, -1);
		return CARRY_ON;
	}

	if ((ret = get_parents(tb, h)) != CARRY_ON)
		return ret;

	/* get free space of neighbors */
	rfree = get_rfree(tb, h);
	lfree = get_lfree(tb, h);

	/* determine maximal number of items we can fit into neighbors */
	check_left(tb, h, lfree);
	check_right(tb, h, rfree);

	/*
	 * Balance condition for the internal node is valid.
	 * In this case we balance only if it leads to better packing.
	 */
	if (vn->vn_nr_item >= MIN_NR_KEY(Sh)) {
		/*
		 * Here we join S[h] with one of its neighbors,
		 * which is impossible with greater values of new_nr_item.
		 */
		if (vn->vn_nr_item == MIN_NR_KEY(Sh)) {
			/* All contents of S[h] can be moved to L[h]. */
			if (tb->lnum[h] >= vn->vn_nr_item + 1) {
				int n;
				int order_L;

				order_L =
				    ((n =
				      PATH_H_B_ITEM_ORDER(tb->tb_path,
							  h)) ==
				     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
				n = dc_size(B_N_CHILD(tb->FL[h], order_L)) /
				    (DC_SIZE + KEY_SIZE);
				set_parameters(tb, h, -n - 1, 0, 0, NULL, -1,
					       -1);
				return CARRY_ON;
			}

			/* All contents of S[h] can be moved to R[h]. */
			if (tb->rnum[h] >= vn->vn_nr_item + 1) {
				int n;
				int order_R;

				order_R =
				    ((n =
				      PATH_H_B_ITEM_ORDER(tb->tb_path,
							  h)) ==
				     B_NR_ITEMS(Fh)) ? 0 : n + 1;
				n = dc_size(B_N_CHILD(tb->FR[h], order_R)) /
				    (DC_SIZE + KEY_SIZE);
				set_parameters(tb, h, 0, -n - 1, 0, NULL, -1,
					       -1);
				return CARRY_ON;
			}
		}
		/*
		 * All contents of S[h] can be moved to the neighbors
		 * (L[h] & R[h]).
		 */
		if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
			int to_r;

			to_r =
			    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] -
			     tb->rnum[h] + vn->vn_nr_item + 1) / 2 -
			    (MAX_NR_KEY(Sh) + 1 - tb->rnum[h]);
			set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r,
				       0, NULL, -1, -1);
			return CARRY_ON;
		}

		/* Balancing does not lead to better packing. */
		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
		return NO_BALANCING_NEEDED;
	}

	/*
	 * Current node contains an insufficient number of items.
	 * Balancing is required.
	 */
	/* Check whether we can merge S[h] with left neighbor. */
	if (tb->lnum[h] >= vn->vn_nr_item + 1)
		if (is_left_neighbor_in_cache(tb, h)
		    || tb->rnum[h] < vn->vn_nr_item + 1 || !tb->FR[h]) {
			int n;
			int order_L;

			order_L =
			    ((n =
			      PATH_H_B_ITEM_ORDER(tb->tb_path,
						  h)) ==
			     0) ? B_NR_ITEMS(tb->FL[h]) : n - 1;
			n = dc_size(B_N_CHILD(tb->FL[h], order_L)) / (DC_SIZE +
								      KEY_SIZE);
			set_parameters(tb, h, -n - 1, 0, 0, NULL, -1, -1);
			return CARRY_ON;
		}

	/* Check whether we can merge S[h] with right neighbor. */
	if (tb->rnum[h] >= vn->vn_nr_item + 1) {
		int n;
		int order_R;

		order_R =
		    ((n =
		      PATH_H_B_ITEM_ORDER(tb->tb_path,
					  h)) == B_NR_ITEMS(Fh)) ? 0 : (n + 1);
		n = dc_size(B_N_CHILD(tb->FR[h], order_R)) / (DC_SIZE +
							      KEY_SIZE);
		set_parameters(tb, h, 0, -n - 1, 0, NULL, -1, -1);
		return CARRY_ON;
	}

	/* All contents of S[h] can be moved to the neighbors (L[h] & R[h]). */
	if (tb->rnum[h] + tb->lnum[h] >= vn->vn_nr_item + 1) {
		int to_r;

		to_r =
		    ((MAX_NR_KEY(Sh) << 1) + 2 - tb->lnum[h] - tb->rnum[h] +
		     vn->vn_nr_item + 1) / 2 - (MAX_NR_KEY(Sh) + 1 -
						tb->rnum[h]);
		set_parameters(tb, h, vn->vn_nr_item + 1 - to_r, to_r, 0, NULL,
			       -1, -1);
		return CARRY_ON;
	}

	/* For internal nodes try to borrow an item from a neighbor */
	RFALSE(!tb->FL[h] && !tb->FR[h], "vs-8235: trying to borrow for root");

	/* Borrow one or two items from the cached neighbor */
	if (is_left_neighbor_in_cache(tb, h) || !tb->FR[h]) {
		int from_l;

		from_l =
		    (MAX_NR_KEY(Sh) + 1 - tb->lnum[h] + vn->vn_nr_item +
		     1) / 2 - (vn->vn_nr_item + 1);
		set_parameters(tb, h, -from_l, 0, 1, NULL, -1, -1);
		return CARRY_ON;
	}

	set_parameters(tb, h, 0,
		       -((MAX_NR_KEY(Sh) + 1 - tb->rnum[h] + vn->vn_nr_item +
			  1) / 2 - (vn->vn_nr_item + 1)), 1, NULL, -1, -1);
	return CARRY_ON;
}
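/*
 * A quick sanity example for the to_r computation used above (made-up
 * numbers): with MAX_NR_KEY(Sh) == 7, lnum[h] == 5, rnum[h] == 6 and
 * vn_nr_item == 9, to_r = ((7 << 1) + 2 - 5 - 6 + 9 + 1) / 2 -
 * (7 + 1 - 6) = 7 - 2 = 5, so 5 items go right and vn_nr_item + 1 - to_r
 * = 5 go left, keeping the two receiving neighbors within one child
 * pointer of each other.
 */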
/*
 * Check whether current node S[h] is balanced when Decreasing its size by
 * Deleting or Truncating for a LEAF node of the S+tree.
 * Calculate parameters for balancing for current level h.
 * Parameters:
 *	tb	tree_balance structure;
 *	h	current level of the node;
 *	inum	item number in S[h];
 *	mode	d - delete, c - cut;
 * Returns:	 1 - schedule occurred;
 *		 0 - balancing for higher levels needed;
 *		-1 - no balancing for higher levels needed;
 *		-2 - no disk space.
 */
static int dc_check_balance_leaf(struct tree_balance *tb, int h)
{
	struct virtual_node *vn = tb->tb_vn;

	/*
	 * Number of bytes by which the node's content changes (negative
	 * here, since bytes are being deleted) in the buffer containing
	 * the node being balanced.  The mnemonic is that the attempted
	 * change in node space use at this level is levbytes bytes.
	 */
	int levbytes;

	/* the maximal item size */
	int maxsize, ret;

	/*
	 * S0 is the node whose balance is currently being checked,
	 * and F0 is its father.
	 */
	struct buffer_head *S0, *F0;
	int lfree, rfree;	/* free space in L and R */

	S0 = PATH_H_PBUFFER(tb->tb_path, 0);
	F0 = PATH_H_PPARENT(tb->tb_path, 0);

	levbytes = tb->insert_size[h];

	maxsize = MAX_CHILD_SIZE(S0);	/* maximal possible size of an item */

	if (!F0) {		/* S[0] is the root now. */

		RFALSE(-levbytes >= maxsize - B_FREE_SPACE(S0),
		       "vs-8240: attempt to create an empty buffer tree");

		set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
		return NO_BALANCING_NEEDED;
	}

	if ((ret = get_parents(tb, h)) != CARRY_ON)
		return ret;

	/* get free space of neighbors */
	rfree = get_rfree(tb, h);
	lfree = get_lfree(tb, h);

	create_virtual_node(tb, h);

	/* if 3 leaves can be merged into one, set parameters and return */
	if (are_leaves_removable(tb, lfree, rfree))
		return CARRY_ON;

	/*
	 * determine the maximal number of items we can shift to the
	 * left/right neighbor and the maximal number of bytes that can
	 * flow to the left/right neighbor from the left/right most liquid
	 * item that cannot be shifted from S[0] entirely
	 */
	check_left(tb, h, lfree);
	check_right(tb, h, rfree);

	/* check whether we can merge S with the left neighbor */
	if (tb->lnum[0] >= vn->vn_nr_item && tb->lbytes == -1)
		if (is_left_neighbor_in_cache(tb, h) ||
		    /* S cannot be merged with R */
		    ((tb->rnum[0] - ((tb->rbytes == -1) ? 0 : 1)) <
		     vn->vn_nr_item) || !tb->FR[h]) {

			RFALSE(!tb->FL[h],
			       "vs-8245: dc_check_balance_leaf: FL[h] must exist");

			/* set parameters to merge S[0] with its left neighbor */
			set_parameters(tb, h, -1, 0, 0, NULL, -1, -1);
			return CARRY_ON;
		}

	/* check whether we can merge S[0] with the right neighbor */
	if (tb->rnum[0] >= vn->vn_nr_item && tb->rbytes == -1) {
		set_parameters(tb, h, 0, -1, 0, NULL, -1, -1);
		return CARRY_ON;
	}

	/*
	 * All contents of S[0] can be moved to the neighbors (L[0] & R[0]).
	 * Set parameters and return.
	 */
	if (is_leaf_removable(tb))
		return CARRY_ON;

	/* Balancing is not required. */
	tb->s0num = vn->vn_nr_item;
	set_parameters(tb, h, 0, 0, 1, NULL, -1, -1);
	return NO_BALANCING_NEEDED;
}

/*
 * Check whether current node S[h] is balanced when Decreasing its size by
 * Deleting or Cutting.
 * Calculate parameters for balancing for current level h.
 * Parameters:
 *	tb	tree_balance structure;
 *	h	current level of the node;
 *	inum	item number in S[h];
 *	mode	d - delete, c - cut.
 * Returns:	 1 - schedule occurred;
 *		 0 - balancing for higher levels needed;
 *		-1 - no balancing for higher levels needed;
 *		-2 - no disk space.
 */
static int dc_check_balance(struct tree_balance *tb, int h)
{
	RFALSE(!(PATH_H_PBUFFER(tb->tb_path, h)),
	       "vs-8250: S is not initialized");

	if (h)
		return dc_check_balance_internal(tb, h);
	else
		return dc_check_balance_leaf(tb, h);
}
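/*
 * A note on the numeric "Returns:" legends used in the comments in this
 * file: the functions actually return the named constants rather than the
 * raw numbers, e.g. REPEAT_SEARCH (a schedule occurred and the analysis
 * must be redone), CARRY_ON, NO_BALANCING_NEEDED and IO_ERROR as seen in
 * the code below; see the definitions in the header for the exact values.
 */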
 * Parameters:
 *
 *	tb	tree_balance structure:
 *
 *		tb is a large structure; the reader should study its
 *		definition in the header file alongside this procedure
 *		in order to understand it
 *
 *	h	current level of the node;
 *	inum	item number in S[h];
 *	mode	i - insert, p - paste, d - delete, c - cut.
 * Returns:	 1 - schedule occurred;
 *		 0 - balancing for higher levels needed;
 *		-1 - no balancing for higher levels needed;
 *		-2 - no disk space.
 */
static int check_balance(int mode,
			 struct tree_balance *tb,
			 int h,
			 int inum,
			 int pos_in_item,
			 struct item_head *ins_ih, const void *data)
{
	struct virtual_node *vn;

	vn = tb->tb_vn = (struct virtual_node *)(tb->vn_buf);
	vn->vn_free_ptr = (char *)(tb->tb_vn + 1);
	vn->vn_mode = mode;
	vn->vn_affected_item_num = inum;
	vn->vn_pos_in_item = pos_in_item;
	vn->vn_ins_ih = ins_ih;
	vn->vn_data = data;

	RFALSE(mode == M_INSERT && !vn->vn_ins_ih,
	       "vs-8255: ins_ih can not be 0 in insert mode");

	/* Calculate balance parameters when size of node is increasing. */
	if (tb->insert_size[h] > 0)
		return ip_check_balance(tb, h);

	/* Calculate balance parameters when size of node is decreasing. */
	return dc_check_balance(tb, h);
}

/*
 * Check whether the parent in the path is really the parent of the
 * current node.
 */
static int get_direct_parent(struct tree_balance *tb, int h)
{
	struct buffer_head *bh;
	struct treepath *path = tb->tb_path;
	int position,
	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h);

	/* We are in the root or in the new root. */
	if (path_offset <= FIRST_PATH_ELEMENT_OFFSET) {

		RFALSE(path_offset < FIRST_PATH_ELEMENT_OFFSET - 1,
		       "PAP-8260: invalid offset in the path");

		if (PATH_OFFSET_PBUFFER(path, FIRST_PATH_ELEMENT_OFFSET)->
		    b_blocknr == SB_ROOT_BLOCK(tb->tb_sb)) {
			/* Root is not changed. */
			PATH_OFFSET_PBUFFER(path, path_offset - 1) = NULL;
			PATH_OFFSET_POSITION(path, path_offset - 1) = 0;
			return CARRY_ON;
		}
		/* Root is changed and we must recalculate the path. */
		return REPEAT_SEARCH;
	}

	/* Parent in the path is not in the tree. */
	if (!B_IS_IN_TREE
	    (bh = PATH_OFFSET_PBUFFER(path, path_offset - 1)))
		return REPEAT_SEARCH;

	if ((position =
	     PATH_OFFSET_POSITION(path,
				  path_offset - 1)) > B_NR_ITEMS(bh))
		return REPEAT_SEARCH;

	/*
	 * Parent in the path is not the parent of the current node
	 * in the tree.
	 */
	if (B_N_CHILD_NUM(bh, position) !=
	    PATH_OFFSET_PBUFFER(path, path_offset)->b_blocknr)
		return REPEAT_SEARCH;

	if (buffer_locked(bh)) {
		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
		__wait_on_buffer(bh);
		reiserfs_write_lock_nested(tb->tb_sb, depth);
		if (FILESYSTEM_CHANGED_TB(tb))
			return REPEAT_SEARCH;
	}

	/*
	 * Parent in the path is unlocked and really the parent
	 * of the current node.
	 */
	return CARRY_ON;
}
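/*
 * A small sketch of the path layout checked above (illustrative only).
 * The path runs from the root towards the leaf, so for the node at
 * `path_offset` the element at `path_offset - 1` holds its supposed
 * parent:
 *
 *	PATH_OFFSET_PBUFFER(path, path_offset - 1)   parent candidate
 *	PATH_OFFSET_POSITION(path, path_offset - 1)  child slot in it
 *	PATH_OFFSET_PBUFFER(path, path_offset)       current node
 *
 * The B_N_CHILD_NUM() comparison above verifies that the recorded slot
 * still points at the current node's block, since the tree may have been
 * rebalanced under us after the path was built.
 */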
 * Returns:	SCHEDULE_OCCURRED - schedule occurred while the function worked;
 *		CARRY_ON - schedule didn't occur while the function worked;
 */
static int get_neighbors(struct tree_balance *tb, int h)
{
	int child_position,
	    path_offset = PATH_H_PATH_OFFSET(tb->tb_path, h + 1);
	unsigned long son_number;
	struct super_block *sb = tb->tb_sb;
	struct buffer_head *bh;
	int depth;

	PROC_INFO_INC(sb, get_neighbors[h]);

	if (tb->lnum[h]) {
		/* We need the left neighbor to balance S[h]. */
		PROC_INFO_INC(sb, need_l_neighbor[h]);
		bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);

		RFALSE(bh == tb->FL[h] &&
		       !PATH_OFFSET_POSITION(tb->tb_path, path_offset),
		       "PAP-8270: invalid position in the parent");

		child_position =
		    (bh == tb->FL[h]) ? tb->lkey[h] : B_NR_ITEMS(tb->FL[h]);
		son_number = B_N_CHILD_NUM(tb->FL[h], child_position);
		depth = reiserfs_write_unlock_nested(tb->tb_sb);
		bh = sb_bread(sb, son_number);
		reiserfs_write_lock_nested(tb->tb_sb, depth);
		if (!bh)
			return IO_ERROR;
		if (FILESYSTEM_CHANGED_TB(tb)) {
			brelse(bh);
			PROC_INFO_INC(sb, get_neighbors_restart[h]);
			return REPEAT_SEARCH;
		}

		RFALSE(!B_IS_IN_TREE(tb->FL[h]) ||
		       child_position > B_NR_ITEMS(tb->FL[h]) ||
		       B_N_CHILD_NUM(tb->FL[h], child_position) !=
		       bh->b_blocknr, "PAP-8275: invalid parent");
		RFALSE(!B_IS_IN_TREE(bh), "PAP-8280: invalid child");
		RFALSE(!h &&
		       B_FREE_SPACE(bh) !=
		       MAX_CHILD_SIZE(bh) -
		       dc_size(B_N_CHILD(tb->FL[0], child_position)),
		       "PAP-8290: invalid child size of left neighbor");

		brelse(tb->L[h]);
		tb->L[h] = bh;
	}
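	/*
	 * A note on the child_position choice above (mirrored below for
	 * the right side), as the code reads: when the node in the path
	 * at path_offset is itself FL[h], the left sibling hangs off the
	 * same parent at slot lkey[h]; otherwise FL[h] is a different
	 * node and the left sibling is its rightmost child, i.e. slot
	 * B_NR_ITEMS(tb->FL[h]) (an internal node with k keys has k + 1
	 * child slots, numbered 0..k).
	 */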
	/* We need the right neighbor to balance S[path_offset]. */
	if (tb->rnum[h]) {
		PROC_INFO_INC(sb, need_r_neighbor[h]);
		bh = PATH_OFFSET_PBUFFER(tb->tb_path, path_offset);

		RFALSE(bh == tb->FR[h] &&
		       PATH_OFFSET_POSITION(tb->tb_path,
					    path_offset) >=
		       B_NR_ITEMS(bh),
		       "PAP-8295: invalid position in the parent");

		child_position =
		    (bh == tb->FR[h]) ? tb->rkey[h] + 1 : 0;
		son_number = B_N_CHILD_NUM(tb->FR[h], child_position);
		depth = reiserfs_write_unlock_nested(tb->tb_sb);
		bh = sb_bread(sb, son_number);
		reiserfs_write_lock_nested(tb->tb_sb, depth);
		if (!bh)
			return IO_ERROR;
		if (FILESYSTEM_CHANGED_TB(tb)) {
			brelse(bh);
			PROC_INFO_INC(sb, get_neighbors_restart[h]);
			return REPEAT_SEARCH;
		}
		brelse(tb->R[h]);
		tb->R[h] = bh;

		RFALSE(!h
		       && B_FREE_SPACE(bh) !=
		       MAX_CHILD_SIZE(bh) -
		       dc_size(B_N_CHILD(tb->FR[0], child_position)),
		       "PAP-8300: invalid child size of right neighbor (%d != %d - %d)",
		       B_FREE_SPACE(bh), MAX_CHILD_SIZE(bh),
		       dc_size(B_N_CHILD(tb->FR[0], child_position)));

	}
	return CARRY_ON;
}

static int get_virtual_node_size(struct super_block *sb, struct buffer_head *bh)
{
	int max_num_of_items;
	int max_num_of_entries;
	unsigned long blocksize = sb->s_blocksize;

#define MIN_NAME_LEN 1

	max_num_of_items = (blocksize - BLKH_SIZE) / (IH_SIZE + MIN_ITEM_LEN);
	max_num_of_entries = (blocksize - BLKH_SIZE - IH_SIZE) /
	    (DEH_SIZE + MIN_NAME_LEN);

	return sizeof(struct virtual_node) +
	    max(max_num_of_items * sizeof(struct virtual_item),
		sizeof(struct virtual_item) + sizeof(struct direntry_uarea) +
		(max_num_of_entries - 1) * sizeof(__u16));
}
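/*
 * A rough worked example of the sizing above (the concrete on-disk sizes
 * are assumed here only for illustration: BLKH_SIZE = 24, IH_SIZE = 24,
 * DEH_SIZE = 16).  For a 4096-byte block:
 *
 *	max_num_of_items   = (4096 - 24) / (24 + MIN_ITEM_LEN)
 *	max_num_of_entries = (4096 - 24 - 24) / (16 + 1)  which is ~238
 *
 * so the buffer is sized for the worse of "many tiny items" and "one
 * directory item packed with minimum-length entries".
 */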
/*
 * Maybe we should fail the balancing we are about to perform when kmalloc
 * fails several times; for now, however, we loop until kmalloc gets the
 * required memory.
 */
static int get_mem_for_virtual_node(struct tree_balance *tb)
{
	int check_fs = 0;
	int size;
	char *buf;

	size = get_virtual_node_size(tb->tb_sb, PATH_PLAST_BUFFER(tb->tb_path));

	/* we have to allocate more memory for the virtual node */
	if (size > tb->vn_buf_size) {
		if (tb->vn_buf) {
			/* free memory allocated before */
			kfree(tb->vn_buf);
			/* this is not needed if kfree is atomic */
			check_fs = 1;
		}

		/* the virtual node now requires more memory */
		tb->vn_buf_size = size;

		/* get memory for virtual item */
		buf = kmalloc(size, GFP_ATOMIC | __GFP_NOWARN);
		if (!buf) {
			/*
			 * getting memory with GFP_KERNEL priority may involve
			 * balancing now (due to indirect_to_direct conversion
			 * on dcache shrinking), so release the path and the
			 * collected resources here
			 */
			free_buffers_in_tb(tb);
			buf = kmalloc(size, GFP_NOFS);
			if (!buf) {
				tb->vn_buf_size = 0;
			}
			tb->vn_buf = buf;
			schedule();
			return REPEAT_SEARCH;
		}

		tb->vn_buf = buf;
	}

	if (check_fs && FILESYSTEM_CHANGED_TB(tb))
		return REPEAT_SEARCH;

	return CARRY_ON;
}

#ifdef CONFIG_REISERFS_CHECK
static void tb_buffer_sanity_check(struct super_block *sb,
				   struct buffer_head *bh,
				   const char *descr, int level)
{
	if (bh) {
		if (atomic_read(&(bh->b_count)) <= 0)
			reiserfs_panic(sb, "jmacd-1", "negative or zero "
				       "reference counter for buffer %s[%d] "
				       "(%b)", descr, level, bh);

		if (!buffer_uptodate(bh))
			reiserfs_panic(sb, "jmacd-2", "buffer is not up "
				       "to date %s[%d] (%b)",
				       descr, level, bh);

		if (!B_IS_IN_TREE(bh))
			reiserfs_panic(sb, "jmacd-3", "buffer is not "
				       "in tree %s[%d] (%b)",
				       descr, level, bh);

		if (bh->b_bdev != sb->s_bdev)
			reiserfs_panic(sb, "jmacd-4", "buffer has wrong "
				       "device %s[%d] (%b)",
				       descr, level, bh);

		if (bh->b_size != sb->s_blocksize)
			reiserfs_panic(sb, "jmacd-5", "buffer has wrong "
				       "blocksize %s[%d] (%b)",
				       descr, level, bh);

		if (bh->b_blocknr > SB_BLOCK_COUNT(sb))
			reiserfs_panic(sb, "jmacd-6", "buffer block "
				       "number too high %s[%d] (%b)",
				       descr, level, bh);
	}
}
#else
static void tb_buffer_sanity_check(struct super_block *sb,
				   struct buffer_head *bh,
				   const char *descr, int level)
{
}
#endif

static int clear_all_dirty_bits(struct super_block *s, struct buffer_head *bh)
{
	return reiserfs_prepare_for_journal(s, bh, 0);
}
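/*
 * The loop in wait_tb_buffers_until_unlocked() below can be summarized
 * roughly as follows (an informal sketch, not the exact control flow):
 *
 *	do {
 *		scan the path buffers, then L/FL/CFL and R/FR/CFR for
 *		    each level, then the FEB list, for a locked buffer;
 *		if one is found, wait on it and restart the scan,
 *		    repeating the search if the fs changed meanwhile;
 *	} while (a locked buffer was found);
 */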
static int wait_tb_buffers_until_unlocked(struct tree_balance *tb)
{
	struct buffer_head *locked;
#ifdef CONFIG_REISERFS_CHECK
	int repeat_counter = 0;
#endif
	int i;

	do {
		locked = NULL;

		for (i = tb->tb_path->path_length;
		     !locked && i > ILLEGAL_PATH_ELEMENT_OFFSET; i--) {
			if (PATH_OFFSET_PBUFFER(tb->tb_path, i)) {
				/*
				 * if I understand correctly, we can only
				 * be sure the last buffer in the path is
				 * in the tree --clm
				 */
#ifdef CONFIG_REISERFS_CHECK
				if (PATH_PLAST_BUFFER(tb->tb_path) ==
				    PATH_OFFSET_PBUFFER(tb->tb_path, i))
					tb_buffer_sanity_check(tb->tb_sb,
							       PATH_OFFSET_PBUFFER
							       (tb->tb_path, i),
							       "S",
							       tb->tb_path->
							       path_length - i);
#endif
				if (!clear_all_dirty_bits(tb->tb_sb,
							  PATH_OFFSET_PBUFFER
							  (tb->tb_path, i))) {
					locked =
					    PATH_OFFSET_PBUFFER(tb->tb_path, i);
				}
			}
		}

		for (i = 0; !locked && i < MAX_HEIGHT && tb->insert_size[i];
		     i++) {

			if (tb->lnum[i]) {

				if (tb->L[i]) {
					tb_buffer_sanity_check(tb->tb_sb,
							       tb->L[i],
							       "L", i);
					if (!clear_all_dirty_bits
					    (tb->tb_sb, tb->L[i]))
						locked = tb->L[i];
				}

				if (!locked && tb->FL[i]) {
					tb_buffer_sanity_check(tb->tb_sb,
							       tb->FL[i],
							       "FL", i);
					if (!clear_all_dirty_bits
					    (tb->tb_sb, tb->FL[i]))
						locked = tb->FL[i];
				}

				if (!locked && tb->CFL[i]) {
					tb_buffer_sanity_check(tb->tb_sb,
							       tb->CFL[i],
							       "CFL", i);
					if (!clear_all_dirty_bits
					    (tb->tb_sb, tb->CFL[i]))
						locked = tb->CFL[i];
				}
			}

			if (!locked && (tb->rnum[i])) {

				if (tb->R[i]) {
					tb_buffer_sanity_check(tb->tb_sb,
							       tb->R[i],
							       "R", i);
					if (!clear_all_dirty_bits
					    (tb->tb_sb, tb->R[i]))
						locked = tb->R[i];
				}

				if (!locked && tb->FR[i]) {
					tb_buffer_sanity_check(tb->tb_sb,
							       tb->FR[i],
							       "FR", i);
					if (!clear_all_dirty_bits
					    (tb->tb_sb, tb->FR[i]))
						locked = tb->FR[i];
				}

				if (!locked && tb->CFR[i]) {
					tb_buffer_sanity_check(tb->tb_sb,
							       tb->CFR[i],
							       "CFR", i);
					if (!clear_all_dirty_bits
					    (tb->tb_sb, tb->CFR[i]))
						locked = tb->CFR[i];
				}
			}
		}

		/*
		 * as far as I can tell, this is not required.  The FEB list
		 * seems to be full of newly allocated nodes, which will
		 * never be locked, dirty, or anything else.
		 * To be safe, I'm putting the checks and waits in.
		 * For the moment, they are needed to keep the code in
		 * journal.c from complaining about the buffer.
		 * That code is inside CONFIG_REISERFS_CHECK as well. --clm
		 */
		for (i = 0; !locked && i < MAX_FEB_SIZE; i++) {
			if (tb->FEB[i]) {
				if (!clear_all_dirty_bits
				    (tb->tb_sb, tb->FEB[i]))
					locked = tb->FEB[i];
			}
		}

		if (locked) {
			int depth;
#ifdef CONFIG_REISERFS_CHECK
			repeat_counter++;
			if ((repeat_counter % 10000) == 0) {
				reiserfs_warning(tb->tb_sb, "reiserfs-8200",
						 "too many iterations waiting "
						 "for buffer to unlock "
						 "(%b)", locked);

				/*
				 * Don't loop forever.  Try to recover
				 * from a possible error.
				 */
				return (FILESYSTEM_CHANGED_TB(tb)) ?
				    REPEAT_SEARCH : CARRY_ON;
			}
#endif
			depth = reiserfs_write_unlock_nested(tb->tb_sb);
			__wait_on_buffer(locked);
			reiserfs_write_lock_nested(tb->tb_sb, depth);
			if (FILESYSTEM_CHANGED_TB(tb))
				return REPEAT_SEARCH;
		}

	} while (locked);

	return CARRY_ON;
}
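/*
 * FILESYSTEM_CHANGED_TB(), used throughout, compares the generation
 * number sampled into tb->fs_gen when fix_nodes() starts against the
 * superblock's current generation, which is bumped on every tree change;
 * modulo exact spelling, the macro in the header is essentially
 *
 *	get_generation((tb)->tb_sb) != (tb)->fs_gen
 *
 * A mismatch means our analysis may be stale and must be redone.
 */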
/*
 * Prepare for balancing, that is:
 *	get all necessary parents and neighbors;
 *	analyze what should be moved and where;
 *	get a sufficient number of new nodes.
 * Balancing will begin only after all needed resources have been collected.
 *
 * When ported to SMP kernels, only at the last moment after all needed nodes
 * are collected in cache, will the resources be locked using the usual
 * textbook ordered lock acquisition algorithms.  Note that ensuring that
 * this code neither write locks what it does not need to write lock nor
 * locks out of order will be a pain in the butt that could have been
 * avoided.  Grumble grumble. -Hans
 *
 * fix is meant in the sense of render unchanging
 *
 * Latency might be improved by first gathering a list of what buffers
 * are needed and then getting as many of them in parallel as possible? -Hans
 *
 * Parameters:
 *	op_mode	i - insert, d - delete, c - cut (truncate), p - paste (append)
 *	tb	tree_balance structure;
 *	inum	item number in S[h];
 *	pos_in_item - comment this if you can
 *	ins_ih	item head of item being inserted
 *	data	inserted item or data to be pasted
 * Returns:	1 - schedule occurred while the function worked;
 *		0 - schedule didn't occur while the function worked;
 *	       -1 - if no_disk_space
 */

int fix_nodes(int op_mode, struct tree_balance *tb,
	      struct item_head *ins_ih, const void *data)
{
	int ret, h, item_num = PATH_LAST_POSITION(tb->tb_path);
	int pos_in_item;

	/*
	 * we set wait_tb_buffers_run when we have to restore any dirty
	 * bits cleared during wait_tb_buffers_until_unlocked()
	 */
	int wait_tb_buffers_run = 0;
	struct buffer_head *tbS0 = PATH_PLAST_BUFFER(tb->tb_path);

	++REISERFS_SB(tb->tb_sb)->s_fix_nodes;

	pos_in_item = tb->tb_path->pos_in_item;

	tb->fs_gen = get_generation(tb->tb_sb);

	/*
	 * we prepare and log the super here so it will already be in the
	 * transaction when do_balance needs to change it.
	 * This way do_balance won't have to schedule when trying to prepare
	 * the super for logging
	 */
	reiserfs_prepare_for_journal(tb->tb_sb,
				     SB_BUFFER_WITH_SB(tb->tb_sb), 1);
	journal_mark_dirty(tb->transaction_handle,
			   SB_BUFFER_WITH_SB(tb->tb_sb));
	if (FILESYSTEM_CHANGED_TB(tb))
		return REPEAT_SEARCH;

	/*
	 * S[0] can be locked; this is possible e.g. during an
	 * indirect_to_direct conversion
	 */
	if (buffer_locked(tbS0)) {
		int depth = reiserfs_write_unlock_nested(tb->tb_sb);
		__wait_on_buffer(tbS0);
		reiserfs_write_lock_nested(tb->tb_sb, depth);
		if (FILESYSTEM_CHANGED_TB(tb))
			return REPEAT_SEARCH;
	}
#ifdef CONFIG_REISERFS_CHECK
	if (REISERFS_SB(tb->tb_sb)->cur_tb) {
		print_cur_tb("fix_nodes");
		reiserfs_panic(tb->tb_sb, "PAP-8305",
			       "there is pending do_balance");
	}

	if (!buffer_uptodate(tbS0) || !B_IS_IN_TREE(tbS0))
		reiserfs_panic(tb->tb_sb, "PAP-8320", "S[0] (%b %z) is "
			       "not uptodate at the beginning of fix_nodes "
			       "or not in tree (mode %c)",
			       tbS0, tbS0, op_mode);

	/* Check parameters. */
	switch (op_mode) {
	case M_INSERT:
		if (item_num <= 0 || item_num > B_NR_ITEMS(tbS0))
			reiserfs_panic(tb->tb_sb, "PAP-8330", "Incorrect "
				       "item number %d (in S0 - %d) in case "
				       "of insert", item_num,
				       B_NR_ITEMS(tbS0));
		break;
	case M_PASTE:
	case M_DELETE:
	case M_CUT:
		if (item_num < 0 || item_num >= B_NR_ITEMS(tbS0)) {
			print_block(tbS0, 0, -1, -1);
			reiserfs_panic(tb->tb_sb, "PAP-8335", "Incorrect "
				       "item number(%d); mode = %c "
				       "insert_size = %d",
				       item_num, op_mode,
				       tb->insert_size[0]);
		}
		break;
	default:
		reiserfs_panic(tb->tb_sb, "PAP-8340", "Incorrect mode "
			       "of operation");
	}
#endif

	if (get_mem_for_virtual_node(tb) == REPEAT_SEARCH)
		/* FIXME: maybe -ENOMEM when tb->vn_buf == 0? Now just repeat */
		return REPEAT_SEARCH;
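	/*
	 * The per-level loop below, informally (a sketch of the control
	 * flow, not extra behaviour):
	 *
	 *	for each level h while insert_size[h] != 0:
	 *		verify/refresh the parent    (get_direct_parent)
	 *		decide what moves where      (check_balance)
	 *		read the needed siblings     (get_neighbors)
	 *		reserve fresh nodes          (get_empty_nodes)
	 *		derive insert_size[h + 1] for the parent level
	 *
	 * Any step may demand a restart (REPEAT_SEARCH) or fail for lack
	 * of disk space, in which case we clean up at `repeat`.
	 */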
	/* Starting from the leaf level, for all levels h of the tree. */
	for (h = 0; h < MAX_HEIGHT && tb->insert_size[h]; h++) {
		ret = get_direct_parent(tb, h);
		if (ret != CARRY_ON)
			goto repeat;

		ret = check_balance(op_mode, tb, h, item_num,
				    pos_in_item, ins_ih, data);
		if (ret != CARRY_ON) {
			if (ret == NO_BALANCING_NEEDED) {
				/* No balancing for higher levels needed. */
				ret = get_neighbors(tb, h);
				if (ret != CARRY_ON)
					goto repeat;
				if (h != MAX_HEIGHT - 1)
					tb->insert_size[h + 1] = 0;
				/*
				 * ok, analysis and resource gathering
				 * are complete
				 */
				break;
			}
			goto repeat;
		}

		ret = get_neighbors(tb, h);
		if (ret != CARRY_ON)
			goto repeat;

		/*
		 * No disk space, or schedule occurred and analysis may be
		 * invalid and needs to be redone.
		 */
		ret = get_empty_nodes(tb, h);
		if (ret != CARRY_ON)
			goto repeat;

		/*
		 * We have a positive insert size but no nodes exist on this
		 * level; this means that we are creating a new root.
		 */
		if (!PATH_H_PBUFFER(tb->tb_path, h)) {

			RFALSE(tb->blknum[h] != 1,
			       "PAP-8350: creating new empty root");

			if (h < MAX_HEIGHT - 1)
				tb->insert_size[h + 1] = 0;
		} else if (!PATH_H_PBUFFER(tb->tb_path, h + 1)) {
			/*
			 * The tree needs to be grown, so this node S[h],
			 * which is the root node, is split into two nodes,
			 * and a new node (S[h+1]) will be created to
			 * become the root node.
			 */
			if (tb->blknum[h] > 1) {

				RFALSE(h == MAX_HEIGHT - 1,
				       "PAP-8355: attempt to create too high of a tree");

				tb->insert_size[h + 1] =
				    (DC_SIZE +
				     KEY_SIZE) * (tb->blknum[h] - 1) +
				    DC_SIZE;
			} else if (h < MAX_HEIGHT - 1)
				tb->insert_size[h + 1] = 0;
		} else
			tb->insert_size[h + 1] =
			    (DC_SIZE + KEY_SIZE) * (tb->blknum[h] - 1);
	}

	ret = wait_tb_buffers_until_unlocked(tb);
	if (ret == CARRY_ON) {
		if (FILESYSTEM_CHANGED_TB(tb)) {
			wait_tb_buffers_run = 1;
			ret = REPEAT_SEARCH;
			goto repeat;
		} else {
			return CARRY_ON;
		}
	} else {
		wait_tb_buffers_run = 1;
		goto repeat;
	}
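	/*
	 * A worked example of the insert_size propagation above (DC_SIZE = 8
	 * and KEY_SIZE = 16 are assumed only for illustration).  If level h
	 * ends up needing tb->blknum[h] = 2 nodes, one new delimiting key
	 * plus one child pointer must be inserted into the parent:
	 *
	 *	insert_size[h + 1] = (8 + 16) * (2 - 1) = 24 bytes
	 *
	 * In the new-root case the fresh root also gains the extra first
	 * child pointer, hence the additional + DC_SIZE, 32 bytes in total.
	 */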
repeat:
	/*
	 * fix_nodes was unable to perform its calculation because the
	 * filesystem changed under us, we ran out of free disk space, or
	 * an i/o failure occurred.  In the first case the search will be
	 * repeated.  For now, free all resources acquired so far, except
	 * for the newly allocated nodes.
	 */
	{
		int i;

		/* Release path buffers. */
		if (wait_tb_buffers_run) {
			pathrelse_and_restore(tb->tb_sb, tb->tb_path);
		} else {
			pathrelse(tb->tb_path);
		}
		/* brelse all resources collected for balancing */
		for (i = 0; i < MAX_HEIGHT; i++) {
			if (wait_tb_buffers_run) {
				reiserfs_restore_prepared_buffer(tb->tb_sb,
								 tb->L[i]);
				reiserfs_restore_prepared_buffer(tb->tb_sb,
								 tb->R[i]);
				reiserfs_restore_prepared_buffer(tb->tb_sb,
								 tb->FL[i]);
				reiserfs_restore_prepared_buffer(tb->tb_sb,
								 tb->FR[i]);
				reiserfs_restore_prepared_buffer(tb->tb_sb,
								 tb->CFL[i]);
				reiserfs_restore_prepared_buffer(tb->tb_sb,
								 tb->CFR[i]);
			}

			brelse(tb->L[i]);
			brelse(tb->R[i]);
			brelse(tb->FL[i]);
			brelse(tb->FR[i]);
			brelse(tb->CFL[i]);
			brelse(tb->CFR[i]);

			tb->L[i] = NULL;
			tb->R[i] = NULL;
			tb->FL[i] = NULL;
			tb->FR[i] = NULL;
			tb->CFL[i] = NULL;
			tb->CFR[i] = NULL;
		}

		if (wait_tb_buffers_run) {
			for (i = 0; i < MAX_FEB_SIZE; i++) {
				if (tb->FEB[i])
					reiserfs_restore_prepared_buffer
					    (tb->tb_sb, tb->FEB[i]);
			}
		}
		return ret;
	}
}

void unfix_nodes(struct tree_balance *tb)
{
	int i;

	/* Release path buffers. */
	pathrelse_and_restore(tb->tb_sb, tb->tb_path);

	/* brelse all resources collected for balancing */
	for (i = 0; i < MAX_HEIGHT; i++) {
		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->L[i]);
		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->R[i]);
		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FL[i]);
		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->FR[i]);
		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFL[i]);
		reiserfs_restore_prepared_buffer(tb->tb_sb, tb->CFR[i]);

		brelse(tb->L[i]);
		brelse(tb->R[i]);
		brelse(tb->FL[i]);
		brelse(tb->FR[i]);
		brelse(tb->CFL[i]);
		brelse(tb->CFR[i]);
	}

	/* deal with the list of allocated (used and unused) nodes */
	for (i = 0; i < MAX_FEB_SIZE; i++) {
		if (tb->FEB[i]) {
			b_blocknr_t blocknr = tb->FEB[i]->b_blocknr;
			/*
			 * de-allocate the block, which was not used by
			 * balancing, and forget the buffer for it
			 */
			brelse(tb->FEB[i]);
			reiserfs_free_block(tb->transaction_handle, NULL,
					    blocknr, 0);
		}
		if (tb->used[i]) {
			/* release buffers used as new nodes, including a new root */
			brelse(tb->used[i]);
		}
	}

	kfree(tb->vn_buf);
}
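/*
 * A sketch of the fix_nodes()/unfix_nodes() lifecycle as this file implies
 * it (the caller below is hypothetical; the real callers live in the rest
 * of the reiserfs tree-balancing code, and the do_balance() call shape is
 * an assumption for illustration):
 *
 *	while ((ret = fix_nodes(M_INSERT, &tb, ins_ih, body)) ==
 *	       REPEAT_SEARCH) {
 *		... redo the tree search and rebuild tb.tb_path ...
 *	}
 *	if (ret == CARRY_ON)
 *		do_balance(&tb, ins_ih, body, M_INSERT);
 *	else
 *		unfix_nodes(&tb);	(error: release everything)
 *
 * i.e. all resources are pinned before any modification starts, and
 * unfix_nodes() is the teardown path when balancing never runs.
 */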