1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * alloc.c 5 * 6 * Extent allocs and frees 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/fs.h> 27 #include <linux/types.h> 28 #include <linux/slab.h> 29 #include <linux/highmem.h> 30 #include <linux/swap.h> 31 32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC 33 #include <cluster/masklog.h> 34 35 #include "ocfs2.h" 36 37 #include "alloc.h" 38 #include "aops.h" 39 #include "dlmglue.h" 40 #include "extent_map.h" 41 #include "inode.h" 42 #include "journal.h" 43 #include "localalloc.h" 44 #include "suballoc.h" 45 #include "sysfile.h" 46 #include "file.h" 47 #include "super.h" 48 #include "uptodate.h" 49 50 #include "buffer_head_io.h" 51 52 /* 53 * ocfs2_extent_tree and ocfs2_extent_tree_operations are used to abstract 54 * the b-tree operations in ocfs2. Now all the b-tree operations are not 55 * limited to ocfs2_dinode only. Any data which need to allocate clusters 56 * to store can use b-tree. And it only needs to implement its ocfs2_extent_tree 57 * and operation. 58 * 59 * ocfs2_extent_tree contains info for the root of the b-tree, it must have a 60 * root ocfs2_extent_list and a root_bh so that they can be used in the b-tree 61 * functions. 62 * ocfs2_extent_tree_operations abstract the normal operations we do for 63 * the root of extent b-tree. 64 */ 65 struct ocfs2_extent_tree; 66 67 struct ocfs2_extent_tree_operations { 68 void (*set_last_eb_blk) (struct ocfs2_extent_tree *et, u64 blkno); 69 u64 (*get_last_eb_blk) (struct ocfs2_extent_tree *et); 70 void (*update_clusters) (struct inode *inode, 71 struct ocfs2_extent_tree *et, 72 u32 new_clusters); 73 int (*sanity_check) (struct inode *inode, struct ocfs2_extent_tree *et); 74 }; 75 76 struct ocfs2_extent_tree { 77 enum ocfs2_extent_tree_type type; 78 struct ocfs2_extent_tree_operations *eops; 79 struct buffer_head *root_bh; 80 struct ocfs2_extent_list *root_el; 81 void *private; 82 }; 83 84 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, 85 u64 blkno) 86 { 87 struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data; 88 89 BUG_ON(et->type != OCFS2_DINODE_EXTENT); 90 di->i_last_eb_blk = cpu_to_le64(blkno); 91 } 92 93 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) 94 { 95 struct ocfs2_dinode *di = (struct ocfs2_dinode *)et->root_bh->b_data; 96 97 BUG_ON(et->type != OCFS2_DINODE_EXTENT); 98 return le64_to_cpu(di->i_last_eb_blk); 99 } 100 101 static void ocfs2_dinode_update_clusters(struct inode *inode, 102 struct ocfs2_extent_tree *et, 103 u32 clusters) 104 { 105 struct ocfs2_dinode *di = 106 (struct ocfs2_dinode *)et->root_bh->b_data; 107 108 le32_add_cpu(&di->i_clusters, clusters); 109 spin_lock(&OCFS2_I(inode)->ip_lock); 110 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); 111 spin_unlock(&OCFS2_I(inode)->ip_lock); 112 } 113 114 static int ocfs2_dinode_sanity_check(struct inode *inode, 115 struct ocfs2_extent_tree *et) 116 { 117 int ret = 0; 118 struct ocfs2_dinode *di; 119 120 BUG_ON(et->type != OCFS2_DINODE_EXTENT); 121 122 di = (struct ocfs2_dinode *)et->root_bh->b_data; 123 if (!OCFS2_IS_VALID_DINODE(di)) { 124 ret = -EIO; 125 ocfs2_error(inode->i_sb, 126 "Inode %llu has invalid path root", 127 (unsigned long long)OCFS2_I(inode)->ip_blkno); 128 } 129 130 return ret; 131 } 132 133 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { 134 .set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, 135 .get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, 136 .update_clusters = ocfs2_dinode_update_clusters, 137 .sanity_check = ocfs2_dinode_sanity_check, 138 }; 139 140 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, 141 u64 blkno) 142 { 143 struct ocfs2_xattr_value_root *xv = 144 (struct ocfs2_xattr_value_root *)et->private; 145 146 xv->xr_last_eb_blk = cpu_to_le64(blkno); 147 } 148 149 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) 150 { 151 struct ocfs2_xattr_value_root *xv = 152 (struct ocfs2_xattr_value_root *) et->private; 153 154 return le64_to_cpu(xv->xr_last_eb_blk); 155 } 156 157 static void ocfs2_xattr_value_update_clusters(struct inode *inode, 158 struct ocfs2_extent_tree *et, 159 u32 clusters) 160 { 161 struct ocfs2_xattr_value_root *xv = 162 (struct ocfs2_xattr_value_root *)et->private; 163 164 le32_add_cpu(&xv->xr_clusters, clusters); 165 } 166 167 static int ocfs2_xattr_value_sanity_check(struct inode *inode, 168 struct ocfs2_extent_tree *et) 169 { 170 return 0; 171 } 172 173 static struct ocfs2_extent_tree_operations ocfs2_xattr_et_ops = { 174 .set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, 175 .get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, 176 .update_clusters = ocfs2_xattr_value_update_clusters, 177 .sanity_check = ocfs2_xattr_value_sanity_check, 178 }; 179 180 static struct ocfs2_extent_tree* 181 ocfs2_new_extent_tree(struct buffer_head *bh, 182 enum ocfs2_extent_tree_type et_type, 183 void *private) 184 { 185 struct ocfs2_extent_tree *et; 186 187 et = kzalloc(sizeof(*et), GFP_NOFS); 188 if (!et) 189 return NULL; 190 191 et->type = et_type; 192 get_bh(bh); 193 et->root_bh = bh; 194 et->private = private; 195 196 if (et_type == OCFS2_DINODE_EXTENT) { 197 et->root_el = &((struct ocfs2_dinode *)bh->b_data)->id2.i_list; 198 et->eops = &ocfs2_dinode_et_ops; 199 } else if (et_type == OCFS2_XATTR_VALUE_EXTENT) { 200 struct ocfs2_xattr_value_root *xv = 201 (struct ocfs2_xattr_value_root *) private; 202 et->root_el = &xv->xr_list; 203 et->eops = &ocfs2_xattr_et_ops; 204 } 205 206 return et; 207 } 208 209 static void ocfs2_free_extent_tree(struct ocfs2_extent_tree *et) 210 { 211 if (et) { 212 brelse(et->root_bh); 213 kfree(et); 214 } 215 } 216 217 static inline void ocfs2_set_last_eb_blk(struct ocfs2_extent_tree *et, 218 u64 new_last_eb_blk) 219 { 220 et->eops->set_last_eb_blk(et, new_last_eb_blk); 221 } 222 223 static inline u64 ocfs2_get_last_eb_blk(struct ocfs2_extent_tree *et) 224 { 225 return et->eops->get_last_eb_blk(et); 226 } 227 228 static inline void ocfs2_update_clusters(struct inode *inode, 229 struct ocfs2_extent_tree *et, 230 u32 clusters) 231 { 232 et->eops->update_clusters(inode, et, clusters); 233 } 234 235 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 236 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 237 struct ocfs2_extent_block *eb); 238 239 /* 240 * Structures which describe a path through a btree, and functions to 241 * manipulate them. 242 * 243 * The idea here is to be as generic as possible with the tree 244 * manipulation code. 245 */ 246 struct ocfs2_path_item { 247 struct buffer_head *bh; 248 struct ocfs2_extent_list *el; 249 }; 250 251 #define OCFS2_MAX_PATH_DEPTH 5 252 253 struct ocfs2_path { 254 int p_tree_depth; 255 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; 256 }; 257 258 #define path_root_bh(_path) ((_path)->p_node[0].bh) 259 #define path_root_el(_path) ((_path)->p_node[0].el) 260 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) 261 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) 262 #define path_num_items(_path) ((_path)->p_tree_depth + 1) 263 264 /* 265 * Reset the actual path elements so that we can re-use the structure 266 * to build another path. Generally, this involves freeing the buffer 267 * heads. 268 */ 269 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) 270 { 271 int i, start = 0, depth = 0; 272 struct ocfs2_path_item *node; 273 274 if (keep_root) 275 start = 1; 276 277 for(i = start; i < path_num_items(path); i++) { 278 node = &path->p_node[i]; 279 280 brelse(node->bh); 281 node->bh = NULL; 282 node->el = NULL; 283 } 284 285 /* 286 * Tree depth may change during truncate, or insert. If we're 287 * keeping the root extent list, then make sure that our path 288 * structure reflects the proper depth. 289 */ 290 if (keep_root) 291 depth = le16_to_cpu(path_root_el(path)->l_tree_depth); 292 293 path->p_tree_depth = depth; 294 } 295 296 static void ocfs2_free_path(struct ocfs2_path *path) 297 { 298 if (path) { 299 ocfs2_reinit_path(path, 0); 300 kfree(path); 301 } 302 } 303 304 /* 305 * All the elements of src into dest. After this call, src could be freed 306 * without affecting dest. 307 * 308 * Both paths should have the same root. Any non-root elements of dest 309 * will be freed. 310 */ 311 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) 312 { 313 int i; 314 315 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 316 BUG_ON(path_root_el(dest) != path_root_el(src)); 317 318 ocfs2_reinit_path(dest, 1); 319 320 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 321 dest->p_node[i].bh = src->p_node[i].bh; 322 dest->p_node[i].el = src->p_node[i].el; 323 324 if (dest->p_node[i].bh) 325 get_bh(dest->p_node[i].bh); 326 } 327 } 328 329 /* 330 * Make the *dest path the same as src and re-initialize src path to 331 * have a root only. 332 */ 333 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) 334 { 335 int i; 336 337 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 338 339 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 340 brelse(dest->p_node[i].bh); 341 342 dest->p_node[i].bh = src->p_node[i].bh; 343 dest->p_node[i].el = src->p_node[i].el; 344 345 src->p_node[i].bh = NULL; 346 src->p_node[i].el = NULL; 347 } 348 } 349 350 /* 351 * Insert an extent block at given index. 352 * 353 * This will not take an additional reference on eb_bh. 354 */ 355 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, 356 struct buffer_head *eb_bh) 357 { 358 struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; 359 360 /* 361 * Right now, no root bh is an extent block, so this helps 362 * catch code errors with dinode trees. The assertion can be 363 * safely removed if we ever need to insert extent block 364 * structures at the root. 365 */ 366 BUG_ON(index == 0); 367 368 path->p_node[index].bh = eb_bh; 369 path->p_node[index].el = &eb->h_list; 370 } 371 372 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, 373 struct ocfs2_extent_list *root_el) 374 { 375 struct ocfs2_path *path; 376 377 BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); 378 379 path = kzalloc(sizeof(*path), GFP_NOFS); 380 if (path) { 381 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); 382 get_bh(root_bh); 383 path_root_bh(path) = root_bh; 384 path_root_el(path) = root_el; 385 } 386 387 return path; 388 } 389 390 /* 391 * Convenience function to journal all components in a path. 392 */ 393 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 394 struct ocfs2_path *path) 395 { 396 int i, ret = 0; 397 398 if (!path) 399 goto out; 400 401 for(i = 0; i < path_num_items(path); i++) { 402 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, 403 OCFS2_JOURNAL_ACCESS_WRITE); 404 if (ret < 0) { 405 mlog_errno(ret); 406 goto out; 407 } 408 } 409 410 out: 411 return ret; 412 } 413 414 /* 415 * Return the index of the extent record which contains cluster #v_cluster. 416 * -1 is returned if it was not found. 417 * 418 * Should work fine on interior and exterior nodes. 419 */ 420 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) 421 { 422 int ret = -1; 423 int i; 424 struct ocfs2_extent_rec *rec; 425 u32 rec_end, rec_start, clusters; 426 427 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 428 rec = &el->l_recs[i]; 429 430 rec_start = le32_to_cpu(rec->e_cpos); 431 clusters = ocfs2_rec_clusters(el, rec); 432 433 rec_end = rec_start + clusters; 434 435 if (v_cluster >= rec_start && v_cluster < rec_end) { 436 ret = i; 437 break; 438 } 439 } 440 441 return ret; 442 } 443 444 enum ocfs2_contig_type { 445 CONTIG_NONE = 0, 446 CONTIG_LEFT, 447 CONTIG_RIGHT, 448 CONTIG_LEFTRIGHT, 449 }; 450 451 452 /* 453 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and 454 * ocfs2_extent_contig only work properly against leaf nodes! 455 */ 456 static int ocfs2_block_extent_contig(struct super_block *sb, 457 struct ocfs2_extent_rec *ext, 458 u64 blkno) 459 { 460 u64 blk_end = le64_to_cpu(ext->e_blkno); 461 462 blk_end += ocfs2_clusters_to_blocks(sb, 463 le16_to_cpu(ext->e_leaf_clusters)); 464 465 return blkno == blk_end; 466 } 467 468 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, 469 struct ocfs2_extent_rec *right) 470 { 471 u32 left_range; 472 473 left_range = le32_to_cpu(left->e_cpos) + 474 le16_to_cpu(left->e_leaf_clusters); 475 476 return (left_range == le32_to_cpu(right->e_cpos)); 477 } 478 479 static enum ocfs2_contig_type 480 ocfs2_extent_contig(struct inode *inode, 481 struct ocfs2_extent_rec *ext, 482 struct ocfs2_extent_rec *insert_rec) 483 { 484 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 485 486 /* 487 * Refuse to coalesce extent records with different flag 488 * fields - we don't want to mix unwritten extents with user 489 * data. 490 */ 491 if (ext->e_flags != insert_rec->e_flags) 492 return CONTIG_NONE; 493 494 if (ocfs2_extents_adjacent(ext, insert_rec) && 495 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 496 return CONTIG_RIGHT; 497 498 blkno = le64_to_cpu(ext->e_blkno); 499 if (ocfs2_extents_adjacent(insert_rec, ext) && 500 ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) 501 return CONTIG_LEFT; 502 503 return CONTIG_NONE; 504 } 505 506 /* 507 * NOTE: We can have pretty much any combination of contiguousness and 508 * appending. 509 * 510 * The usefulness of APPEND_TAIL is more in that it lets us know that 511 * we'll have to update the path to that leaf. 512 */ 513 enum ocfs2_append_type { 514 APPEND_NONE = 0, 515 APPEND_TAIL, 516 }; 517 518 enum ocfs2_split_type { 519 SPLIT_NONE = 0, 520 SPLIT_LEFT, 521 SPLIT_RIGHT, 522 }; 523 524 struct ocfs2_insert_type { 525 enum ocfs2_split_type ins_split; 526 enum ocfs2_append_type ins_appending; 527 enum ocfs2_contig_type ins_contig; 528 int ins_contig_index; 529 int ins_tree_depth; 530 }; 531 532 struct ocfs2_merge_ctxt { 533 enum ocfs2_contig_type c_contig_type; 534 int c_has_empty_extent; 535 int c_split_covers_rec; 536 }; 537 538 /* 539 * How many free extents have we got before we need more meta data? 540 */ 541 int ocfs2_num_free_extents(struct ocfs2_super *osb, 542 struct inode *inode, 543 struct buffer_head *root_bh, 544 enum ocfs2_extent_tree_type type, 545 void *private) 546 { 547 int retval; 548 struct ocfs2_extent_list *el = NULL; 549 struct ocfs2_extent_block *eb; 550 struct buffer_head *eb_bh = NULL; 551 u64 last_eb_blk = 0; 552 553 mlog_entry_void(); 554 555 if (type == OCFS2_DINODE_EXTENT) { 556 struct ocfs2_dinode *fe = 557 (struct ocfs2_dinode *)root_bh->b_data; 558 if (!OCFS2_IS_VALID_DINODE(fe)) { 559 OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 560 retval = -EIO; 561 goto bail; 562 } 563 564 if (fe->i_last_eb_blk) 565 last_eb_blk = le64_to_cpu(fe->i_last_eb_blk); 566 el = &fe->id2.i_list; 567 } else if (type == OCFS2_XATTR_VALUE_EXTENT) { 568 struct ocfs2_xattr_value_root *xv = 569 (struct ocfs2_xattr_value_root *) private; 570 571 last_eb_blk = le64_to_cpu(xv->xr_last_eb_blk); 572 el = &xv->xr_list; 573 } 574 575 if (last_eb_blk) { 576 retval = ocfs2_read_block(osb, last_eb_blk, 577 &eb_bh, OCFS2_BH_CACHED, inode); 578 if (retval < 0) { 579 mlog_errno(retval); 580 goto bail; 581 } 582 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 583 el = &eb->h_list; 584 } 585 586 BUG_ON(el->l_tree_depth != 0); 587 588 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); 589 bail: 590 if (eb_bh) 591 brelse(eb_bh); 592 593 mlog_exit(retval); 594 return retval; 595 } 596 597 /* expects array to already be allocated 598 * 599 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and 600 * l_count for you 601 */ 602 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 603 handle_t *handle, 604 struct inode *inode, 605 int wanted, 606 struct ocfs2_alloc_context *meta_ac, 607 struct buffer_head *bhs[]) 608 { 609 int count, status, i; 610 u16 suballoc_bit_start; 611 u32 num_got; 612 u64 first_blkno; 613 struct ocfs2_extent_block *eb; 614 615 mlog_entry_void(); 616 617 count = 0; 618 while (count < wanted) { 619 status = ocfs2_claim_metadata(osb, 620 handle, 621 meta_ac, 622 wanted - count, 623 &suballoc_bit_start, 624 &num_got, 625 &first_blkno); 626 if (status < 0) { 627 mlog_errno(status); 628 goto bail; 629 } 630 631 for(i = count; i < (num_got + count); i++) { 632 bhs[i] = sb_getblk(osb->sb, first_blkno); 633 if (bhs[i] == NULL) { 634 status = -EIO; 635 mlog_errno(status); 636 goto bail; 637 } 638 ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 639 640 status = ocfs2_journal_access(handle, inode, bhs[i], 641 OCFS2_JOURNAL_ACCESS_CREATE); 642 if (status < 0) { 643 mlog_errno(status); 644 goto bail; 645 } 646 647 memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); 648 eb = (struct ocfs2_extent_block *) bhs[i]->b_data; 649 /* Ok, setup the minimal stuff here. */ 650 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 651 eb->h_blkno = cpu_to_le64(first_blkno); 652 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 653 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 654 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 655 eb->h_list.l_count = 656 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 657 658 suballoc_bit_start++; 659 first_blkno++; 660 661 /* We'll also be dirtied by the caller, so 662 * this isn't absolutely necessary. */ 663 status = ocfs2_journal_dirty(handle, bhs[i]); 664 if (status < 0) { 665 mlog_errno(status); 666 goto bail; 667 } 668 } 669 670 count += num_got; 671 } 672 673 status = 0; 674 bail: 675 if (status < 0) { 676 for(i = 0; i < wanted; i++) { 677 if (bhs[i]) 678 brelse(bhs[i]); 679 bhs[i] = NULL; 680 } 681 } 682 mlog_exit(status); 683 return status; 684 } 685 686 /* 687 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). 688 * 689 * Returns the sum of the rightmost extent rec logical offset and 690 * cluster count. 691 * 692 * ocfs2_add_branch() uses this to determine what logical cluster 693 * value should be populated into the leftmost new branch records. 694 * 695 * ocfs2_shift_tree_depth() uses this to determine the # clusters 696 * value for the new topmost tree record. 697 */ 698 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) 699 { 700 int i; 701 702 i = le16_to_cpu(el->l_next_free_rec) - 1; 703 704 return le32_to_cpu(el->l_recs[i].e_cpos) + 705 ocfs2_rec_clusters(el, &el->l_recs[i]); 706 } 707 708 /* 709 * Add an entire tree branch to our inode. eb_bh is the extent block 710 * to start at, if we don't want to start the branch at the dinode 711 * structure. 712 * 713 * last_eb_bh is required as we have to update it's next_leaf pointer 714 * for the new last extent block. 715 * 716 * the new branch will be 'empty' in the sense that every block will 717 * contain a single record with cluster count == 0. 718 */ 719 static int ocfs2_add_branch(struct ocfs2_super *osb, 720 handle_t *handle, 721 struct inode *inode, 722 struct ocfs2_extent_tree *et, 723 struct buffer_head *eb_bh, 724 struct buffer_head **last_eb_bh, 725 struct ocfs2_alloc_context *meta_ac) 726 { 727 int status, new_blocks, i; 728 u64 next_blkno, new_last_eb_blk; 729 struct buffer_head *bh; 730 struct buffer_head **new_eb_bhs = NULL; 731 struct ocfs2_extent_block *eb; 732 struct ocfs2_extent_list *eb_el; 733 struct ocfs2_extent_list *el; 734 u32 new_cpos; 735 736 mlog_entry_void(); 737 738 BUG_ON(!last_eb_bh || !*last_eb_bh); 739 740 if (eb_bh) { 741 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 742 el = &eb->h_list; 743 } else 744 el = et->root_el; 745 746 /* we never add a branch to a leaf. */ 747 BUG_ON(!el->l_tree_depth); 748 749 new_blocks = le16_to_cpu(el->l_tree_depth); 750 751 /* allocate the number of new eb blocks we need */ 752 new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), 753 GFP_KERNEL); 754 if (!new_eb_bhs) { 755 status = -ENOMEM; 756 mlog_errno(status); 757 goto bail; 758 } 759 760 status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, 761 meta_ac, new_eb_bhs); 762 if (status < 0) { 763 mlog_errno(status); 764 goto bail; 765 } 766 767 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; 768 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 769 770 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 771 * linked with the rest of the tree. 772 * conversly, new_eb_bhs[0] is the new bottommost leaf. 773 * 774 * when we leave the loop, new_last_eb_blk will point to the 775 * newest leaf, and next_blkno will point to the topmost extent 776 * block. */ 777 next_blkno = new_last_eb_blk = 0; 778 for(i = 0; i < new_blocks; i++) { 779 bh = new_eb_bhs[i]; 780 eb = (struct ocfs2_extent_block *) bh->b_data; 781 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 782 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 783 status = -EIO; 784 goto bail; 785 } 786 eb_el = &eb->h_list; 787 788 status = ocfs2_journal_access(handle, inode, bh, 789 OCFS2_JOURNAL_ACCESS_CREATE); 790 if (status < 0) { 791 mlog_errno(status); 792 goto bail; 793 } 794 795 eb->h_next_leaf_blk = 0; 796 eb_el->l_tree_depth = cpu_to_le16(i); 797 eb_el->l_next_free_rec = cpu_to_le16(1); 798 /* 799 * This actually counts as an empty extent as 800 * c_clusters == 0 801 */ 802 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); 803 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); 804 /* 805 * eb_el isn't always an interior node, but even leaf 806 * nodes want a zero'd flags and reserved field so 807 * this gets the whole 32 bits regardless of use. 808 */ 809 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); 810 if (!eb_el->l_tree_depth) 811 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 812 813 status = ocfs2_journal_dirty(handle, bh); 814 if (status < 0) { 815 mlog_errno(status); 816 goto bail; 817 } 818 819 next_blkno = le64_to_cpu(eb->h_blkno); 820 } 821 822 /* This is a bit hairy. We want to update up to three blocks 823 * here without leaving any of them in an inconsistent state 824 * in case of error. We don't have to worry about 825 * journal_dirty erroring as it won't unless we've aborted the 826 * handle (in which case we would never be here) so reserving 827 * the write with journal_access is all we need to do. */ 828 status = ocfs2_journal_access(handle, inode, *last_eb_bh, 829 OCFS2_JOURNAL_ACCESS_WRITE); 830 if (status < 0) { 831 mlog_errno(status); 832 goto bail; 833 } 834 status = ocfs2_journal_access(handle, inode, et->root_bh, 835 OCFS2_JOURNAL_ACCESS_WRITE); 836 if (status < 0) { 837 mlog_errno(status); 838 goto bail; 839 } 840 if (eb_bh) { 841 status = ocfs2_journal_access(handle, inode, eb_bh, 842 OCFS2_JOURNAL_ACCESS_WRITE); 843 if (status < 0) { 844 mlog_errno(status); 845 goto bail; 846 } 847 } 848 849 /* Link the new branch into the rest of the tree (el will 850 * either be on the root_bh, or the extent block passed in. */ 851 i = le16_to_cpu(el->l_next_free_rec); 852 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 853 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 854 el->l_recs[i].e_int_clusters = 0; 855 le16_add_cpu(&el->l_next_free_rec, 1); 856 857 /* fe needs a new last extent block pointer, as does the 858 * next_leaf on the previously last-extent-block. */ 859 ocfs2_set_last_eb_blk(et, new_last_eb_blk); 860 861 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 862 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 863 864 status = ocfs2_journal_dirty(handle, *last_eb_bh); 865 if (status < 0) 866 mlog_errno(status); 867 status = ocfs2_journal_dirty(handle, et->root_bh); 868 if (status < 0) 869 mlog_errno(status); 870 if (eb_bh) { 871 status = ocfs2_journal_dirty(handle, eb_bh); 872 if (status < 0) 873 mlog_errno(status); 874 } 875 876 /* 877 * Some callers want to track the rightmost leaf so pass it 878 * back here. 879 */ 880 brelse(*last_eb_bh); 881 get_bh(new_eb_bhs[0]); 882 *last_eb_bh = new_eb_bhs[0]; 883 884 status = 0; 885 bail: 886 if (new_eb_bhs) { 887 for (i = 0; i < new_blocks; i++) 888 if (new_eb_bhs[i]) 889 brelse(new_eb_bhs[i]); 890 kfree(new_eb_bhs); 891 } 892 893 mlog_exit(status); 894 return status; 895 } 896 897 /* 898 * adds another level to the allocation tree. 899 * returns back the new extent block so you can add a branch to it 900 * after this call. 901 */ 902 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 903 handle_t *handle, 904 struct inode *inode, 905 struct ocfs2_extent_tree *et, 906 struct ocfs2_alloc_context *meta_ac, 907 struct buffer_head **ret_new_eb_bh) 908 { 909 int status, i; 910 u32 new_clusters; 911 struct buffer_head *new_eb_bh = NULL; 912 struct ocfs2_extent_block *eb; 913 struct ocfs2_extent_list *root_el; 914 struct ocfs2_extent_list *eb_el; 915 916 mlog_entry_void(); 917 918 status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, 919 &new_eb_bh); 920 if (status < 0) { 921 mlog_errno(status); 922 goto bail; 923 } 924 925 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; 926 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 927 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 928 status = -EIO; 929 goto bail; 930 } 931 932 eb_el = &eb->h_list; 933 root_el = et->root_el; 934 935 status = ocfs2_journal_access(handle, inode, new_eb_bh, 936 OCFS2_JOURNAL_ACCESS_CREATE); 937 if (status < 0) { 938 mlog_errno(status); 939 goto bail; 940 } 941 942 /* copy the root extent list data into the new extent block */ 943 eb_el->l_tree_depth = root_el->l_tree_depth; 944 eb_el->l_next_free_rec = root_el->l_next_free_rec; 945 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) 946 eb_el->l_recs[i] = root_el->l_recs[i]; 947 948 status = ocfs2_journal_dirty(handle, new_eb_bh); 949 if (status < 0) { 950 mlog_errno(status); 951 goto bail; 952 } 953 954 status = ocfs2_journal_access(handle, inode, et->root_bh, 955 OCFS2_JOURNAL_ACCESS_WRITE); 956 if (status < 0) { 957 mlog_errno(status); 958 goto bail; 959 } 960 961 new_clusters = ocfs2_sum_rightmost_rec(eb_el); 962 963 /* update root_bh now */ 964 le16_add_cpu(&root_el->l_tree_depth, 1); 965 root_el->l_recs[0].e_cpos = 0; 966 root_el->l_recs[0].e_blkno = eb->h_blkno; 967 root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); 968 for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) 969 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 970 root_el->l_next_free_rec = cpu_to_le16(1); 971 972 /* If this is our 1st tree depth shift, then last_eb_blk 973 * becomes the allocated extent block */ 974 if (root_el->l_tree_depth == cpu_to_le16(1)) 975 ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 976 977 status = ocfs2_journal_dirty(handle, et->root_bh); 978 if (status < 0) { 979 mlog_errno(status); 980 goto bail; 981 } 982 983 *ret_new_eb_bh = new_eb_bh; 984 new_eb_bh = NULL; 985 status = 0; 986 bail: 987 if (new_eb_bh) 988 brelse(new_eb_bh); 989 990 mlog_exit(status); 991 return status; 992 } 993 994 /* 995 * Should only be called when there is no space left in any of the 996 * leaf nodes. What we want to do is find the lowest tree depth 997 * non-leaf extent block with room for new records. There are three 998 * valid results of this search: 999 * 1000 * 1) a lowest extent block is found, then we pass it back in 1001 * *lowest_eb_bh and return '0' 1002 * 1003 * 2) the search fails to find anything, but the root_el has room. We 1004 * pass NULL back in *lowest_eb_bh, but still return '0' 1005 * 1006 * 3) the search fails to find anything AND the root_el is full, in 1007 * which case we return > 0 1008 * 1009 * return status < 0 indicates an error. 1010 */ 1011 static int ocfs2_find_branch_target(struct ocfs2_super *osb, 1012 struct inode *inode, 1013 struct ocfs2_extent_tree *et, 1014 struct buffer_head **target_bh) 1015 { 1016 int status = 0, i; 1017 u64 blkno; 1018 struct ocfs2_extent_block *eb; 1019 struct ocfs2_extent_list *el; 1020 struct buffer_head *bh = NULL; 1021 struct buffer_head *lowest_bh = NULL; 1022 1023 mlog_entry_void(); 1024 1025 *target_bh = NULL; 1026 1027 el = et->root_el; 1028 1029 while(le16_to_cpu(el->l_tree_depth) > 1) { 1030 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1031 ocfs2_error(inode->i_sb, "Dinode %llu has empty " 1032 "extent list (next_free_rec == 0)", 1033 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1034 status = -EIO; 1035 goto bail; 1036 } 1037 i = le16_to_cpu(el->l_next_free_rec) - 1; 1038 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1039 if (!blkno) { 1040 ocfs2_error(inode->i_sb, "Dinode %llu has extent " 1041 "list where extent # %d has no physical " 1042 "block start", 1043 (unsigned long long)OCFS2_I(inode)->ip_blkno, i); 1044 status = -EIO; 1045 goto bail; 1046 } 1047 1048 if (bh) { 1049 brelse(bh); 1050 bh = NULL; 1051 } 1052 1053 status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, 1054 inode); 1055 if (status < 0) { 1056 mlog_errno(status); 1057 goto bail; 1058 } 1059 1060 eb = (struct ocfs2_extent_block *) bh->b_data; 1061 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1062 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1063 status = -EIO; 1064 goto bail; 1065 } 1066 el = &eb->h_list; 1067 1068 if (le16_to_cpu(el->l_next_free_rec) < 1069 le16_to_cpu(el->l_count)) { 1070 if (lowest_bh) 1071 brelse(lowest_bh); 1072 lowest_bh = bh; 1073 get_bh(lowest_bh); 1074 } 1075 } 1076 1077 /* If we didn't find one and the fe doesn't have any room, 1078 * then return '1' */ 1079 el = et->root_el; 1080 if (!lowest_bh && (el->l_next_free_rec == el->l_count)) 1081 status = 1; 1082 1083 *target_bh = lowest_bh; 1084 bail: 1085 if (bh) 1086 brelse(bh); 1087 1088 mlog_exit(status); 1089 return status; 1090 } 1091 1092 /* 1093 * Grow a b-tree so that it has more records. 1094 * 1095 * We might shift the tree depth in which case existing paths should 1096 * be considered invalid. 1097 * 1098 * Tree depth after the grow is returned via *final_depth. 1099 * 1100 * *last_eb_bh will be updated by ocfs2_add_branch(). 1101 */ 1102 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, 1103 struct ocfs2_extent_tree *et, int *final_depth, 1104 struct buffer_head **last_eb_bh, 1105 struct ocfs2_alloc_context *meta_ac) 1106 { 1107 int ret, shift; 1108 struct ocfs2_extent_list *el = et->root_el; 1109 int depth = le16_to_cpu(el->l_tree_depth); 1110 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1111 struct buffer_head *bh = NULL; 1112 1113 BUG_ON(meta_ac == NULL); 1114 1115 shift = ocfs2_find_branch_target(osb, inode, et, &bh); 1116 if (shift < 0) { 1117 ret = shift; 1118 mlog_errno(ret); 1119 goto out; 1120 } 1121 1122 /* We traveled all the way to the bottom of the allocation tree 1123 * and didn't find room for any more extents - we need to add 1124 * another tree level */ 1125 if (shift) { 1126 BUG_ON(bh); 1127 mlog(0, "need to shift tree depth (current = %d)\n", depth); 1128 1129 /* ocfs2_shift_tree_depth will return us a buffer with 1130 * the new extent block (so we can pass that to 1131 * ocfs2_add_branch). */ 1132 ret = ocfs2_shift_tree_depth(osb, handle, inode, et, 1133 meta_ac, &bh); 1134 if (ret < 0) { 1135 mlog_errno(ret); 1136 goto out; 1137 } 1138 depth++; 1139 if (depth == 1) { 1140 /* 1141 * Special case: we have room now if we shifted from 1142 * tree_depth 0, so no more work needs to be done. 1143 * 1144 * We won't be calling add_branch, so pass 1145 * back *last_eb_bh as the new leaf. At depth 1146 * zero, it should always be null so there's 1147 * no reason to brelse. 1148 */ 1149 BUG_ON(*last_eb_bh); 1150 get_bh(bh); 1151 *last_eb_bh = bh; 1152 goto out; 1153 } 1154 } 1155 1156 /* call ocfs2_add_branch to add the final part of the tree with 1157 * the new data. */ 1158 mlog(0, "add branch. bh = %p\n", bh); 1159 ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, 1160 meta_ac); 1161 if (ret < 0) { 1162 mlog_errno(ret); 1163 goto out; 1164 } 1165 1166 out: 1167 if (final_depth) 1168 *final_depth = depth; 1169 brelse(bh); 1170 return ret; 1171 } 1172 1173 /* 1174 * This function will discard the rightmost extent record. 1175 */ 1176 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) 1177 { 1178 int next_free = le16_to_cpu(el->l_next_free_rec); 1179 int count = le16_to_cpu(el->l_count); 1180 unsigned int num_bytes; 1181 1182 BUG_ON(!next_free); 1183 /* This will cause us to go off the end of our extent list. */ 1184 BUG_ON(next_free >= count); 1185 1186 num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; 1187 1188 memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); 1189 } 1190 1191 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, 1192 struct ocfs2_extent_rec *insert_rec) 1193 { 1194 int i, insert_index, next_free, has_empty, num_bytes; 1195 u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); 1196 struct ocfs2_extent_rec *rec; 1197 1198 next_free = le16_to_cpu(el->l_next_free_rec); 1199 has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); 1200 1201 BUG_ON(!next_free); 1202 1203 /* The tree code before us didn't allow enough room in the leaf. */ 1204 BUG_ON(el->l_next_free_rec == el->l_count && !has_empty); 1205 1206 /* 1207 * The easiest way to approach this is to just remove the 1208 * empty extent and temporarily decrement next_free. 1209 */ 1210 if (has_empty) { 1211 /* 1212 * If next_free was 1 (only an empty extent), this 1213 * loop won't execute, which is fine. We still want 1214 * the decrement above to happen. 1215 */ 1216 for(i = 0; i < (next_free - 1); i++) 1217 el->l_recs[i] = el->l_recs[i+1]; 1218 1219 next_free--; 1220 } 1221 1222 /* 1223 * Figure out what the new record index should be. 1224 */ 1225 for(i = 0; i < next_free; i++) { 1226 rec = &el->l_recs[i]; 1227 1228 if (insert_cpos < le32_to_cpu(rec->e_cpos)) 1229 break; 1230 } 1231 insert_index = i; 1232 1233 mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", 1234 insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); 1235 1236 BUG_ON(insert_index < 0); 1237 BUG_ON(insert_index >= le16_to_cpu(el->l_count)); 1238 BUG_ON(insert_index > next_free); 1239 1240 /* 1241 * No need to memmove if we're just adding to the tail. 1242 */ 1243 if (insert_index != next_free) { 1244 BUG_ON(next_free >= le16_to_cpu(el->l_count)); 1245 1246 num_bytes = next_free - insert_index; 1247 num_bytes *= sizeof(struct ocfs2_extent_rec); 1248 memmove(&el->l_recs[insert_index + 1], 1249 &el->l_recs[insert_index], 1250 num_bytes); 1251 } 1252 1253 /* 1254 * Either we had an empty extent, and need to re-increment or 1255 * there was no empty extent on a non full rightmost leaf node, 1256 * in which case we still need to increment. 1257 */ 1258 next_free++; 1259 el->l_next_free_rec = cpu_to_le16(next_free); 1260 /* 1261 * Make sure none of the math above just messed up our tree. 1262 */ 1263 BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); 1264 1265 el->l_recs[insert_index] = *insert_rec; 1266 1267 } 1268 1269 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el) 1270 { 1271 int size, num_recs = le16_to_cpu(el->l_next_free_rec); 1272 1273 BUG_ON(num_recs == 0); 1274 1275 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 1276 num_recs--; 1277 size = num_recs * sizeof(struct ocfs2_extent_rec); 1278 memmove(&el->l_recs[0], &el->l_recs[1], size); 1279 memset(&el->l_recs[num_recs], 0, 1280 sizeof(struct ocfs2_extent_rec)); 1281 el->l_next_free_rec = cpu_to_le16(num_recs); 1282 } 1283 } 1284 1285 /* 1286 * Create an empty extent record . 1287 * 1288 * l_next_free_rec may be updated. 1289 * 1290 * If an empty extent already exists do nothing. 1291 */ 1292 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) 1293 { 1294 int next_free = le16_to_cpu(el->l_next_free_rec); 1295 1296 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 1297 1298 if (next_free == 0) 1299 goto set_and_inc; 1300 1301 if (ocfs2_is_empty_extent(&el->l_recs[0])) 1302 return; 1303 1304 mlog_bug_on_msg(el->l_count == el->l_next_free_rec, 1305 "Asked to create an empty extent in a full list:\n" 1306 "count = %u, tree depth = %u", 1307 le16_to_cpu(el->l_count), 1308 le16_to_cpu(el->l_tree_depth)); 1309 1310 ocfs2_shift_records_right(el); 1311 1312 set_and_inc: 1313 le16_add_cpu(&el->l_next_free_rec, 1); 1314 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 1315 } 1316 1317 /* 1318 * For a rotation which involves two leaf nodes, the "root node" is 1319 * the lowest level tree node which contains a path to both leafs. This 1320 * resulting set of information can be used to form a complete "subtree" 1321 * 1322 * This function is passed two full paths from the dinode down to a 1323 * pair of adjacent leaves. It's task is to figure out which path 1324 * index contains the subtree root - this can be the root index itself 1325 * in a worst-case rotation. 1326 * 1327 * The array index of the subtree root is passed back. 1328 */ 1329 static int ocfs2_find_subtree_root(struct inode *inode, 1330 struct ocfs2_path *left, 1331 struct ocfs2_path *right) 1332 { 1333 int i = 0; 1334 1335 /* 1336 * Check that the caller passed in two paths from the same tree. 1337 */ 1338 BUG_ON(path_root_bh(left) != path_root_bh(right)); 1339 1340 do { 1341 i++; 1342 1343 /* 1344 * The caller didn't pass two adjacent paths. 1345 */ 1346 mlog_bug_on_msg(i > left->p_tree_depth, 1347 "Inode %lu, left depth %u, right depth %u\n" 1348 "left leaf blk %llu, right leaf blk %llu\n", 1349 inode->i_ino, left->p_tree_depth, 1350 right->p_tree_depth, 1351 (unsigned long long)path_leaf_bh(left)->b_blocknr, 1352 (unsigned long long)path_leaf_bh(right)->b_blocknr); 1353 } while (left->p_node[i].bh->b_blocknr == 1354 right->p_node[i].bh->b_blocknr); 1355 1356 return i - 1; 1357 } 1358 1359 typedef void (path_insert_t)(void *, struct buffer_head *); 1360 1361 /* 1362 * Traverse a btree path in search of cpos, starting at root_el. 1363 * 1364 * This code can be called with a cpos larger than the tree, in which 1365 * case it will return the rightmost path. 1366 */ 1367 static int __ocfs2_find_path(struct inode *inode, 1368 struct ocfs2_extent_list *root_el, u32 cpos, 1369 path_insert_t *func, void *data) 1370 { 1371 int i, ret = 0; 1372 u32 range; 1373 u64 blkno; 1374 struct buffer_head *bh = NULL; 1375 struct ocfs2_extent_block *eb; 1376 struct ocfs2_extent_list *el; 1377 struct ocfs2_extent_rec *rec; 1378 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1379 1380 el = root_el; 1381 while (el->l_tree_depth) { 1382 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1383 ocfs2_error(inode->i_sb, 1384 "Inode %llu has empty extent list at " 1385 "depth %u\n", 1386 (unsigned long long)oi->ip_blkno, 1387 le16_to_cpu(el->l_tree_depth)); 1388 ret = -EROFS; 1389 goto out; 1390 1391 } 1392 1393 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { 1394 rec = &el->l_recs[i]; 1395 1396 /* 1397 * In the case that cpos is off the allocation 1398 * tree, this should just wind up returning the 1399 * rightmost record. 1400 */ 1401 range = le32_to_cpu(rec->e_cpos) + 1402 ocfs2_rec_clusters(el, rec); 1403 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 1404 break; 1405 } 1406 1407 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1408 if (blkno == 0) { 1409 ocfs2_error(inode->i_sb, 1410 "Inode %llu has bad blkno in extent list " 1411 "at depth %u (index %d)\n", 1412 (unsigned long long)oi->ip_blkno, 1413 le16_to_cpu(el->l_tree_depth), i); 1414 ret = -EROFS; 1415 goto out; 1416 } 1417 1418 brelse(bh); 1419 bh = NULL; 1420 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, 1421 &bh, OCFS2_BH_CACHED, inode); 1422 if (ret) { 1423 mlog_errno(ret); 1424 goto out; 1425 } 1426 1427 eb = (struct ocfs2_extent_block *) bh->b_data; 1428 el = &eb->h_list; 1429 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1430 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1431 ret = -EIO; 1432 goto out; 1433 } 1434 1435 if (le16_to_cpu(el->l_next_free_rec) > 1436 le16_to_cpu(el->l_count)) { 1437 ocfs2_error(inode->i_sb, 1438 "Inode %llu has bad count in extent list " 1439 "at block %llu (next free=%u, count=%u)\n", 1440 (unsigned long long)oi->ip_blkno, 1441 (unsigned long long)bh->b_blocknr, 1442 le16_to_cpu(el->l_next_free_rec), 1443 le16_to_cpu(el->l_count)); 1444 ret = -EROFS; 1445 goto out; 1446 } 1447 1448 if (func) 1449 func(data, bh); 1450 } 1451 1452 out: 1453 /* 1454 * Catch any trailing bh that the loop didn't handle. 1455 */ 1456 brelse(bh); 1457 1458 return ret; 1459 } 1460 1461 /* 1462 * Given an initialized path (that is, it has a valid root extent 1463 * list), this function will traverse the btree in search of the path 1464 * which would contain cpos. 1465 * 1466 * The path traveled is recorded in the path structure. 1467 * 1468 * Note that this will not do any comparisons on leaf node extent 1469 * records, so it will work fine in the case that we just added a tree 1470 * branch. 1471 */ 1472 struct find_path_data { 1473 int index; 1474 struct ocfs2_path *path; 1475 }; 1476 static void find_path_ins(void *data, struct buffer_head *bh) 1477 { 1478 struct find_path_data *fp = data; 1479 1480 get_bh(bh); 1481 ocfs2_path_insert_eb(fp->path, fp->index, bh); 1482 fp->index++; 1483 } 1484 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, 1485 u32 cpos) 1486 { 1487 struct find_path_data data; 1488 1489 data.index = 1; 1490 data.path = path; 1491 return __ocfs2_find_path(inode, path_root_el(path), cpos, 1492 find_path_ins, &data); 1493 } 1494 1495 static void find_leaf_ins(void *data, struct buffer_head *bh) 1496 { 1497 struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; 1498 struct ocfs2_extent_list *el = &eb->h_list; 1499 struct buffer_head **ret = data; 1500 1501 /* We want to retain only the leaf block. */ 1502 if (le16_to_cpu(el->l_tree_depth) == 0) { 1503 get_bh(bh); 1504 *ret = bh; 1505 } 1506 } 1507 /* 1508 * Find the leaf block in the tree which would contain cpos. No 1509 * checking of the actual leaf is done. 1510 * 1511 * Some paths want to call this instead of allocating a path structure 1512 * and calling ocfs2_find_path(). 1513 * 1514 * This function doesn't handle non btree extent lists. 1515 */ 1516 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 1517 u32 cpos, struct buffer_head **leaf_bh) 1518 { 1519 int ret; 1520 struct buffer_head *bh = NULL; 1521 1522 ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); 1523 if (ret) { 1524 mlog_errno(ret); 1525 goto out; 1526 } 1527 1528 *leaf_bh = bh; 1529 out: 1530 return ret; 1531 } 1532 1533 /* 1534 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. 1535 * 1536 * Basically, we've moved stuff around at the bottom of the tree and 1537 * we need to fix up the extent records above the changes to reflect 1538 * the new changes. 1539 * 1540 * left_rec: the record on the left. 1541 * left_child_el: is the child list pointed to by left_rec 1542 * right_rec: the record to the right of left_rec 1543 * right_child_el: is the child list pointed to by right_rec 1544 * 1545 * By definition, this only works on interior nodes. 1546 */ 1547 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, 1548 struct ocfs2_extent_list *left_child_el, 1549 struct ocfs2_extent_rec *right_rec, 1550 struct ocfs2_extent_list *right_child_el) 1551 { 1552 u32 left_clusters, right_end; 1553 1554 /* 1555 * Interior nodes never have holes. Their cpos is the cpos of 1556 * the leftmost record in their child list. Their cluster 1557 * count covers the full theoretical range of their child list 1558 * - the range between their cpos and the cpos of the record 1559 * immediately to their right. 1560 */ 1561 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1562 if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { 1563 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); 1564 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); 1565 } 1566 left_clusters -= le32_to_cpu(left_rec->e_cpos); 1567 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1568 1569 /* 1570 * Calculate the rightmost cluster count boundary before 1571 * moving cpos - we will need to adjust clusters after 1572 * updating e_cpos to keep the same highest cluster count. 1573 */ 1574 right_end = le32_to_cpu(right_rec->e_cpos); 1575 right_end += le32_to_cpu(right_rec->e_int_clusters); 1576 1577 right_rec->e_cpos = left_rec->e_cpos; 1578 le32_add_cpu(&right_rec->e_cpos, left_clusters); 1579 1580 right_end -= le32_to_cpu(right_rec->e_cpos); 1581 right_rec->e_int_clusters = cpu_to_le32(right_end); 1582 } 1583 1584 /* 1585 * Adjust the adjacent root node records involved in a 1586 * rotation. left_el_blkno is passed in as a key so that we can easily 1587 * find it's index in the root list. 1588 */ 1589 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, 1590 struct ocfs2_extent_list *left_el, 1591 struct ocfs2_extent_list *right_el, 1592 u64 left_el_blkno) 1593 { 1594 int i; 1595 1596 BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= 1597 le16_to_cpu(left_el->l_tree_depth)); 1598 1599 for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { 1600 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) 1601 break; 1602 } 1603 1604 /* 1605 * The path walking code should have never returned a root and 1606 * two paths which are not adjacent. 1607 */ 1608 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); 1609 1610 ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, 1611 &root_el->l_recs[i + 1], right_el); 1612 } 1613 1614 /* 1615 * We've changed a leaf block (in right_path) and need to reflect that 1616 * change back up the subtree. 1617 * 1618 * This happens in multiple places: 1619 * - When we've moved an extent record from the left path leaf to the right 1620 * path leaf to make room for an empty extent in the left path leaf. 1621 * - When our insert into the right path leaf is at the leftmost edge 1622 * and requires an update of the path immediately to it's left. This 1623 * can occur at the end of some types of rotation and appending inserts. 1624 * - When we've adjusted the last extent record in the left path leaf and the 1625 * 1st extent record in the right path leaf during cross extent block merge. 1626 */ 1627 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 1628 struct ocfs2_path *left_path, 1629 struct ocfs2_path *right_path, 1630 int subtree_index) 1631 { 1632 int ret, i, idx; 1633 struct ocfs2_extent_list *el, *left_el, *right_el; 1634 struct ocfs2_extent_rec *left_rec, *right_rec; 1635 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 1636 1637 /* 1638 * Update the counts and position values within all the 1639 * interior nodes to reflect the leaf rotation we just did. 1640 * 1641 * The root node is handled below the loop. 1642 * 1643 * We begin the loop with right_el and left_el pointing to the 1644 * leaf lists and work our way up. 1645 * 1646 * NOTE: within this loop, left_el and right_el always refer 1647 * to the *child* lists. 1648 */ 1649 left_el = path_leaf_el(left_path); 1650 right_el = path_leaf_el(right_path); 1651 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { 1652 mlog(0, "Adjust records at index %u\n", i); 1653 1654 /* 1655 * One nice property of knowing that all of these 1656 * nodes are below the root is that we only deal with 1657 * the leftmost right node record and the rightmost 1658 * left node record. 1659 */ 1660 el = left_path->p_node[i].el; 1661 idx = le16_to_cpu(left_el->l_next_free_rec) - 1; 1662 left_rec = &el->l_recs[idx]; 1663 1664 el = right_path->p_node[i].el; 1665 right_rec = &el->l_recs[0]; 1666 1667 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 1668 right_el); 1669 1670 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 1671 if (ret) 1672 mlog_errno(ret); 1673 1674 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); 1675 if (ret) 1676 mlog_errno(ret); 1677 1678 /* 1679 * Setup our list pointers now so that the current 1680 * parents become children in the next iteration. 1681 */ 1682 left_el = left_path->p_node[i].el; 1683 right_el = right_path->p_node[i].el; 1684 } 1685 1686 /* 1687 * At the root node, adjust the two adjacent records which 1688 * begin our path to the leaves. 1689 */ 1690 1691 el = left_path->p_node[subtree_index].el; 1692 left_el = left_path->p_node[subtree_index + 1].el; 1693 right_el = right_path->p_node[subtree_index + 1].el; 1694 1695 ocfs2_adjust_root_records(el, left_el, right_el, 1696 left_path->p_node[subtree_index + 1].bh->b_blocknr); 1697 1698 root_bh = left_path->p_node[subtree_index].bh; 1699 1700 ret = ocfs2_journal_dirty(handle, root_bh); 1701 if (ret) 1702 mlog_errno(ret); 1703 } 1704 1705 static int ocfs2_rotate_subtree_right(struct inode *inode, 1706 handle_t *handle, 1707 struct ocfs2_path *left_path, 1708 struct ocfs2_path *right_path, 1709 int subtree_index) 1710 { 1711 int ret, i; 1712 struct buffer_head *right_leaf_bh; 1713 struct buffer_head *left_leaf_bh = NULL; 1714 struct buffer_head *root_bh; 1715 struct ocfs2_extent_list *right_el, *left_el; 1716 struct ocfs2_extent_rec move_rec; 1717 1718 left_leaf_bh = path_leaf_bh(left_path); 1719 left_el = path_leaf_el(left_path); 1720 1721 if (left_el->l_next_free_rec != left_el->l_count) { 1722 ocfs2_error(inode->i_sb, 1723 "Inode %llu has non-full interior leaf node %llu" 1724 "(next free = %u)", 1725 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1726 (unsigned long long)left_leaf_bh->b_blocknr, 1727 le16_to_cpu(left_el->l_next_free_rec)); 1728 return -EROFS; 1729 } 1730 1731 /* 1732 * This extent block may already have an empty record, so we 1733 * return early if so. 1734 */ 1735 if (ocfs2_is_empty_extent(&left_el->l_recs[0])) 1736 return 0; 1737 1738 root_bh = left_path->p_node[subtree_index].bh; 1739 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 1740 1741 ret = ocfs2_journal_access(handle, inode, root_bh, 1742 OCFS2_JOURNAL_ACCESS_WRITE); 1743 if (ret) { 1744 mlog_errno(ret); 1745 goto out; 1746 } 1747 1748 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 1749 ret = ocfs2_journal_access(handle, inode, 1750 right_path->p_node[i].bh, 1751 OCFS2_JOURNAL_ACCESS_WRITE); 1752 if (ret) { 1753 mlog_errno(ret); 1754 goto out; 1755 } 1756 1757 ret = ocfs2_journal_access(handle, inode, 1758 left_path->p_node[i].bh, 1759 OCFS2_JOURNAL_ACCESS_WRITE); 1760 if (ret) { 1761 mlog_errno(ret); 1762 goto out; 1763 } 1764 } 1765 1766 right_leaf_bh = path_leaf_bh(right_path); 1767 right_el = path_leaf_el(right_path); 1768 1769 /* This is a code error, not a disk corruption. */ 1770 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " 1771 "because rightmost leaf block %llu is empty\n", 1772 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1773 (unsigned long long)right_leaf_bh->b_blocknr); 1774 1775 ocfs2_create_empty_extent(right_el); 1776 1777 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 1778 if (ret) { 1779 mlog_errno(ret); 1780 goto out; 1781 } 1782 1783 /* Do the copy now. */ 1784 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 1785 move_rec = left_el->l_recs[i]; 1786 right_el->l_recs[0] = move_rec; 1787 1788 /* 1789 * Clear out the record we just copied and shift everything 1790 * over, leaving an empty extent in the left leaf. 1791 * 1792 * We temporarily subtract from next_free_rec so that the 1793 * shift will lose the tail record (which is now defunct). 1794 */ 1795 le16_add_cpu(&left_el->l_next_free_rec, -1); 1796 ocfs2_shift_records_right(left_el); 1797 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 1798 le16_add_cpu(&left_el->l_next_free_rec, 1); 1799 1800 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 1801 if (ret) { 1802 mlog_errno(ret); 1803 goto out; 1804 } 1805 1806 ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 1807 subtree_index); 1808 1809 out: 1810 return ret; 1811 } 1812 1813 /* 1814 * Given a full path, determine what cpos value would return us a path 1815 * containing the leaf immediately to the left of the current one. 1816 * 1817 * Will return zero if the path passed in is already the leftmost path. 1818 */ 1819 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 1820 struct ocfs2_path *path, u32 *cpos) 1821 { 1822 int i, j, ret = 0; 1823 u64 blkno; 1824 struct ocfs2_extent_list *el; 1825 1826 BUG_ON(path->p_tree_depth == 0); 1827 1828 *cpos = 0; 1829 1830 blkno = path_leaf_bh(path)->b_blocknr; 1831 1832 /* Start at the tree node just above the leaf and work our way up. */ 1833 i = path->p_tree_depth - 1; 1834 while (i >= 0) { 1835 el = path->p_node[i].el; 1836 1837 /* 1838 * Find the extent record just before the one in our 1839 * path. 1840 */ 1841 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { 1842 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { 1843 if (j == 0) { 1844 if (i == 0) { 1845 /* 1846 * We've determined that the 1847 * path specified is already 1848 * the leftmost one - return a 1849 * cpos of zero. 1850 */ 1851 goto out; 1852 } 1853 /* 1854 * The leftmost record points to our 1855 * leaf - we need to travel up the 1856 * tree one level. 1857 */ 1858 goto next_node; 1859 } 1860 1861 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); 1862 *cpos = *cpos + ocfs2_rec_clusters(el, 1863 &el->l_recs[j - 1]); 1864 *cpos = *cpos - 1; 1865 goto out; 1866 } 1867 } 1868 1869 /* 1870 * If we got here, we never found a valid node where 1871 * the tree indicated one should be. 1872 */ 1873 ocfs2_error(sb, 1874 "Invalid extent tree at extent block %llu\n", 1875 (unsigned long long)blkno); 1876 ret = -EROFS; 1877 goto out; 1878 1879 next_node: 1880 blkno = path->p_node[i].bh->b_blocknr; 1881 i--; 1882 } 1883 1884 out: 1885 return ret; 1886 } 1887 1888 /* 1889 * Extend the transaction by enough credits to complete the rotation, 1890 * and still leave at least the original number of credits allocated 1891 * to this transaction. 1892 */ 1893 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1894 int op_credits, 1895 struct ocfs2_path *path) 1896 { 1897 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 1898 1899 if (handle->h_buffer_credits < credits) 1900 return ocfs2_extend_trans(handle, credits); 1901 1902 return 0; 1903 } 1904 1905 /* 1906 * Trap the case where we're inserting into the theoretical range past 1907 * the _actual_ left leaf range. Otherwise, we'll rotate a record 1908 * whose cpos is less than ours into the right leaf. 1909 * 1910 * It's only necessary to look at the rightmost record of the left 1911 * leaf because the logic that calls us should ensure that the 1912 * theoretical ranges in the path components above the leaves are 1913 * correct. 1914 */ 1915 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, 1916 u32 insert_cpos) 1917 { 1918 struct ocfs2_extent_list *left_el; 1919 struct ocfs2_extent_rec *rec; 1920 int next_free; 1921 1922 left_el = path_leaf_el(left_path); 1923 next_free = le16_to_cpu(left_el->l_next_free_rec); 1924 rec = &left_el->l_recs[next_free - 1]; 1925 1926 if (insert_cpos > le32_to_cpu(rec->e_cpos)) 1927 return 1; 1928 return 0; 1929 } 1930 1931 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) 1932 { 1933 int next_free = le16_to_cpu(el->l_next_free_rec); 1934 unsigned int range; 1935 struct ocfs2_extent_rec *rec; 1936 1937 if (next_free == 0) 1938 return 0; 1939 1940 rec = &el->l_recs[0]; 1941 if (ocfs2_is_empty_extent(rec)) { 1942 /* Empty list. */ 1943 if (next_free == 1) 1944 return 0; 1945 rec = &el->l_recs[1]; 1946 } 1947 1948 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 1949 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 1950 return 1; 1951 return 0; 1952 } 1953 1954 /* 1955 * Rotate all the records in a btree right one record, starting at insert_cpos. 1956 * 1957 * The path to the rightmost leaf should be passed in. 1958 * 1959 * The array is assumed to be large enough to hold an entire path (tree depth). 1960 * 1961 * Upon succesful return from this function: 1962 * 1963 * - The 'right_path' array will contain a path to the leaf block 1964 * whose range contains e_cpos. 1965 * - That leaf block will have a single empty extent in list index 0. 1966 * - In the case that the rotation requires a post-insert update, 1967 * *ret_left_path will contain a valid path which can be passed to 1968 * ocfs2_insert_path(). 1969 */ 1970 static int ocfs2_rotate_tree_right(struct inode *inode, 1971 handle_t *handle, 1972 enum ocfs2_split_type split, 1973 u32 insert_cpos, 1974 struct ocfs2_path *right_path, 1975 struct ocfs2_path **ret_left_path) 1976 { 1977 int ret, start, orig_credits = handle->h_buffer_credits; 1978 u32 cpos; 1979 struct ocfs2_path *left_path = NULL; 1980 1981 *ret_left_path = NULL; 1982 1983 left_path = ocfs2_new_path(path_root_bh(right_path), 1984 path_root_el(right_path)); 1985 if (!left_path) { 1986 ret = -ENOMEM; 1987 mlog_errno(ret); 1988 goto out; 1989 } 1990 1991 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); 1992 if (ret) { 1993 mlog_errno(ret); 1994 goto out; 1995 } 1996 1997 mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); 1998 1999 /* 2000 * What we want to do here is: 2001 * 2002 * 1) Start with the rightmost path. 2003 * 2004 * 2) Determine a path to the leaf block directly to the left 2005 * of that leaf. 2006 * 2007 * 3) Determine the 'subtree root' - the lowest level tree node 2008 * which contains a path to both leaves. 2009 * 2010 * 4) Rotate the subtree. 2011 * 2012 * 5) Find the next subtree by considering the left path to be 2013 * the new right path. 2014 * 2015 * The check at the top of this while loop also accepts 2016 * insert_cpos == cpos because cpos is only a _theoretical_ 2017 * value to get us the left path - insert_cpos might very well 2018 * be filling that hole. 2019 * 2020 * Stop at a cpos of '0' because we either started at the 2021 * leftmost branch (i.e., a tree with one branch and a 2022 * rotation inside of it), or we've gone as far as we can in 2023 * rotating subtrees. 2024 */ 2025 while (cpos && insert_cpos <= cpos) { 2026 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", 2027 insert_cpos, cpos); 2028 2029 ret = ocfs2_find_path(inode, left_path, cpos); 2030 if (ret) { 2031 mlog_errno(ret); 2032 goto out; 2033 } 2034 2035 mlog_bug_on_msg(path_leaf_bh(left_path) == 2036 path_leaf_bh(right_path), 2037 "Inode %lu: error during insert of %u " 2038 "(left path cpos %u) results in two identical " 2039 "paths ending at %llu\n", 2040 inode->i_ino, insert_cpos, cpos, 2041 (unsigned long long) 2042 path_leaf_bh(left_path)->b_blocknr); 2043 2044 if (split == SPLIT_NONE && 2045 ocfs2_rotate_requires_path_adjustment(left_path, 2046 insert_cpos)) { 2047 2048 /* 2049 * We've rotated the tree as much as we 2050 * should. The rest is up to 2051 * ocfs2_insert_path() to complete, after the 2052 * record insertion. We indicate this 2053 * situation by returning the left path. 2054 * 2055 * The reason we don't adjust the records here 2056 * before the record insert is that an error 2057 * later might break the rule where a parent 2058 * record e_cpos will reflect the actual 2059 * e_cpos of the 1st nonempty record of the 2060 * child list. 2061 */ 2062 *ret_left_path = left_path; 2063 goto out_ret_path; 2064 } 2065 2066 start = ocfs2_find_subtree_root(inode, left_path, right_path); 2067 2068 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2069 start, 2070 (unsigned long long) right_path->p_node[start].bh->b_blocknr, 2071 right_path->p_tree_depth); 2072 2073 ret = ocfs2_extend_rotate_transaction(handle, start, 2074 orig_credits, right_path); 2075 if (ret) { 2076 mlog_errno(ret); 2077 goto out; 2078 } 2079 2080 ret = ocfs2_rotate_subtree_right(inode, handle, left_path, 2081 right_path, start); 2082 if (ret) { 2083 mlog_errno(ret); 2084 goto out; 2085 } 2086 2087 if (split != SPLIT_NONE && 2088 ocfs2_leftmost_rec_contains(path_leaf_el(right_path), 2089 insert_cpos)) { 2090 /* 2091 * A rotate moves the rightmost left leaf 2092 * record over to the leftmost right leaf 2093 * slot. If we're doing an extent split 2094 * instead of a real insert, then we have to 2095 * check that the extent to be split wasn't 2096 * just moved over. If it was, then we can 2097 * exit here, passing left_path back - 2098 * ocfs2_split_extent() is smart enough to 2099 * search both leaves. 2100 */ 2101 *ret_left_path = left_path; 2102 goto out_ret_path; 2103 } 2104 2105 /* 2106 * There is no need to re-read the next right path 2107 * as we know that it'll be our current left 2108 * path. Optimize by copying values instead. 2109 */ 2110 ocfs2_mv_path(right_path, left_path); 2111 2112 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 2113 &cpos); 2114 if (ret) { 2115 mlog_errno(ret); 2116 goto out; 2117 } 2118 } 2119 2120 out: 2121 ocfs2_free_path(left_path); 2122 2123 out_ret_path: 2124 return ret; 2125 } 2126 2127 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, 2128 struct ocfs2_path *path) 2129 { 2130 int i, idx; 2131 struct ocfs2_extent_rec *rec; 2132 struct ocfs2_extent_list *el; 2133 struct ocfs2_extent_block *eb; 2134 u32 range; 2135 2136 /* Path should always be rightmost. */ 2137 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 2138 BUG_ON(eb->h_next_leaf_blk != 0ULL); 2139 2140 el = &eb->h_list; 2141 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 2142 idx = le16_to_cpu(el->l_next_free_rec) - 1; 2143 rec = &el->l_recs[idx]; 2144 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 2145 2146 for (i = 0; i < path->p_tree_depth; i++) { 2147 el = path->p_node[i].el; 2148 idx = le16_to_cpu(el->l_next_free_rec) - 1; 2149 rec = &el->l_recs[idx]; 2150 2151 rec->e_int_clusters = cpu_to_le32(range); 2152 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); 2153 2154 ocfs2_journal_dirty(handle, path->p_node[i].bh); 2155 } 2156 } 2157 2158 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, 2159 struct ocfs2_cached_dealloc_ctxt *dealloc, 2160 struct ocfs2_path *path, int unlink_start) 2161 { 2162 int ret, i; 2163 struct ocfs2_extent_block *eb; 2164 struct ocfs2_extent_list *el; 2165 struct buffer_head *bh; 2166 2167 for(i = unlink_start; i < path_num_items(path); i++) { 2168 bh = path->p_node[i].bh; 2169 2170 eb = (struct ocfs2_extent_block *)bh->b_data; 2171 /* 2172 * Not all nodes might have had their final count 2173 * decremented by the caller - handle this here. 2174 */ 2175 el = &eb->h_list; 2176 if (le16_to_cpu(el->l_next_free_rec) > 1) { 2177 mlog(ML_ERROR, 2178 "Inode %llu, attempted to remove extent block " 2179 "%llu with %u records\n", 2180 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2181 (unsigned long long)le64_to_cpu(eb->h_blkno), 2182 le16_to_cpu(el->l_next_free_rec)); 2183 2184 ocfs2_journal_dirty(handle, bh); 2185 ocfs2_remove_from_cache(inode, bh); 2186 continue; 2187 } 2188 2189 el->l_next_free_rec = 0; 2190 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2191 2192 ocfs2_journal_dirty(handle, bh); 2193 2194 ret = ocfs2_cache_extent_block_free(dealloc, eb); 2195 if (ret) 2196 mlog_errno(ret); 2197 2198 ocfs2_remove_from_cache(inode, bh); 2199 } 2200 } 2201 2202 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, 2203 struct ocfs2_path *left_path, 2204 struct ocfs2_path *right_path, 2205 int subtree_index, 2206 struct ocfs2_cached_dealloc_ctxt *dealloc) 2207 { 2208 int i; 2209 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2210 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; 2211 struct ocfs2_extent_list *el; 2212 struct ocfs2_extent_block *eb; 2213 2214 el = path_leaf_el(left_path); 2215 2216 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; 2217 2218 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) 2219 if (root_el->l_recs[i].e_blkno == eb->h_blkno) 2220 break; 2221 2222 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec)); 2223 2224 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 2225 le16_add_cpu(&root_el->l_next_free_rec, -1); 2226 2227 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2228 eb->h_next_leaf_blk = 0; 2229 2230 ocfs2_journal_dirty(handle, root_bh); 2231 ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2232 2233 ocfs2_unlink_path(inode, handle, dealloc, right_path, 2234 subtree_index + 1); 2235 } 2236 2237 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, 2238 struct ocfs2_path *left_path, 2239 struct ocfs2_path *right_path, 2240 int subtree_index, 2241 struct ocfs2_cached_dealloc_ctxt *dealloc, 2242 int *deleted, 2243 struct ocfs2_extent_tree *et) 2244 { 2245 int ret, i, del_right_subtree = 0, right_has_empty = 0; 2246 struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); 2247 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; 2248 struct ocfs2_extent_block *eb; 2249 2250 *deleted = 0; 2251 2252 right_leaf_el = path_leaf_el(right_path); 2253 left_leaf_el = path_leaf_el(left_path); 2254 root_bh = left_path->p_node[subtree_index].bh; 2255 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 2256 2257 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0])) 2258 return 0; 2259 2260 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data; 2261 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) { 2262 /* 2263 * It's legal for us to proceed if the right leaf is 2264 * the rightmost one and it has an empty extent. There 2265 * are two cases to handle - whether the leaf will be 2266 * empty after removal or not. If the leaf isn't empty 2267 * then just remove the empty extent up front. The 2268 * next block will handle empty leaves by flagging 2269 * them for unlink. 2270 * 2271 * Non rightmost leaves will throw -EAGAIN and the 2272 * caller can manually move the subtree and retry. 2273 */ 2274 2275 if (eb->h_next_leaf_blk != 0ULL) 2276 return -EAGAIN; 2277 2278 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { 2279 ret = ocfs2_journal_access(handle, inode, 2280 path_leaf_bh(right_path), 2281 OCFS2_JOURNAL_ACCESS_WRITE); 2282 if (ret) { 2283 mlog_errno(ret); 2284 goto out; 2285 } 2286 2287 ocfs2_remove_empty_extent(right_leaf_el); 2288 } else 2289 right_has_empty = 1; 2290 } 2291 2292 if (eb->h_next_leaf_blk == 0ULL && 2293 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) { 2294 /* 2295 * We have to update i_last_eb_blk during the meta 2296 * data delete. 2297 */ 2298 ret = ocfs2_journal_access(handle, inode, et_root_bh, 2299 OCFS2_JOURNAL_ACCESS_WRITE); 2300 if (ret) { 2301 mlog_errno(ret); 2302 goto out; 2303 } 2304 2305 del_right_subtree = 1; 2306 } 2307 2308 /* 2309 * Getting here with an empty extent in the right path implies 2310 * that it's the rightmost path and will be deleted. 2311 */ 2312 BUG_ON(right_has_empty && !del_right_subtree); 2313 2314 ret = ocfs2_journal_access(handle, inode, root_bh, 2315 OCFS2_JOURNAL_ACCESS_WRITE); 2316 if (ret) { 2317 mlog_errno(ret); 2318 goto out; 2319 } 2320 2321 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2322 ret = ocfs2_journal_access(handle, inode, 2323 right_path->p_node[i].bh, 2324 OCFS2_JOURNAL_ACCESS_WRITE); 2325 if (ret) { 2326 mlog_errno(ret); 2327 goto out; 2328 } 2329 2330 ret = ocfs2_journal_access(handle, inode, 2331 left_path->p_node[i].bh, 2332 OCFS2_JOURNAL_ACCESS_WRITE); 2333 if (ret) { 2334 mlog_errno(ret); 2335 goto out; 2336 } 2337 } 2338 2339 if (!right_has_empty) { 2340 /* 2341 * Only do this if we're moving a real 2342 * record. Otherwise, the action is delayed until 2343 * after removal of the right path in which case we 2344 * can do a simple shift to remove the empty extent. 2345 */ 2346 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]); 2347 memset(&right_leaf_el->l_recs[0], 0, 2348 sizeof(struct ocfs2_extent_rec)); 2349 } 2350 if (eb->h_next_leaf_blk == 0ULL) { 2351 /* 2352 * Move recs over to get rid of empty extent, decrease 2353 * next_free. This is allowed to remove the last 2354 * extent in our leaf (setting l_next_free_rec to 2355 * zero) - the delete code below won't care. 2356 */ 2357 ocfs2_remove_empty_extent(right_leaf_el); 2358 } 2359 2360 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2361 if (ret) 2362 mlog_errno(ret); 2363 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 2364 if (ret) 2365 mlog_errno(ret); 2366 2367 if (del_right_subtree) { 2368 ocfs2_unlink_subtree(inode, handle, left_path, right_path, 2369 subtree_index, dealloc); 2370 ocfs2_update_edge_lengths(inode, handle, left_path); 2371 2372 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2373 ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 2374 2375 /* 2376 * Removal of the extent in the left leaf was skipped 2377 * above so we could delete the right path 2378 * 1st. 2379 */ 2380 if (right_has_empty) 2381 ocfs2_remove_empty_extent(left_leaf_el); 2382 2383 ret = ocfs2_journal_dirty(handle, et_root_bh); 2384 if (ret) 2385 mlog_errno(ret); 2386 2387 *deleted = 1; 2388 } else 2389 ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 2390 subtree_index); 2391 2392 out: 2393 return ret; 2394 } 2395 2396 /* 2397 * Given a full path, determine what cpos value would return us a path 2398 * containing the leaf immediately to the right of the current one. 2399 * 2400 * Will return zero if the path passed in is already the rightmost path. 2401 * 2402 * This looks similar, but is subtly different to 2403 * ocfs2_find_cpos_for_left_leaf(). 2404 */ 2405 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 2406 struct ocfs2_path *path, u32 *cpos) 2407 { 2408 int i, j, ret = 0; 2409 u64 blkno; 2410 struct ocfs2_extent_list *el; 2411 2412 *cpos = 0; 2413 2414 if (path->p_tree_depth == 0) 2415 return 0; 2416 2417 blkno = path_leaf_bh(path)->b_blocknr; 2418 2419 /* Start at the tree node just above the leaf and work our way up. */ 2420 i = path->p_tree_depth - 1; 2421 while (i >= 0) { 2422 int next_free; 2423 2424 el = path->p_node[i].el; 2425 2426 /* 2427 * Find the extent record just after the one in our 2428 * path. 2429 */ 2430 next_free = le16_to_cpu(el->l_next_free_rec); 2431 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { 2432 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { 2433 if (j == (next_free - 1)) { 2434 if (i == 0) { 2435 /* 2436 * We've determined that the 2437 * path specified is already 2438 * the rightmost one - return a 2439 * cpos of zero. 2440 */ 2441 goto out; 2442 } 2443 /* 2444 * The rightmost record points to our 2445 * leaf - we need to travel up the 2446 * tree one level. 2447 */ 2448 goto next_node; 2449 } 2450 2451 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos); 2452 goto out; 2453 } 2454 } 2455 2456 /* 2457 * If we got here, we never found a valid node where 2458 * the tree indicated one should be. 2459 */ 2460 ocfs2_error(sb, 2461 "Invalid extent tree at extent block %llu\n", 2462 (unsigned long long)blkno); 2463 ret = -EROFS; 2464 goto out; 2465 2466 next_node: 2467 blkno = path->p_node[i].bh->b_blocknr; 2468 i--; 2469 } 2470 2471 out: 2472 return ret; 2473 } 2474 2475 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, 2476 handle_t *handle, 2477 struct buffer_head *bh, 2478 struct ocfs2_extent_list *el) 2479 { 2480 int ret; 2481 2482 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2483 return 0; 2484 2485 ret = ocfs2_journal_access(handle, inode, bh, 2486 OCFS2_JOURNAL_ACCESS_WRITE); 2487 if (ret) { 2488 mlog_errno(ret); 2489 goto out; 2490 } 2491 2492 ocfs2_remove_empty_extent(el); 2493 2494 ret = ocfs2_journal_dirty(handle, bh); 2495 if (ret) 2496 mlog_errno(ret); 2497 2498 out: 2499 return ret; 2500 } 2501 2502 static int __ocfs2_rotate_tree_left(struct inode *inode, 2503 handle_t *handle, int orig_credits, 2504 struct ocfs2_path *path, 2505 struct ocfs2_cached_dealloc_ctxt *dealloc, 2506 struct ocfs2_path **empty_extent_path, 2507 struct ocfs2_extent_tree *et) 2508 { 2509 int ret, subtree_root, deleted; 2510 u32 right_cpos; 2511 struct ocfs2_path *left_path = NULL; 2512 struct ocfs2_path *right_path = NULL; 2513 2514 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); 2515 2516 *empty_extent_path = NULL; 2517 2518 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path, 2519 &right_cpos); 2520 if (ret) { 2521 mlog_errno(ret); 2522 goto out; 2523 } 2524 2525 left_path = ocfs2_new_path(path_root_bh(path), 2526 path_root_el(path)); 2527 if (!left_path) { 2528 ret = -ENOMEM; 2529 mlog_errno(ret); 2530 goto out; 2531 } 2532 2533 ocfs2_cp_path(left_path, path); 2534 2535 right_path = ocfs2_new_path(path_root_bh(path), 2536 path_root_el(path)); 2537 if (!right_path) { 2538 ret = -ENOMEM; 2539 mlog_errno(ret); 2540 goto out; 2541 } 2542 2543 while (right_cpos) { 2544 ret = ocfs2_find_path(inode, right_path, right_cpos); 2545 if (ret) { 2546 mlog_errno(ret); 2547 goto out; 2548 } 2549 2550 subtree_root = ocfs2_find_subtree_root(inode, left_path, 2551 right_path); 2552 2553 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2554 subtree_root, 2555 (unsigned long long) 2556 right_path->p_node[subtree_root].bh->b_blocknr, 2557 right_path->p_tree_depth); 2558 2559 ret = ocfs2_extend_rotate_transaction(handle, subtree_root, 2560 orig_credits, left_path); 2561 if (ret) { 2562 mlog_errno(ret); 2563 goto out; 2564 } 2565 2566 /* 2567 * Caller might still want to make changes to the 2568 * tree root, so re-add it to the journal here. 2569 */ 2570 ret = ocfs2_journal_access(handle, inode, 2571 path_root_bh(left_path), 2572 OCFS2_JOURNAL_ACCESS_WRITE); 2573 if (ret) { 2574 mlog_errno(ret); 2575 goto out; 2576 } 2577 2578 ret = ocfs2_rotate_subtree_left(inode, handle, left_path, 2579 right_path, subtree_root, 2580 dealloc, &deleted, et); 2581 if (ret == -EAGAIN) { 2582 /* 2583 * The rotation has to temporarily stop due to 2584 * the right subtree having an empty 2585 * extent. Pass it back to the caller for a 2586 * fixup. 2587 */ 2588 *empty_extent_path = right_path; 2589 right_path = NULL; 2590 goto out; 2591 } 2592 if (ret) { 2593 mlog_errno(ret); 2594 goto out; 2595 } 2596 2597 /* 2598 * The subtree rotate might have removed records on 2599 * the rightmost edge. If so, then rotation is 2600 * complete. 2601 */ 2602 if (deleted) 2603 break; 2604 2605 ocfs2_mv_path(left_path, right_path); 2606 2607 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, 2608 &right_cpos); 2609 if (ret) { 2610 mlog_errno(ret); 2611 goto out; 2612 } 2613 } 2614 2615 out: 2616 ocfs2_free_path(right_path); 2617 ocfs2_free_path(left_path); 2618 2619 return ret; 2620 } 2621 2622 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, 2623 struct ocfs2_path *path, 2624 struct ocfs2_cached_dealloc_ctxt *dealloc, 2625 struct ocfs2_extent_tree *et) 2626 { 2627 int ret, subtree_index; 2628 u32 cpos; 2629 struct ocfs2_path *left_path = NULL; 2630 struct ocfs2_extent_block *eb; 2631 struct ocfs2_extent_list *el; 2632 2633 2634 ret = et->eops->sanity_check(inode, et); 2635 if (ret) 2636 goto out; 2637 /* 2638 * There's two ways we handle this depending on 2639 * whether path is the only existing one. 2640 */ 2641 ret = ocfs2_extend_rotate_transaction(handle, 0, 2642 handle->h_buffer_credits, 2643 path); 2644 if (ret) { 2645 mlog_errno(ret); 2646 goto out; 2647 } 2648 2649 ret = ocfs2_journal_access_path(inode, handle, path); 2650 if (ret) { 2651 mlog_errno(ret); 2652 goto out; 2653 } 2654 2655 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); 2656 if (ret) { 2657 mlog_errno(ret); 2658 goto out; 2659 } 2660 2661 if (cpos) { 2662 /* 2663 * We have a path to the left of this one - it needs 2664 * an update too. 2665 */ 2666 left_path = ocfs2_new_path(path_root_bh(path), 2667 path_root_el(path)); 2668 if (!left_path) { 2669 ret = -ENOMEM; 2670 mlog_errno(ret); 2671 goto out; 2672 } 2673 2674 ret = ocfs2_find_path(inode, left_path, cpos); 2675 if (ret) { 2676 mlog_errno(ret); 2677 goto out; 2678 } 2679 2680 ret = ocfs2_journal_access_path(inode, handle, left_path); 2681 if (ret) { 2682 mlog_errno(ret); 2683 goto out; 2684 } 2685 2686 subtree_index = ocfs2_find_subtree_root(inode, left_path, path); 2687 2688 ocfs2_unlink_subtree(inode, handle, left_path, path, 2689 subtree_index, dealloc); 2690 ocfs2_update_edge_lengths(inode, handle, left_path); 2691 2692 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2693 ocfs2_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 2694 } else { 2695 /* 2696 * 'path' is also the leftmost path which 2697 * means it must be the only one. This gets 2698 * handled differently because we want to 2699 * revert the inode back to having extents 2700 * in-line. 2701 */ 2702 ocfs2_unlink_path(inode, handle, dealloc, path, 1); 2703 2704 el = et->root_el; 2705 el->l_tree_depth = 0; 2706 el->l_next_free_rec = 0; 2707 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2708 2709 ocfs2_set_last_eb_blk(et, 0); 2710 } 2711 2712 ocfs2_journal_dirty(handle, path_root_bh(path)); 2713 2714 out: 2715 ocfs2_free_path(left_path); 2716 return ret; 2717 } 2718 2719 /* 2720 * Left rotation of btree records. 2721 * 2722 * In many ways, this is (unsurprisingly) the opposite of right 2723 * rotation. We start at some non-rightmost path containing an empty 2724 * extent in the leaf block. The code works its way to the rightmost 2725 * path by rotating records to the left in every subtree. 2726 * 2727 * This is used by any code which reduces the number of extent records 2728 * in a leaf. After removal, an empty record should be placed in the 2729 * leftmost list position. 2730 * 2731 * This won't handle a length update of the rightmost path records if 2732 * the rightmost tree leaf record is removed so the caller is 2733 * responsible for detecting and correcting that. 2734 */ 2735 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, 2736 struct ocfs2_path *path, 2737 struct ocfs2_cached_dealloc_ctxt *dealloc, 2738 struct ocfs2_extent_tree *et) 2739 { 2740 int ret, orig_credits = handle->h_buffer_credits; 2741 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; 2742 struct ocfs2_extent_block *eb; 2743 struct ocfs2_extent_list *el; 2744 2745 el = path_leaf_el(path); 2746 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2747 return 0; 2748 2749 if (path->p_tree_depth == 0) { 2750 rightmost_no_delete: 2751 /* 2752 * Inline extents. This is trivially handled, so do 2753 * it up front. 2754 */ 2755 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 2756 path_leaf_bh(path), 2757 path_leaf_el(path)); 2758 if (ret) 2759 mlog_errno(ret); 2760 goto out; 2761 } 2762 2763 /* 2764 * Handle rightmost branch now. There's several cases: 2765 * 1) simple rotation leaving records in there. That's trivial. 2766 * 2) rotation requiring a branch delete - there's no more 2767 * records left. Two cases of this: 2768 * a) There are branches to the left. 2769 * b) This is also the leftmost (the only) branch. 2770 * 2771 * 1) is handled via ocfs2_rotate_rightmost_leaf_left() 2772 * 2a) we need the left branch so that we can update it with the unlink 2773 * 2b) we need to bring the inode back to inline extents. 2774 */ 2775 2776 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 2777 el = &eb->h_list; 2778 if (eb->h_next_leaf_blk == 0) { 2779 /* 2780 * This gets a bit tricky if we're going to delete the 2781 * rightmost path. Get the other cases out of the way 2782 * 1st. 2783 */ 2784 if (le16_to_cpu(el->l_next_free_rec) > 1) 2785 goto rightmost_no_delete; 2786 2787 if (le16_to_cpu(el->l_next_free_rec) == 0) { 2788 ret = -EIO; 2789 ocfs2_error(inode->i_sb, 2790 "Inode %llu has empty extent block at %llu", 2791 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2792 (unsigned long long)le64_to_cpu(eb->h_blkno)); 2793 goto out; 2794 } 2795 2796 /* 2797 * XXX: The caller can not trust "path" any more after 2798 * this as it will have been deleted. What do we do? 2799 * 2800 * In theory the rotate-for-merge code will never get 2801 * here because it'll always ask for a rotate in a 2802 * nonempty list. 2803 */ 2804 2805 ret = ocfs2_remove_rightmost_path(inode, handle, path, 2806 dealloc, et); 2807 if (ret) 2808 mlog_errno(ret); 2809 goto out; 2810 } 2811 2812 /* 2813 * Now we can loop, remembering the path we get from -EAGAIN 2814 * and restarting from there. 2815 */ 2816 try_rotate: 2817 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, 2818 dealloc, &restart_path, et); 2819 if (ret && ret != -EAGAIN) { 2820 mlog_errno(ret); 2821 goto out; 2822 } 2823 2824 while (ret == -EAGAIN) { 2825 tmp_path = restart_path; 2826 restart_path = NULL; 2827 2828 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, 2829 tmp_path, dealloc, 2830 &restart_path, et); 2831 if (ret && ret != -EAGAIN) { 2832 mlog_errno(ret); 2833 goto out; 2834 } 2835 2836 ocfs2_free_path(tmp_path); 2837 tmp_path = NULL; 2838 2839 if (ret == 0) 2840 goto try_rotate; 2841 } 2842 2843 out: 2844 ocfs2_free_path(tmp_path); 2845 ocfs2_free_path(restart_path); 2846 return ret; 2847 } 2848 2849 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, 2850 int index) 2851 { 2852 struct ocfs2_extent_rec *rec = &el->l_recs[index]; 2853 unsigned int size; 2854 2855 if (rec->e_leaf_clusters == 0) { 2856 /* 2857 * We consumed all of the merged-from record. An empty 2858 * extent cannot exist anywhere but the 1st array 2859 * position, so move things over if the merged-from 2860 * record doesn't occupy that position. 2861 * 2862 * This creates a new empty extent so the caller 2863 * should be smart enough to have removed any existing 2864 * ones. 2865 */ 2866 if (index > 0) { 2867 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); 2868 size = index * sizeof(struct ocfs2_extent_rec); 2869 memmove(&el->l_recs[1], &el->l_recs[0], size); 2870 } 2871 2872 /* 2873 * Always memset - the caller doesn't check whether it 2874 * created an empty extent, so there could be junk in 2875 * the other fields. 2876 */ 2877 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2878 } 2879 } 2880 2881 static int ocfs2_get_right_path(struct inode *inode, 2882 struct ocfs2_path *left_path, 2883 struct ocfs2_path **ret_right_path) 2884 { 2885 int ret; 2886 u32 right_cpos; 2887 struct ocfs2_path *right_path = NULL; 2888 struct ocfs2_extent_list *left_el; 2889 2890 *ret_right_path = NULL; 2891 2892 /* This function shouldn't be called for non-trees. */ 2893 BUG_ON(left_path->p_tree_depth == 0); 2894 2895 left_el = path_leaf_el(left_path); 2896 BUG_ON(left_el->l_next_free_rec != left_el->l_count); 2897 2898 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, 2899 &right_cpos); 2900 if (ret) { 2901 mlog_errno(ret); 2902 goto out; 2903 } 2904 2905 /* This function shouldn't be called for the rightmost leaf. */ 2906 BUG_ON(right_cpos == 0); 2907 2908 right_path = ocfs2_new_path(path_root_bh(left_path), 2909 path_root_el(left_path)); 2910 if (!right_path) { 2911 ret = -ENOMEM; 2912 mlog_errno(ret); 2913 goto out; 2914 } 2915 2916 ret = ocfs2_find_path(inode, right_path, right_cpos); 2917 if (ret) { 2918 mlog_errno(ret); 2919 goto out; 2920 } 2921 2922 *ret_right_path = right_path; 2923 out: 2924 if (ret) 2925 ocfs2_free_path(right_path); 2926 return ret; 2927 } 2928 2929 /* 2930 * Remove split_rec clusters from the record at index and merge them 2931 * onto the beginning of the record "next" to it. 2932 * For index < l_count - 1, the next means the extent rec at index + 1. 2933 * For index == l_count - 1, the "next" means the 1st extent rec of the 2934 * next extent block. 2935 */ 2936 static int ocfs2_merge_rec_right(struct inode *inode, 2937 struct ocfs2_path *left_path, 2938 handle_t *handle, 2939 struct ocfs2_extent_rec *split_rec, 2940 int index) 2941 { 2942 int ret, next_free, i; 2943 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 2944 struct ocfs2_extent_rec *left_rec; 2945 struct ocfs2_extent_rec *right_rec; 2946 struct ocfs2_extent_list *right_el; 2947 struct ocfs2_path *right_path = NULL; 2948 int subtree_index = 0; 2949 struct ocfs2_extent_list *el = path_leaf_el(left_path); 2950 struct buffer_head *bh = path_leaf_bh(left_path); 2951 struct buffer_head *root_bh = NULL; 2952 2953 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); 2954 left_rec = &el->l_recs[index]; 2955 2956 if (index == le16_to_cpu(el->l_next_free_rec) - 1 && 2957 le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { 2958 /* we meet with a cross extent block merge. */ 2959 ret = ocfs2_get_right_path(inode, left_path, &right_path); 2960 if (ret) { 2961 mlog_errno(ret); 2962 goto out; 2963 } 2964 2965 right_el = path_leaf_el(right_path); 2966 next_free = le16_to_cpu(right_el->l_next_free_rec); 2967 BUG_ON(next_free <= 0); 2968 right_rec = &right_el->l_recs[0]; 2969 if (ocfs2_is_empty_extent(right_rec)) { 2970 BUG_ON(next_free <= 1); 2971 right_rec = &right_el->l_recs[1]; 2972 } 2973 2974 BUG_ON(le32_to_cpu(left_rec->e_cpos) + 2975 le16_to_cpu(left_rec->e_leaf_clusters) != 2976 le32_to_cpu(right_rec->e_cpos)); 2977 2978 subtree_index = ocfs2_find_subtree_root(inode, 2979 left_path, right_path); 2980 2981 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 2982 handle->h_buffer_credits, 2983 right_path); 2984 if (ret) { 2985 mlog_errno(ret); 2986 goto out; 2987 } 2988 2989 root_bh = left_path->p_node[subtree_index].bh; 2990 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 2991 2992 ret = ocfs2_journal_access(handle, inode, root_bh, 2993 OCFS2_JOURNAL_ACCESS_WRITE); 2994 if (ret) { 2995 mlog_errno(ret); 2996 goto out; 2997 } 2998 2999 for (i = subtree_index + 1; 3000 i < path_num_items(right_path); i++) { 3001 ret = ocfs2_journal_access(handle, inode, 3002 right_path->p_node[i].bh, 3003 OCFS2_JOURNAL_ACCESS_WRITE); 3004 if (ret) { 3005 mlog_errno(ret); 3006 goto out; 3007 } 3008 3009 ret = ocfs2_journal_access(handle, inode, 3010 left_path->p_node[i].bh, 3011 OCFS2_JOURNAL_ACCESS_WRITE); 3012 if (ret) { 3013 mlog_errno(ret); 3014 goto out; 3015 } 3016 } 3017 3018 } else { 3019 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1); 3020 right_rec = &el->l_recs[index + 1]; 3021 } 3022 3023 ret = ocfs2_journal_access(handle, inode, bh, 3024 OCFS2_JOURNAL_ACCESS_WRITE); 3025 if (ret) { 3026 mlog_errno(ret); 3027 goto out; 3028 } 3029 3030 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters); 3031 3032 le32_add_cpu(&right_rec->e_cpos, -split_clusters); 3033 le64_add_cpu(&right_rec->e_blkno, 3034 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); 3035 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); 3036 3037 ocfs2_cleanup_merge(el, index); 3038 3039 ret = ocfs2_journal_dirty(handle, bh); 3040 if (ret) 3041 mlog_errno(ret); 3042 3043 if (right_path) { 3044 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3045 if (ret) 3046 mlog_errno(ret); 3047 3048 ocfs2_complete_edge_insert(inode, handle, left_path, 3049 right_path, subtree_index); 3050 } 3051 out: 3052 if (right_path) 3053 ocfs2_free_path(right_path); 3054 return ret; 3055 } 3056 3057 static int ocfs2_get_left_path(struct inode *inode, 3058 struct ocfs2_path *right_path, 3059 struct ocfs2_path **ret_left_path) 3060 { 3061 int ret; 3062 u32 left_cpos; 3063 struct ocfs2_path *left_path = NULL; 3064 3065 *ret_left_path = NULL; 3066 3067 /* This function shouldn't be called for non-trees. */ 3068 BUG_ON(right_path->p_tree_depth == 0); 3069 3070 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 3071 right_path, &left_cpos); 3072 if (ret) { 3073 mlog_errno(ret); 3074 goto out; 3075 } 3076 3077 /* This function shouldn't be called for the leftmost leaf. */ 3078 BUG_ON(left_cpos == 0); 3079 3080 left_path = ocfs2_new_path(path_root_bh(right_path), 3081 path_root_el(right_path)); 3082 if (!left_path) { 3083 ret = -ENOMEM; 3084 mlog_errno(ret); 3085 goto out; 3086 } 3087 3088 ret = ocfs2_find_path(inode, left_path, left_cpos); 3089 if (ret) { 3090 mlog_errno(ret); 3091 goto out; 3092 } 3093 3094 *ret_left_path = left_path; 3095 out: 3096 if (ret) 3097 ocfs2_free_path(left_path); 3098 return ret; 3099 } 3100 3101 /* 3102 * Remove split_rec clusters from the record at index and merge them 3103 * onto the tail of the record "before" it. 3104 * For index > 0, the "before" means the extent rec at index - 1. 3105 * 3106 * For index == 0, the "before" means the last record of the previous 3107 * extent block. And there is also a situation that we may need to 3108 * remove the rightmost leaf extent block in the right_path and change 3109 * the right path to indicate the new rightmost path. 3110 */ 3111 static int ocfs2_merge_rec_left(struct inode *inode, 3112 struct ocfs2_path *right_path, 3113 handle_t *handle, 3114 struct ocfs2_extent_rec *split_rec, 3115 struct ocfs2_cached_dealloc_ctxt *dealloc, 3116 struct ocfs2_extent_tree *et, 3117 int index) 3118 { 3119 int ret, i, subtree_index = 0, has_empty_extent = 0; 3120 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 3121 struct ocfs2_extent_rec *left_rec; 3122 struct ocfs2_extent_rec *right_rec; 3123 struct ocfs2_extent_list *el = path_leaf_el(right_path); 3124 struct buffer_head *bh = path_leaf_bh(right_path); 3125 struct buffer_head *root_bh = NULL; 3126 struct ocfs2_path *left_path = NULL; 3127 struct ocfs2_extent_list *left_el; 3128 3129 BUG_ON(index < 0); 3130 3131 right_rec = &el->l_recs[index]; 3132 if (index == 0) { 3133 /* we meet with a cross extent block merge. */ 3134 ret = ocfs2_get_left_path(inode, right_path, &left_path); 3135 if (ret) { 3136 mlog_errno(ret); 3137 goto out; 3138 } 3139 3140 left_el = path_leaf_el(left_path); 3141 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) != 3142 le16_to_cpu(left_el->l_count)); 3143 3144 left_rec = &left_el->l_recs[ 3145 le16_to_cpu(left_el->l_next_free_rec) - 1]; 3146 BUG_ON(le32_to_cpu(left_rec->e_cpos) + 3147 le16_to_cpu(left_rec->e_leaf_clusters) != 3148 le32_to_cpu(split_rec->e_cpos)); 3149 3150 subtree_index = ocfs2_find_subtree_root(inode, 3151 left_path, right_path); 3152 3153 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 3154 handle->h_buffer_credits, 3155 left_path); 3156 if (ret) { 3157 mlog_errno(ret); 3158 goto out; 3159 } 3160 3161 root_bh = left_path->p_node[subtree_index].bh; 3162 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3163 3164 ret = ocfs2_journal_access(handle, inode, root_bh, 3165 OCFS2_JOURNAL_ACCESS_WRITE); 3166 if (ret) { 3167 mlog_errno(ret); 3168 goto out; 3169 } 3170 3171 for (i = subtree_index + 1; 3172 i < path_num_items(right_path); i++) { 3173 ret = ocfs2_journal_access(handle, inode, 3174 right_path->p_node[i].bh, 3175 OCFS2_JOURNAL_ACCESS_WRITE); 3176 if (ret) { 3177 mlog_errno(ret); 3178 goto out; 3179 } 3180 3181 ret = ocfs2_journal_access(handle, inode, 3182 left_path->p_node[i].bh, 3183 OCFS2_JOURNAL_ACCESS_WRITE); 3184 if (ret) { 3185 mlog_errno(ret); 3186 goto out; 3187 } 3188 } 3189 } else { 3190 left_rec = &el->l_recs[index - 1]; 3191 if (ocfs2_is_empty_extent(&el->l_recs[0])) 3192 has_empty_extent = 1; 3193 } 3194 3195 ret = ocfs2_journal_access(handle, inode, bh, 3196 OCFS2_JOURNAL_ACCESS_WRITE); 3197 if (ret) { 3198 mlog_errno(ret); 3199 goto out; 3200 } 3201 3202 if (has_empty_extent && index == 1) { 3203 /* 3204 * The easy case - we can just plop the record right in. 3205 */ 3206 *left_rec = *split_rec; 3207 3208 has_empty_extent = 0; 3209 } else 3210 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); 3211 3212 le32_add_cpu(&right_rec->e_cpos, split_clusters); 3213 le64_add_cpu(&right_rec->e_blkno, 3214 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); 3215 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); 3216 3217 ocfs2_cleanup_merge(el, index); 3218 3219 ret = ocfs2_journal_dirty(handle, bh); 3220 if (ret) 3221 mlog_errno(ret); 3222 3223 if (left_path) { 3224 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3225 if (ret) 3226 mlog_errno(ret); 3227 3228 /* 3229 * In the situation that the right_rec is empty and the extent 3230 * block is empty also, ocfs2_complete_edge_insert can't handle 3231 * it and we need to delete the right extent block. 3232 */ 3233 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && 3234 le16_to_cpu(el->l_next_free_rec) == 1) { 3235 3236 ret = ocfs2_remove_rightmost_path(inode, handle, 3237 right_path, 3238 dealloc, et); 3239 if (ret) { 3240 mlog_errno(ret); 3241 goto out; 3242 } 3243 3244 /* Now the rightmost extent block has been deleted. 3245 * So we use the new rightmost path. 3246 */ 3247 ocfs2_mv_path(right_path, left_path); 3248 left_path = NULL; 3249 } else 3250 ocfs2_complete_edge_insert(inode, handle, left_path, 3251 right_path, subtree_index); 3252 } 3253 out: 3254 if (left_path) 3255 ocfs2_free_path(left_path); 3256 return ret; 3257 } 3258 3259 static int ocfs2_try_to_merge_extent(struct inode *inode, 3260 handle_t *handle, 3261 struct ocfs2_path *path, 3262 int split_index, 3263 struct ocfs2_extent_rec *split_rec, 3264 struct ocfs2_cached_dealloc_ctxt *dealloc, 3265 struct ocfs2_merge_ctxt *ctxt, 3266 struct ocfs2_extent_tree *et) 3267 3268 { 3269 int ret = 0; 3270 struct ocfs2_extent_list *el = path_leaf_el(path); 3271 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 3272 3273 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3274 3275 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { 3276 /* 3277 * The merge code will need to create an empty 3278 * extent to take the place of the newly 3279 * emptied slot. Remove any pre-existing empty 3280 * extents - having more than one in a leaf is 3281 * illegal. 3282 */ 3283 ret = ocfs2_rotate_tree_left(inode, handle, path, 3284 dealloc, et); 3285 if (ret) { 3286 mlog_errno(ret); 3287 goto out; 3288 } 3289 split_index--; 3290 rec = &el->l_recs[split_index]; 3291 } 3292 3293 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) { 3294 /* 3295 * Left-right contig implies this. 3296 */ 3297 BUG_ON(!ctxt->c_split_covers_rec); 3298 3299 /* 3300 * Since the leftright insert always covers the entire 3301 * extent, this call will delete the insert record 3302 * entirely, resulting in an empty extent record added to 3303 * the extent block. 3304 * 3305 * Since the adding of an empty extent shifts 3306 * everything back to the right, there's no need to 3307 * update split_index here. 3308 * 3309 * When the split_index is zero, we need to merge it to the 3310 * prevoius extent block. It is more efficient and easier 3311 * if we do merge_right first and merge_left later. 3312 */ 3313 ret = ocfs2_merge_rec_right(inode, path, 3314 handle, split_rec, 3315 split_index); 3316 if (ret) { 3317 mlog_errno(ret); 3318 goto out; 3319 } 3320 3321 /* 3322 * We can only get this from logic error above. 3323 */ 3324 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3325 3326 /* The merge left us with an empty extent, remove it. */ 3327 ret = ocfs2_rotate_tree_left(inode, handle, path, 3328 dealloc, et); 3329 if (ret) { 3330 mlog_errno(ret); 3331 goto out; 3332 } 3333 3334 rec = &el->l_recs[split_index]; 3335 3336 /* 3337 * Note that we don't pass split_rec here on purpose - 3338 * we've merged it into the rec already. 3339 */ 3340 ret = ocfs2_merge_rec_left(inode, path, 3341 handle, rec, 3342 dealloc, et, 3343 split_index); 3344 3345 if (ret) { 3346 mlog_errno(ret); 3347 goto out; 3348 } 3349 3350 ret = ocfs2_rotate_tree_left(inode, handle, path, 3351 dealloc, et); 3352 /* 3353 * Error from this last rotate is not critical, so 3354 * print but don't bubble it up. 3355 */ 3356 if (ret) 3357 mlog_errno(ret); 3358 ret = 0; 3359 } else { 3360 /* 3361 * Merge a record to the left or right. 3362 * 3363 * 'contig_type' is relative to the existing record, 3364 * so for example, if we're "right contig", it's to 3365 * the record on the left (hence the left merge). 3366 */ 3367 if (ctxt->c_contig_type == CONTIG_RIGHT) { 3368 ret = ocfs2_merge_rec_left(inode, 3369 path, 3370 handle, split_rec, 3371 dealloc, et, 3372 split_index); 3373 if (ret) { 3374 mlog_errno(ret); 3375 goto out; 3376 } 3377 } else { 3378 ret = ocfs2_merge_rec_right(inode, 3379 path, 3380 handle, split_rec, 3381 split_index); 3382 if (ret) { 3383 mlog_errno(ret); 3384 goto out; 3385 } 3386 } 3387 3388 if (ctxt->c_split_covers_rec) { 3389 /* 3390 * The merge may have left an empty extent in 3391 * our leaf. Try to rotate it away. 3392 */ 3393 ret = ocfs2_rotate_tree_left(inode, handle, path, 3394 dealloc, et); 3395 if (ret) 3396 mlog_errno(ret); 3397 ret = 0; 3398 } 3399 } 3400 3401 out: 3402 return ret; 3403 } 3404 3405 static void ocfs2_subtract_from_rec(struct super_block *sb, 3406 enum ocfs2_split_type split, 3407 struct ocfs2_extent_rec *rec, 3408 struct ocfs2_extent_rec *split_rec) 3409 { 3410 u64 len_blocks; 3411 3412 len_blocks = ocfs2_clusters_to_blocks(sb, 3413 le16_to_cpu(split_rec->e_leaf_clusters)); 3414 3415 if (split == SPLIT_LEFT) { 3416 /* 3417 * Region is on the left edge of the existing 3418 * record. 3419 */ 3420 le32_add_cpu(&rec->e_cpos, 3421 le16_to_cpu(split_rec->e_leaf_clusters)); 3422 le64_add_cpu(&rec->e_blkno, len_blocks); 3423 le16_add_cpu(&rec->e_leaf_clusters, 3424 -le16_to_cpu(split_rec->e_leaf_clusters)); 3425 } else { 3426 /* 3427 * Region is on the right edge of the existing 3428 * record. 3429 */ 3430 le16_add_cpu(&rec->e_leaf_clusters, 3431 -le16_to_cpu(split_rec->e_leaf_clusters)); 3432 } 3433 } 3434 3435 /* 3436 * Do the final bits of extent record insertion at the target leaf 3437 * list. If this leaf is part of an allocation tree, it is assumed 3438 * that the tree above has been prepared. 3439 */ 3440 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, 3441 struct ocfs2_extent_list *el, 3442 struct ocfs2_insert_type *insert, 3443 struct inode *inode) 3444 { 3445 int i = insert->ins_contig_index; 3446 unsigned int range; 3447 struct ocfs2_extent_rec *rec; 3448 3449 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 3450 3451 if (insert->ins_split != SPLIT_NONE) { 3452 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); 3453 BUG_ON(i == -1); 3454 rec = &el->l_recs[i]; 3455 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec, 3456 insert_rec); 3457 goto rotate; 3458 } 3459 3460 /* 3461 * Contiguous insert - either left or right. 3462 */ 3463 if (insert->ins_contig != CONTIG_NONE) { 3464 rec = &el->l_recs[i]; 3465 if (insert->ins_contig == CONTIG_LEFT) { 3466 rec->e_blkno = insert_rec->e_blkno; 3467 rec->e_cpos = insert_rec->e_cpos; 3468 } 3469 le16_add_cpu(&rec->e_leaf_clusters, 3470 le16_to_cpu(insert_rec->e_leaf_clusters)); 3471 return; 3472 } 3473 3474 /* 3475 * Handle insert into an empty leaf. 3476 */ 3477 if (le16_to_cpu(el->l_next_free_rec) == 0 || 3478 ((le16_to_cpu(el->l_next_free_rec) == 1) && 3479 ocfs2_is_empty_extent(&el->l_recs[0]))) { 3480 el->l_recs[0] = *insert_rec; 3481 el->l_next_free_rec = cpu_to_le16(1); 3482 return; 3483 } 3484 3485 /* 3486 * Appending insert. 3487 */ 3488 if (insert->ins_appending == APPEND_TAIL) { 3489 i = le16_to_cpu(el->l_next_free_rec) - 1; 3490 rec = &el->l_recs[i]; 3491 range = le32_to_cpu(rec->e_cpos) 3492 + le16_to_cpu(rec->e_leaf_clusters); 3493 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); 3494 3495 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= 3496 le16_to_cpu(el->l_count), 3497 "inode %lu, depth %u, count %u, next free %u, " 3498 "rec.cpos %u, rec.clusters %u, " 3499 "insert.cpos %u, insert.clusters %u\n", 3500 inode->i_ino, 3501 le16_to_cpu(el->l_tree_depth), 3502 le16_to_cpu(el->l_count), 3503 le16_to_cpu(el->l_next_free_rec), 3504 le32_to_cpu(el->l_recs[i].e_cpos), 3505 le16_to_cpu(el->l_recs[i].e_leaf_clusters), 3506 le32_to_cpu(insert_rec->e_cpos), 3507 le16_to_cpu(insert_rec->e_leaf_clusters)); 3508 i++; 3509 el->l_recs[i] = *insert_rec; 3510 le16_add_cpu(&el->l_next_free_rec, 1); 3511 return; 3512 } 3513 3514 rotate: 3515 /* 3516 * Ok, we have to rotate. 3517 * 3518 * At this point, it is safe to assume that inserting into an 3519 * empty leaf and appending to a leaf have both been handled 3520 * above. 3521 * 3522 * This leaf needs to have space, either by the empty 1st 3523 * extent record, or by virtue of an l_next_rec < l_count. 3524 */ 3525 ocfs2_rotate_leaf(el, insert_rec); 3526 } 3527 3528 static void ocfs2_adjust_rightmost_records(struct inode *inode, 3529 handle_t *handle, 3530 struct ocfs2_path *path, 3531 struct ocfs2_extent_rec *insert_rec) 3532 { 3533 int ret, i, next_free; 3534 struct buffer_head *bh; 3535 struct ocfs2_extent_list *el; 3536 struct ocfs2_extent_rec *rec; 3537 3538 /* 3539 * Update everything except the leaf block. 3540 */ 3541 for (i = 0; i < path->p_tree_depth; i++) { 3542 bh = path->p_node[i].bh; 3543 el = path->p_node[i].el; 3544 3545 next_free = le16_to_cpu(el->l_next_free_rec); 3546 if (next_free == 0) { 3547 ocfs2_error(inode->i_sb, 3548 "Dinode %llu has a bad extent list", 3549 (unsigned long long)OCFS2_I(inode)->ip_blkno); 3550 ret = -EIO; 3551 return; 3552 } 3553 3554 rec = &el->l_recs[next_free - 1]; 3555 3556 rec->e_int_clusters = insert_rec->e_cpos; 3557 le32_add_cpu(&rec->e_int_clusters, 3558 le16_to_cpu(insert_rec->e_leaf_clusters)); 3559 le32_add_cpu(&rec->e_int_clusters, 3560 -le32_to_cpu(rec->e_cpos)); 3561 3562 ret = ocfs2_journal_dirty(handle, bh); 3563 if (ret) 3564 mlog_errno(ret); 3565 3566 } 3567 } 3568 3569 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 3570 struct ocfs2_extent_rec *insert_rec, 3571 struct ocfs2_path *right_path, 3572 struct ocfs2_path **ret_left_path) 3573 { 3574 int ret, next_free; 3575 struct ocfs2_extent_list *el; 3576 struct ocfs2_path *left_path = NULL; 3577 3578 *ret_left_path = NULL; 3579 3580 /* 3581 * This shouldn't happen for non-trees. The extent rec cluster 3582 * count manipulation below only works for interior nodes. 3583 */ 3584 BUG_ON(right_path->p_tree_depth == 0); 3585 3586 /* 3587 * If our appending insert is at the leftmost edge of a leaf, 3588 * then we might need to update the rightmost records of the 3589 * neighboring path. 3590 */ 3591 el = path_leaf_el(right_path); 3592 next_free = le16_to_cpu(el->l_next_free_rec); 3593 if (next_free == 0 || 3594 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { 3595 u32 left_cpos; 3596 3597 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 3598 &left_cpos); 3599 if (ret) { 3600 mlog_errno(ret); 3601 goto out; 3602 } 3603 3604 mlog(0, "Append may need a left path update. cpos: %u, " 3605 "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), 3606 left_cpos); 3607 3608 /* 3609 * No need to worry if the append is already in the 3610 * leftmost leaf. 3611 */ 3612 if (left_cpos) { 3613 left_path = ocfs2_new_path(path_root_bh(right_path), 3614 path_root_el(right_path)); 3615 if (!left_path) { 3616 ret = -ENOMEM; 3617 mlog_errno(ret); 3618 goto out; 3619 } 3620 3621 ret = ocfs2_find_path(inode, left_path, left_cpos); 3622 if (ret) { 3623 mlog_errno(ret); 3624 goto out; 3625 } 3626 3627 /* 3628 * ocfs2_insert_path() will pass the left_path to the 3629 * journal for us. 3630 */ 3631 } 3632 } 3633 3634 ret = ocfs2_journal_access_path(inode, handle, right_path); 3635 if (ret) { 3636 mlog_errno(ret); 3637 goto out; 3638 } 3639 3640 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec); 3641 3642 *ret_left_path = left_path; 3643 ret = 0; 3644 out: 3645 if (ret != 0) 3646 ocfs2_free_path(left_path); 3647 3648 return ret; 3649 } 3650 3651 static void ocfs2_split_record(struct inode *inode, 3652 struct ocfs2_path *left_path, 3653 struct ocfs2_path *right_path, 3654 struct ocfs2_extent_rec *split_rec, 3655 enum ocfs2_split_type split) 3656 { 3657 int index; 3658 u32 cpos = le32_to_cpu(split_rec->e_cpos); 3659 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; 3660 struct ocfs2_extent_rec *rec, *tmprec; 3661 3662 right_el = path_leaf_el(right_path);; 3663 if (left_path) 3664 left_el = path_leaf_el(left_path); 3665 3666 el = right_el; 3667 insert_el = right_el; 3668 index = ocfs2_search_extent_list(el, cpos); 3669 if (index != -1) { 3670 if (index == 0 && left_path) { 3671 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); 3672 3673 /* 3674 * This typically means that the record 3675 * started in the left path but moved to the 3676 * right as a result of rotation. We either 3677 * move the existing record to the left, or we 3678 * do the later insert there. 3679 * 3680 * In this case, the left path should always 3681 * exist as the rotate code will have passed 3682 * it back for a post-insert update. 3683 */ 3684 3685 if (split == SPLIT_LEFT) { 3686 /* 3687 * It's a left split. Since we know 3688 * that the rotate code gave us an 3689 * empty extent in the left path, we 3690 * can just do the insert there. 3691 */ 3692 insert_el = left_el; 3693 } else { 3694 /* 3695 * Right split - we have to move the 3696 * existing record over to the left 3697 * leaf. The insert will be into the 3698 * newly created empty extent in the 3699 * right leaf. 3700 */ 3701 tmprec = &right_el->l_recs[index]; 3702 ocfs2_rotate_leaf(left_el, tmprec); 3703 el = left_el; 3704 3705 memset(tmprec, 0, sizeof(*tmprec)); 3706 index = ocfs2_search_extent_list(left_el, cpos); 3707 BUG_ON(index == -1); 3708 } 3709 } 3710 } else { 3711 BUG_ON(!left_path); 3712 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0])); 3713 /* 3714 * Left path is easy - we can just allow the insert to 3715 * happen. 3716 */ 3717 el = left_el; 3718 insert_el = left_el; 3719 index = ocfs2_search_extent_list(el, cpos); 3720 BUG_ON(index == -1); 3721 } 3722 3723 rec = &el->l_recs[index]; 3724 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec); 3725 ocfs2_rotate_leaf(insert_el, split_rec); 3726 } 3727 3728 /* 3729 * This function only does inserts on an allocation b-tree. For tree 3730 * depth = 0, ocfs2_insert_at_leaf() is called directly. 3731 * 3732 * right_path is the path we want to do the actual insert 3733 * in. left_path should only be passed in if we need to update that 3734 * portion of the tree after an edge insert. 3735 */ 3736 static int ocfs2_insert_path(struct inode *inode, 3737 handle_t *handle, 3738 struct ocfs2_path *left_path, 3739 struct ocfs2_path *right_path, 3740 struct ocfs2_extent_rec *insert_rec, 3741 struct ocfs2_insert_type *insert) 3742 { 3743 int ret, subtree_index; 3744 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 3745 3746 if (left_path) { 3747 int credits = handle->h_buffer_credits; 3748 3749 /* 3750 * There's a chance that left_path got passed back to 3751 * us without being accounted for in the 3752 * journal. Extend our transaction here to be sure we 3753 * can change those blocks. 3754 */ 3755 credits += left_path->p_tree_depth; 3756 3757 ret = ocfs2_extend_trans(handle, credits); 3758 if (ret < 0) { 3759 mlog_errno(ret); 3760 goto out; 3761 } 3762 3763 ret = ocfs2_journal_access_path(inode, handle, left_path); 3764 if (ret < 0) { 3765 mlog_errno(ret); 3766 goto out; 3767 } 3768 } 3769 3770 /* 3771 * Pass both paths to the journal. The majority of inserts 3772 * will be touching all components anyway. 3773 */ 3774 ret = ocfs2_journal_access_path(inode, handle, right_path); 3775 if (ret < 0) { 3776 mlog_errno(ret); 3777 goto out; 3778 } 3779 3780 if (insert->ins_split != SPLIT_NONE) { 3781 /* 3782 * We could call ocfs2_insert_at_leaf() for some types 3783 * of splits, but it's easier to just let one separate 3784 * function sort it all out. 3785 */ 3786 ocfs2_split_record(inode, left_path, right_path, 3787 insert_rec, insert->ins_split); 3788 3789 /* 3790 * Split might have modified either leaf and we don't 3791 * have a guarantee that the later edge insert will 3792 * dirty this for us. 3793 */ 3794 if (left_path) 3795 ret = ocfs2_journal_dirty(handle, 3796 path_leaf_bh(left_path)); 3797 if (ret) 3798 mlog_errno(ret); 3799 } else 3800 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), 3801 insert, inode); 3802 3803 ret = ocfs2_journal_dirty(handle, leaf_bh); 3804 if (ret) 3805 mlog_errno(ret); 3806 3807 if (left_path) { 3808 /* 3809 * The rotate code has indicated that we need to fix 3810 * up portions of the tree after the insert. 3811 * 3812 * XXX: Should we extend the transaction here? 3813 */ 3814 subtree_index = ocfs2_find_subtree_root(inode, left_path, 3815 right_path); 3816 ocfs2_complete_edge_insert(inode, handle, left_path, 3817 right_path, subtree_index); 3818 } 3819 3820 ret = 0; 3821 out: 3822 return ret; 3823 } 3824 3825 static int ocfs2_do_insert_extent(struct inode *inode, 3826 handle_t *handle, 3827 struct ocfs2_extent_tree *et, 3828 struct ocfs2_extent_rec *insert_rec, 3829 struct ocfs2_insert_type *type) 3830 { 3831 int ret, rotate = 0; 3832 u32 cpos; 3833 struct ocfs2_path *right_path = NULL; 3834 struct ocfs2_path *left_path = NULL; 3835 struct ocfs2_extent_list *el; 3836 3837 el = et->root_el; 3838 3839 ret = ocfs2_journal_access(handle, inode, et->root_bh, 3840 OCFS2_JOURNAL_ACCESS_WRITE); 3841 if (ret) { 3842 mlog_errno(ret); 3843 goto out; 3844 } 3845 3846 if (le16_to_cpu(el->l_tree_depth) == 0) { 3847 ocfs2_insert_at_leaf(insert_rec, el, type, inode); 3848 goto out_update_clusters; 3849 } 3850 3851 right_path = ocfs2_new_path(et->root_bh, et->root_el); 3852 if (!right_path) { 3853 ret = -ENOMEM; 3854 mlog_errno(ret); 3855 goto out; 3856 } 3857 3858 /* 3859 * Determine the path to start with. Rotations need the 3860 * rightmost path, everything else can go directly to the 3861 * target leaf. 3862 */ 3863 cpos = le32_to_cpu(insert_rec->e_cpos); 3864 if (type->ins_appending == APPEND_NONE && 3865 type->ins_contig == CONTIG_NONE) { 3866 rotate = 1; 3867 cpos = UINT_MAX; 3868 } 3869 3870 ret = ocfs2_find_path(inode, right_path, cpos); 3871 if (ret) { 3872 mlog_errno(ret); 3873 goto out; 3874 } 3875 3876 /* 3877 * Rotations and appends need special treatment - they modify 3878 * parts of the tree's above them. 3879 * 3880 * Both might pass back a path immediate to the left of the 3881 * one being inserted to. This will be cause 3882 * ocfs2_insert_path() to modify the rightmost records of 3883 * left_path to account for an edge insert. 3884 * 3885 * XXX: When modifying this code, keep in mind that an insert 3886 * can wind up skipping both of these two special cases... 3887 */ 3888 if (rotate) { 3889 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split, 3890 le32_to_cpu(insert_rec->e_cpos), 3891 right_path, &left_path); 3892 if (ret) { 3893 mlog_errno(ret); 3894 goto out; 3895 } 3896 3897 /* 3898 * ocfs2_rotate_tree_right() might have extended the 3899 * transaction without re-journaling our tree root. 3900 */ 3901 ret = ocfs2_journal_access(handle, inode, et->root_bh, 3902 OCFS2_JOURNAL_ACCESS_WRITE); 3903 if (ret) { 3904 mlog_errno(ret); 3905 goto out; 3906 } 3907 } else if (type->ins_appending == APPEND_TAIL 3908 && type->ins_contig != CONTIG_LEFT) { 3909 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, 3910 right_path, &left_path); 3911 if (ret) { 3912 mlog_errno(ret); 3913 goto out; 3914 } 3915 } 3916 3917 ret = ocfs2_insert_path(inode, handle, left_path, right_path, 3918 insert_rec, type); 3919 if (ret) { 3920 mlog_errno(ret); 3921 goto out; 3922 } 3923 3924 out_update_clusters: 3925 if (type->ins_split == SPLIT_NONE) 3926 ocfs2_update_clusters(inode, et, 3927 le16_to_cpu(insert_rec->e_leaf_clusters)); 3928 3929 ret = ocfs2_journal_dirty(handle, et->root_bh); 3930 if (ret) 3931 mlog_errno(ret); 3932 3933 out: 3934 ocfs2_free_path(left_path); 3935 ocfs2_free_path(right_path); 3936 3937 return ret; 3938 } 3939 3940 static enum ocfs2_contig_type 3941 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, 3942 struct ocfs2_extent_list *el, int index, 3943 struct ocfs2_extent_rec *split_rec) 3944 { 3945 int status; 3946 enum ocfs2_contig_type ret = CONTIG_NONE; 3947 u32 left_cpos, right_cpos; 3948 struct ocfs2_extent_rec *rec = NULL; 3949 struct ocfs2_extent_list *new_el; 3950 struct ocfs2_path *left_path = NULL, *right_path = NULL; 3951 struct buffer_head *bh; 3952 struct ocfs2_extent_block *eb; 3953 3954 if (index > 0) { 3955 rec = &el->l_recs[index - 1]; 3956 } else if (path->p_tree_depth > 0) { 3957 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 3958 path, &left_cpos); 3959 if (status) 3960 goto out; 3961 3962 if (left_cpos != 0) { 3963 left_path = ocfs2_new_path(path_root_bh(path), 3964 path_root_el(path)); 3965 if (!left_path) 3966 goto out; 3967 3968 status = ocfs2_find_path(inode, left_path, left_cpos); 3969 if (status) 3970 goto out; 3971 3972 new_el = path_leaf_el(left_path); 3973 3974 if (le16_to_cpu(new_el->l_next_free_rec) != 3975 le16_to_cpu(new_el->l_count)) { 3976 bh = path_leaf_bh(left_path); 3977 eb = (struct ocfs2_extent_block *)bh->b_data; 3978 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 3979 eb); 3980 goto out; 3981 } 3982 rec = &new_el->l_recs[ 3983 le16_to_cpu(new_el->l_next_free_rec) - 1]; 3984 } 3985 } 3986 3987 /* 3988 * We're careful to check for an empty extent record here - 3989 * the merge code will know what to do if it sees one. 3990 */ 3991 if (rec) { 3992 if (index == 1 && ocfs2_is_empty_extent(rec)) { 3993 if (split_rec->e_cpos == el->l_recs[index].e_cpos) 3994 ret = CONTIG_RIGHT; 3995 } else { 3996 ret = ocfs2_extent_contig(inode, rec, split_rec); 3997 } 3998 } 3999 4000 rec = NULL; 4001 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) 4002 rec = &el->l_recs[index + 1]; 4003 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && 4004 path->p_tree_depth > 0) { 4005 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb, 4006 path, &right_cpos); 4007 if (status) 4008 goto out; 4009 4010 if (right_cpos == 0) 4011 goto out; 4012 4013 right_path = ocfs2_new_path(path_root_bh(path), 4014 path_root_el(path)); 4015 if (!right_path) 4016 goto out; 4017 4018 status = ocfs2_find_path(inode, right_path, right_cpos); 4019 if (status) 4020 goto out; 4021 4022 new_el = path_leaf_el(right_path); 4023 rec = &new_el->l_recs[0]; 4024 if (ocfs2_is_empty_extent(rec)) { 4025 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { 4026 bh = path_leaf_bh(right_path); 4027 eb = (struct ocfs2_extent_block *)bh->b_data; 4028 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 4029 eb); 4030 goto out; 4031 } 4032 rec = &new_el->l_recs[1]; 4033 } 4034 } 4035 4036 if (rec) { 4037 enum ocfs2_contig_type contig_type; 4038 4039 contig_type = ocfs2_extent_contig(inode, rec, split_rec); 4040 4041 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) 4042 ret = CONTIG_LEFTRIGHT; 4043 else if (ret == CONTIG_NONE) 4044 ret = contig_type; 4045 } 4046 4047 out: 4048 if (left_path) 4049 ocfs2_free_path(left_path); 4050 if (right_path) 4051 ocfs2_free_path(right_path); 4052 4053 return ret; 4054 } 4055 4056 static void ocfs2_figure_contig_type(struct inode *inode, 4057 struct ocfs2_insert_type *insert, 4058 struct ocfs2_extent_list *el, 4059 struct ocfs2_extent_rec *insert_rec) 4060 { 4061 int i; 4062 enum ocfs2_contig_type contig_type = CONTIG_NONE; 4063 4064 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 4065 4066 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 4067 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], 4068 insert_rec); 4069 if (contig_type != CONTIG_NONE) { 4070 insert->ins_contig_index = i; 4071 break; 4072 } 4073 } 4074 insert->ins_contig = contig_type; 4075 } 4076 4077 /* 4078 * This should only be called against the righmost leaf extent list. 4079 * 4080 * ocfs2_figure_appending_type() will figure out whether we'll have to 4081 * insert at the tail of the rightmost leaf. 4082 * 4083 * This should also work against the root extent list for tree's with 0 4084 * depth. If we consider the root extent list to be the rightmost leaf node 4085 * then the logic here makes sense. 4086 */ 4087 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, 4088 struct ocfs2_extent_list *el, 4089 struct ocfs2_extent_rec *insert_rec) 4090 { 4091 int i; 4092 u32 cpos = le32_to_cpu(insert_rec->e_cpos); 4093 struct ocfs2_extent_rec *rec; 4094 4095 insert->ins_appending = APPEND_NONE; 4096 4097 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 4098 4099 if (!el->l_next_free_rec) 4100 goto set_tail_append; 4101 4102 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 4103 /* Were all records empty? */ 4104 if (le16_to_cpu(el->l_next_free_rec) == 1) 4105 goto set_tail_append; 4106 } 4107 4108 i = le16_to_cpu(el->l_next_free_rec) - 1; 4109 rec = &el->l_recs[i]; 4110 4111 if (cpos >= 4112 (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) 4113 goto set_tail_append; 4114 4115 return; 4116 4117 set_tail_append: 4118 insert->ins_appending = APPEND_TAIL; 4119 } 4120 4121 /* 4122 * Helper function called at the begining of an insert. 4123 * 4124 * This computes a few things that are commonly used in the process of 4125 * inserting into the btree: 4126 * - Whether the new extent is contiguous with an existing one. 4127 * - The current tree depth. 4128 * - Whether the insert is an appending one. 4129 * - The total # of free records in the tree. 4130 * 4131 * All of the information is stored on the ocfs2_insert_type 4132 * structure. 4133 */ 4134 static int ocfs2_figure_insert_type(struct inode *inode, 4135 struct ocfs2_extent_tree *et, 4136 struct buffer_head **last_eb_bh, 4137 struct ocfs2_extent_rec *insert_rec, 4138 int *free_records, 4139 struct ocfs2_insert_type *insert) 4140 { 4141 int ret; 4142 struct ocfs2_extent_block *eb; 4143 struct ocfs2_extent_list *el; 4144 struct ocfs2_path *path = NULL; 4145 struct buffer_head *bh = NULL; 4146 4147 insert->ins_split = SPLIT_NONE; 4148 4149 el = et->root_el; 4150 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 4151 4152 if (el->l_tree_depth) { 4153 /* 4154 * If we have tree depth, we read in the 4155 * rightmost extent block ahead of time as 4156 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4157 * may want it later. 4158 */ 4159 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4160 ocfs2_get_last_eb_blk(et), &bh, 4161 OCFS2_BH_CACHED, inode); 4162 if (ret) { 4163 mlog_exit(ret); 4164 goto out; 4165 } 4166 eb = (struct ocfs2_extent_block *) bh->b_data; 4167 el = &eb->h_list; 4168 } 4169 4170 /* 4171 * Unless we have a contiguous insert, we'll need to know if 4172 * there is room left in our allocation tree for another 4173 * extent record. 4174 * 4175 * XXX: This test is simplistic, we can search for empty 4176 * extent records too. 4177 */ 4178 *free_records = le16_to_cpu(el->l_count) - 4179 le16_to_cpu(el->l_next_free_rec); 4180 4181 if (!insert->ins_tree_depth) { 4182 ocfs2_figure_contig_type(inode, insert, el, insert_rec); 4183 ocfs2_figure_appending_type(insert, el, insert_rec); 4184 return 0; 4185 } 4186 4187 path = ocfs2_new_path(et->root_bh, et->root_el); 4188 if (!path) { 4189 ret = -ENOMEM; 4190 mlog_errno(ret); 4191 goto out; 4192 } 4193 4194 /* 4195 * In the case that we're inserting past what the tree 4196 * currently accounts for, ocfs2_find_path() will return for 4197 * us the rightmost tree path. This is accounted for below in 4198 * the appending code. 4199 */ 4200 ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); 4201 if (ret) { 4202 mlog_errno(ret); 4203 goto out; 4204 } 4205 4206 el = path_leaf_el(path); 4207 4208 /* 4209 * Now that we have the path, there's two things we want to determine: 4210 * 1) Contiguousness (also set contig_index if this is so) 4211 * 4212 * 2) Are we doing an append? We can trivially break this up 4213 * into two types of appends: simple record append, or a 4214 * rotate inside the tail leaf. 4215 */ 4216 ocfs2_figure_contig_type(inode, insert, el, insert_rec); 4217 4218 /* 4219 * The insert code isn't quite ready to deal with all cases of 4220 * left contiguousness. Specifically, if it's an insert into 4221 * the 1st record in a leaf, it will require the adjustment of 4222 * cluster count on the last record of the path directly to it's 4223 * left. For now, just catch that case and fool the layers 4224 * above us. This works just fine for tree_depth == 0, which 4225 * is why we allow that above. 4226 */ 4227 if (insert->ins_contig == CONTIG_LEFT && 4228 insert->ins_contig_index == 0) 4229 insert->ins_contig = CONTIG_NONE; 4230 4231 /* 4232 * Ok, so we can simply compare against last_eb to figure out 4233 * whether the path doesn't exist. This will only happen in 4234 * the case that we're doing a tail append, so maybe we can 4235 * take advantage of that information somehow. 4236 */ 4237 if (ocfs2_get_last_eb_blk(et) == 4238 path_leaf_bh(path)->b_blocknr) { 4239 /* 4240 * Ok, ocfs2_find_path() returned us the rightmost 4241 * tree path. This might be an appending insert. There are 4242 * two cases: 4243 * 1) We're doing a true append at the tail: 4244 * -This might even be off the end of the leaf 4245 * 2) We're "appending" by rotating in the tail 4246 */ 4247 ocfs2_figure_appending_type(insert, el, insert_rec); 4248 } 4249 4250 out: 4251 ocfs2_free_path(path); 4252 4253 if (ret == 0) 4254 *last_eb_bh = bh; 4255 else 4256 brelse(bh); 4257 return ret; 4258 } 4259 4260 /* 4261 * Insert an extent into an inode btree. 4262 * 4263 * The caller needs to update fe->i_clusters 4264 */ 4265 static int ocfs2_insert_extent(struct ocfs2_super *osb, 4266 handle_t *handle, 4267 struct inode *inode, 4268 struct buffer_head *root_bh, 4269 u32 cpos, 4270 u64 start_blk, 4271 u32 new_clusters, 4272 u8 flags, 4273 struct ocfs2_alloc_context *meta_ac, 4274 struct ocfs2_extent_tree *et) 4275 { 4276 int status; 4277 int uninitialized_var(free_records); 4278 struct buffer_head *last_eb_bh = NULL; 4279 struct ocfs2_insert_type insert = {0, }; 4280 struct ocfs2_extent_rec rec; 4281 4282 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); 4283 4284 mlog(0, "add %u clusters at position %u to inode %llu\n", 4285 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4286 4287 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 4288 (OCFS2_I(inode)->ip_clusters != cpos), 4289 "Device %s, asking for sparse allocation: inode %llu, " 4290 "cpos %u, clusters %u\n", 4291 osb->dev_str, 4292 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, 4293 OCFS2_I(inode)->ip_clusters); 4294 4295 memset(&rec, 0, sizeof(rec)); 4296 rec.e_cpos = cpu_to_le32(cpos); 4297 rec.e_blkno = cpu_to_le64(start_blk); 4298 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 4299 rec.e_flags = flags; 4300 4301 status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, 4302 &free_records, &insert); 4303 if (status < 0) { 4304 mlog_errno(status); 4305 goto bail; 4306 } 4307 4308 mlog(0, "Insert.appending: %u, Insert.Contig: %u, " 4309 "Insert.contig_index: %d, Insert.free_records: %d, " 4310 "Insert.tree_depth: %d\n", 4311 insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 4312 free_records, insert.ins_tree_depth); 4313 4314 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4315 status = ocfs2_grow_tree(inode, handle, et, 4316 &insert.ins_tree_depth, &last_eb_bh, 4317 meta_ac); 4318 if (status) { 4319 mlog_errno(status); 4320 goto bail; 4321 } 4322 } 4323 4324 /* Finally, we can add clusters. This might rotate the tree for us. */ 4325 status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); 4326 if (status < 0) 4327 mlog_errno(status); 4328 else if (et->type == OCFS2_DINODE_EXTENT) 4329 ocfs2_extent_map_insert_rec(inode, &rec); 4330 4331 bail: 4332 if (last_eb_bh) 4333 brelse(last_eb_bh); 4334 4335 mlog_exit(status); 4336 return status; 4337 } 4338 4339 int ocfs2_dinode_insert_extent(struct ocfs2_super *osb, 4340 handle_t *handle, 4341 struct inode *inode, 4342 struct buffer_head *root_bh, 4343 u32 cpos, 4344 u64 start_blk, 4345 u32 new_clusters, 4346 u8 flags, 4347 struct ocfs2_alloc_context *meta_ac) 4348 { 4349 int status; 4350 struct ocfs2_extent_tree *et = NULL; 4351 4352 et = ocfs2_new_extent_tree(root_bh, OCFS2_DINODE_EXTENT, NULL); 4353 if (!et) { 4354 status = -ENOMEM; 4355 mlog_errno(status); 4356 goto bail; 4357 } 4358 4359 status = ocfs2_insert_extent(osb, handle, inode, root_bh, 4360 cpos, start_blk, new_clusters, 4361 flags, meta_ac, et); 4362 4363 if (et) 4364 ocfs2_free_extent_tree(et); 4365 bail: 4366 return status; 4367 } 4368 4369 int ocfs2_xattr_value_insert_extent(struct ocfs2_super *osb, 4370 handle_t *handle, 4371 struct inode *inode, 4372 struct buffer_head *root_bh, 4373 u32 cpos, 4374 u64 start_blk, 4375 u32 new_clusters, 4376 u8 flags, 4377 struct ocfs2_alloc_context *meta_ac, 4378 void *private) 4379 { 4380 int status; 4381 struct ocfs2_extent_tree *et = NULL; 4382 4383 et = ocfs2_new_extent_tree(root_bh, OCFS2_XATTR_VALUE_EXTENT, private); 4384 if (!et) { 4385 status = -ENOMEM; 4386 mlog_errno(status); 4387 goto bail; 4388 } 4389 4390 status = ocfs2_insert_extent(osb, handle, inode, root_bh, 4391 cpos, start_blk, new_clusters, 4392 flags, meta_ac, et); 4393 4394 if (et) 4395 ocfs2_free_extent_tree(et); 4396 bail: 4397 return status; 4398 } 4399 4400 /* 4401 * Allcate and add clusters into the extent b-tree. 4402 * The new clusters(clusters_to_add) will be inserted at logical_offset. 4403 * The extent b-tree's root is root_el and it should be in root_bh, and 4404 * it is not limited to the file storage. Any extent tree can use this 4405 * function if it implements the proper ocfs2_extent_tree. 4406 */ 4407 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, 4408 struct inode *inode, 4409 u32 *logical_offset, 4410 u32 clusters_to_add, 4411 int mark_unwritten, 4412 struct buffer_head *root_bh, 4413 struct ocfs2_extent_list *root_el, 4414 handle_t *handle, 4415 struct ocfs2_alloc_context *data_ac, 4416 struct ocfs2_alloc_context *meta_ac, 4417 enum ocfs2_alloc_restarted *reason_ret, 4418 enum ocfs2_extent_tree_type type, 4419 void *private) 4420 { 4421 int status = 0; 4422 int free_extents; 4423 enum ocfs2_alloc_restarted reason = RESTART_NONE; 4424 u32 bit_off, num_bits; 4425 u64 block; 4426 u8 flags = 0; 4427 4428 BUG_ON(!clusters_to_add); 4429 4430 if (mark_unwritten) 4431 flags = OCFS2_EXT_UNWRITTEN; 4432 4433 free_extents = ocfs2_num_free_extents(osb, inode, root_bh, type, 4434 private); 4435 if (free_extents < 0) { 4436 status = free_extents; 4437 mlog_errno(status); 4438 goto leave; 4439 } 4440 4441 /* there are two cases which could cause us to EAGAIN in the 4442 * we-need-more-metadata case: 4443 * 1) we haven't reserved *any* 4444 * 2) we are so fragmented, we've needed to add metadata too 4445 * many times. */ 4446 if (!free_extents && !meta_ac) { 4447 mlog(0, "we haven't reserved any metadata!\n"); 4448 status = -EAGAIN; 4449 reason = RESTART_META; 4450 goto leave; 4451 } else if ((!free_extents) 4452 && (ocfs2_alloc_context_bits_left(meta_ac) 4453 < ocfs2_extend_meta_needed(root_el))) { 4454 mlog(0, "filesystem is really fragmented...\n"); 4455 status = -EAGAIN; 4456 reason = RESTART_META; 4457 goto leave; 4458 } 4459 4460 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4461 clusters_to_add, &bit_off, &num_bits); 4462 if (status < 0) { 4463 if (status != -ENOSPC) 4464 mlog_errno(status); 4465 goto leave; 4466 } 4467 4468 BUG_ON(num_bits > clusters_to_add); 4469 4470 /* reserve our write early -- insert_extent may update the inode */ 4471 status = ocfs2_journal_access(handle, inode, root_bh, 4472 OCFS2_JOURNAL_ACCESS_WRITE); 4473 if (status < 0) { 4474 mlog_errno(status); 4475 goto leave; 4476 } 4477 4478 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 4479 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 4480 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4481 if (type == OCFS2_DINODE_EXTENT) 4482 status = ocfs2_dinode_insert_extent(osb, handle, inode, root_bh, 4483 *logical_offset, block, 4484 num_bits, flags, meta_ac); 4485 else 4486 status = ocfs2_xattr_value_insert_extent(osb, handle, 4487 inode, root_bh, 4488 *logical_offset, 4489 block, num_bits, flags, 4490 meta_ac, private); 4491 if (status < 0) { 4492 mlog_errno(status); 4493 goto leave; 4494 } 4495 4496 status = ocfs2_journal_dirty(handle, root_bh); 4497 if (status < 0) { 4498 mlog_errno(status); 4499 goto leave; 4500 } 4501 4502 clusters_to_add -= num_bits; 4503 *logical_offset += num_bits; 4504 4505 if (clusters_to_add) { 4506 mlog(0, "need to alloc once more, wanted = %u\n", 4507 clusters_to_add); 4508 status = -EAGAIN; 4509 reason = RESTART_TRANS; 4510 } 4511 4512 leave: 4513 mlog_exit(status); 4514 if (reason_ret) 4515 *reason_ret = reason; 4516 return status; 4517 } 4518 4519 static void ocfs2_make_right_split_rec(struct super_block *sb, 4520 struct ocfs2_extent_rec *split_rec, 4521 u32 cpos, 4522 struct ocfs2_extent_rec *rec) 4523 { 4524 u32 rec_cpos = le32_to_cpu(rec->e_cpos); 4525 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters); 4526 4527 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec)); 4528 4529 split_rec->e_cpos = cpu_to_le32(cpos); 4530 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos); 4531 4532 split_rec->e_blkno = rec->e_blkno; 4533 le64_add_cpu(&split_rec->e_blkno, 4534 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos)); 4535 4536 split_rec->e_flags = rec->e_flags; 4537 } 4538 4539 static int ocfs2_split_and_insert(struct inode *inode, 4540 handle_t *handle, 4541 struct ocfs2_path *path, 4542 struct ocfs2_extent_tree *et, 4543 struct buffer_head **last_eb_bh, 4544 int split_index, 4545 struct ocfs2_extent_rec *orig_split_rec, 4546 struct ocfs2_alloc_context *meta_ac) 4547 { 4548 int ret = 0, depth; 4549 unsigned int insert_range, rec_range, do_leftright = 0; 4550 struct ocfs2_extent_rec tmprec; 4551 struct ocfs2_extent_list *rightmost_el; 4552 struct ocfs2_extent_rec rec; 4553 struct ocfs2_extent_rec split_rec = *orig_split_rec; 4554 struct ocfs2_insert_type insert; 4555 struct ocfs2_extent_block *eb; 4556 4557 leftright: 4558 /* 4559 * Store a copy of the record on the stack - it might move 4560 * around as the tree is manipulated below. 4561 */ 4562 rec = path_leaf_el(path)->l_recs[split_index]; 4563 4564 rightmost_el = et->root_el; 4565 4566 depth = le16_to_cpu(rightmost_el->l_tree_depth); 4567 if (depth) { 4568 BUG_ON(!(*last_eb_bh)); 4569 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 4570 rightmost_el = &eb->h_list; 4571 } 4572 4573 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4574 le16_to_cpu(rightmost_el->l_count)) { 4575 ret = ocfs2_grow_tree(inode, handle, et, 4576 &depth, last_eb_bh, meta_ac); 4577 if (ret) { 4578 mlog_errno(ret); 4579 goto out; 4580 } 4581 } 4582 4583 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 4584 insert.ins_appending = APPEND_NONE; 4585 insert.ins_contig = CONTIG_NONE; 4586 insert.ins_tree_depth = depth; 4587 4588 insert_range = le32_to_cpu(split_rec.e_cpos) + 4589 le16_to_cpu(split_rec.e_leaf_clusters); 4590 rec_range = le32_to_cpu(rec.e_cpos) + 4591 le16_to_cpu(rec.e_leaf_clusters); 4592 4593 if (split_rec.e_cpos == rec.e_cpos) { 4594 insert.ins_split = SPLIT_LEFT; 4595 } else if (insert_range == rec_range) { 4596 insert.ins_split = SPLIT_RIGHT; 4597 } else { 4598 /* 4599 * Left/right split. We fake this as a right split 4600 * first and then make a second pass as a left split. 4601 */ 4602 insert.ins_split = SPLIT_RIGHT; 4603 4604 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range, 4605 &rec); 4606 4607 split_rec = tmprec; 4608 4609 BUG_ON(do_leftright); 4610 do_leftright = 1; 4611 } 4612 4613 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 4614 if (ret) { 4615 mlog_errno(ret); 4616 goto out; 4617 } 4618 4619 if (do_leftright == 1) { 4620 u32 cpos; 4621 struct ocfs2_extent_list *el; 4622 4623 do_leftright++; 4624 split_rec = *orig_split_rec; 4625 4626 ocfs2_reinit_path(path, 1); 4627 4628 cpos = le32_to_cpu(split_rec.e_cpos); 4629 ret = ocfs2_find_path(inode, path, cpos); 4630 if (ret) { 4631 mlog_errno(ret); 4632 goto out; 4633 } 4634 4635 el = path_leaf_el(path); 4636 split_index = ocfs2_search_extent_list(el, cpos); 4637 goto leftright; 4638 } 4639 out: 4640 4641 return ret; 4642 } 4643 4644 /* 4645 * Mark part or all of the extent record at split_index in the leaf 4646 * pointed to by path as written. This removes the unwritten 4647 * extent flag. 4648 * 4649 * Care is taken to handle contiguousness so as to not grow the tree. 4650 * 4651 * meta_ac is not strictly necessary - we only truly need it if growth 4652 * of the tree is required. All other cases will degrade into a less 4653 * optimal tree layout. 4654 * 4655 * last_eb_bh should be the rightmost leaf block for any extent 4656 * btree. Since a split may grow the tree or a merge might shrink it, 4657 * the caller cannot trust the contents of that buffer after this call. 4658 * 4659 * This code is optimized for readability - several passes might be 4660 * made over certain portions of the tree. All of those blocks will 4661 * have been brought into cache (and pinned via the journal), so the 4662 * extra overhead is not expressed in terms of disk reads. 4663 */ 4664 static int __ocfs2_mark_extent_written(struct inode *inode, 4665 struct ocfs2_extent_tree *et, 4666 handle_t *handle, 4667 struct ocfs2_path *path, 4668 int split_index, 4669 struct ocfs2_extent_rec *split_rec, 4670 struct ocfs2_alloc_context *meta_ac, 4671 struct ocfs2_cached_dealloc_ctxt *dealloc) 4672 { 4673 int ret = 0; 4674 struct ocfs2_extent_list *el = path_leaf_el(path); 4675 struct buffer_head *last_eb_bh = NULL; 4676 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 4677 struct ocfs2_merge_ctxt ctxt; 4678 struct ocfs2_extent_list *rightmost_el; 4679 4680 if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) { 4681 ret = -EIO; 4682 mlog_errno(ret); 4683 goto out; 4684 } 4685 4686 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || 4687 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < 4688 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { 4689 ret = -EIO; 4690 mlog_errno(ret); 4691 goto out; 4692 } 4693 4694 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el, 4695 split_index, 4696 split_rec); 4697 4698 /* 4699 * The core merge / split code wants to know how much room is 4700 * left in this inodes allocation tree, so we pass the 4701 * rightmost extent list. 4702 */ 4703 if (path->p_tree_depth) { 4704 struct ocfs2_extent_block *eb; 4705 4706 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4707 ocfs2_get_last_eb_blk(et), 4708 &last_eb_bh, OCFS2_BH_CACHED, inode); 4709 if (ret) { 4710 mlog_exit(ret); 4711 goto out; 4712 } 4713 4714 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4715 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 4716 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 4717 ret = -EROFS; 4718 goto out; 4719 } 4720 4721 rightmost_el = &eb->h_list; 4722 } else 4723 rightmost_el = path_root_el(path); 4724 4725 if (rec->e_cpos == split_rec->e_cpos && 4726 rec->e_leaf_clusters == split_rec->e_leaf_clusters) 4727 ctxt.c_split_covers_rec = 1; 4728 else 4729 ctxt.c_split_covers_rec = 0; 4730 4731 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); 4732 4733 mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n", 4734 split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent, 4735 ctxt.c_split_covers_rec); 4736 4737 if (ctxt.c_contig_type == CONTIG_NONE) { 4738 if (ctxt.c_split_covers_rec) 4739 el->l_recs[split_index] = *split_rec; 4740 else 4741 ret = ocfs2_split_and_insert(inode, handle, path, et, 4742 &last_eb_bh, split_index, 4743 split_rec, meta_ac); 4744 if (ret) 4745 mlog_errno(ret); 4746 } else { 4747 ret = ocfs2_try_to_merge_extent(inode, handle, path, 4748 split_index, split_rec, 4749 dealloc, &ctxt, et); 4750 if (ret) 4751 mlog_errno(ret); 4752 } 4753 4754 out: 4755 brelse(last_eb_bh); 4756 return ret; 4757 } 4758 4759 /* 4760 * Mark the already-existing extent at cpos as written for len clusters. 4761 * 4762 * If the existing extent is larger than the request, initiate a 4763 * split. An attempt will be made at merging with adjacent extents. 4764 * 4765 * The caller is responsible for passing down meta_ac if we'll need it. 4766 */ 4767 int ocfs2_mark_extent_written(struct inode *inode, struct buffer_head *root_bh, 4768 handle_t *handle, u32 cpos, u32 len, u32 phys, 4769 struct ocfs2_alloc_context *meta_ac, 4770 struct ocfs2_cached_dealloc_ctxt *dealloc, 4771 enum ocfs2_extent_tree_type et_type, 4772 void *private) 4773 { 4774 int ret, index; 4775 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); 4776 struct ocfs2_extent_rec split_rec; 4777 struct ocfs2_path *left_path = NULL; 4778 struct ocfs2_extent_list *el; 4779 struct ocfs2_extent_tree *et = NULL; 4780 4781 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n", 4782 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno); 4783 4784 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { 4785 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " 4786 "that are being written to, but the feature bit " 4787 "is not set in the super block.", 4788 (unsigned long long)OCFS2_I(inode)->ip_blkno); 4789 ret = -EROFS; 4790 goto out; 4791 } 4792 4793 et = ocfs2_new_extent_tree(root_bh, et_type, private); 4794 if (!et) { 4795 ret = -ENOMEM; 4796 mlog_errno(ret); 4797 goto out; 4798 } 4799 4800 /* 4801 * XXX: This should be fixed up so that we just re-insert the 4802 * next extent records. 4803 */ 4804 if (et_type == OCFS2_DINODE_EXTENT) 4805 ocfs2_extent_map_trunc(inode, 0); 4806 4807 left_path = ocfs2_new_path(et->root_bh, et->root_el); 4808 if (!left_path) { 4809 ret = -ENOMEM; 4810 mlog_errno(ret); 4811 goto out; 4812 } 4813 4814 ret = ocfs2_find_path(inode, left_path, cpos); 4815 if (ret) { 4816 mlog_errno(ret); 4817 goto out; 4818 } 4819 el = path_leaf_el(left_path); 4820 4821 index = ocfs2_search_extent_list(el, cpos); 4822 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 4823 ocfs2_error(inode->i_sb, 4824 "Inode %llu has an extent at cpos %u which can no " 4825 "longer be found.\n", 4826 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 4827 ret = -EROFS; 4828 goto out; 4829 } 4830 4831 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); 4832 split_rec.e_cpos = cpu_to_le32(cpos); 4833 split_rec.e_leaf_clusters = cpu_to_le16(len); 4834 split_rec.e_blkno = cpu_to_le64(start_blkno); 4835 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; 4836 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; 4837 4838 ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, 4839 index, &split_rec, meta_ac, 4840 dealloc); 4841 if (ret) 4842 mlog_errno(ret); 4843 4844 out: 4845 ocfs2_free_path(left_path); 4846 if (et) 4847 ocfs2_free_extent_tree(et); 4848 return ret; 4849 } 4850 4851 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, 4852 handle_t *handle, struct ocfs2_path *path, 4853 int index, u32 new_range, 4854 struct ocfs2_alloc_context *meta_ac) 4855 { 4856 int ret, depth, credits = handle->h_buffer_credits; 4857 struct buffer_head *last_eb_bh = NULL; 4858 struct ocfs2_extent_block *eb; 4859 struct ocfs2_extent_list *rightmost_el, *el; 4860 struct ocfs2_extent_rec split_rec; 4861 struct ocfs2_extent_rec *rec; 4862 struct ocfs2_insert_type insert; 4863 4864 /* 4865 * Setup the record to split before we grow the tree. 4866 */ 4867 el = path_leaf_el(path); 4868 rec = &el->l_recs[index]; 4869 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec); 4870 4871 depth = path->p_tree_depth; 4872 if (depth > 0) { 4873 ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 4874 ocfs2_get_last_eb_blk(et), 4875 &last_eb_bh, OCFS2_BH_CACHED, inode); 4876 if (ret < 0) { 4877 mlog_errno(ret); 4878 goto out; 4879 } 4880 4881 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4882 rightmost_el = &eb->h_list; 4883 } else 4884 rightmost_el = path_leaf_el(path); 4885 4886 credits += path->p_tree_depth + 4887 ocfs2_extend_meta_needed(et->root_el); 4888 ret = ocfs2_extend_trans(handle, credits); 4889 if (ret) { 4890 mlog_errno(ret); 4891 goto out; 4892 } 4893 4894 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4895 le16_to_cpu(rightmost_el->l_count)) { 4896 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, 4897 meta_ac); 4898 if (ret) { 4899 mlog_errno(ret); 4900 goto out; 4901 } 4902 } 4903 4904 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 4905 insert.ins_appending = APPEND_NONE; 4906 insert.ins_contig = CONTIG_NONE; 4907 insert.ins_split = SPLIT_RIGHT; 4908 insert.ins_tree_depth = depth; 4909 4910 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 4911 if (ret) 4912 mlog_errno(ret); 4913 4914 out: 4915 brelse(last_eb_bh); 4916 return ret; 4917 } 4918 4919 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, 4920 struct ocfs2_path *path, int index, 4921 struct ocfs2_cached_dealloc_ctxt *dealloc, 4922 u32 cpos, u32 len, 4923 struct ocfs2_extent_tree *et) 4924 { 4925 int ret; 4926 u32 left_cpos, rec_range, trunc_range; 4927 int wants_rotate = 0, is_rightmost_tree_rec = 0; 4928 struct super_block *sb = inode->i_sb; 4929 struct ocfs2_path *left_path = NULL; 4930 struct ocfs2_extent_list *el = path_leaf_el(path); 4931 struct ocfs2_extent_rec *rec; 4932 struct ocfs2_extent_block *eb; 4933 4934 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 4935 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 4936 if (ret) { 4937 mlog_errno(ret); 4938 goto out; 4939 } 4940 4941 index--; 4942 } 4943 4944 if (index == (le16_to_cpu(el->l_next_free_rec) - 1) && 4945 path->p_tree_depth) { 4946 /* 4947 * Check whether this is the rightmost tree record. If 4948 * we remove all of this record or part of its right 4949 * edge then an update of the record lengths above it 4950 * will be required. 4951 */ 4952 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 4953 if (eb->h_next_leaf_blk == 0) 4954 is_rightmost_tree_rec = 1; 4955 } 4956 4957 rec = &el->l_recs[index]; 4958 if (index == 0 && path->p_tree_depth && 4959 le32_to_cpu(rec->e_cpos) == cpos) { 4960 /* 4961 * Changing the leftmost offset (via partial or whole 4962 * record truncate) of an interior (or rightmost) path 4963 * means we have to update the subtree that is formed 4964 * by this leaf and the one to it's left. 4965 * 4966 * There are two cases we can skip: 4967 * 1) Path is the leftmost one in our inode tree. 4968 * 2) The leaf is rightmost and will be empty after 4969 * we remove the extent record - the rotate code 4970 * knows how to update the newly formed edge. 4971 */ 4972 4973 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, 4974 &left_cpos); 4975 if (ret) { 4976 mlog_errno(ret); 4977 goto out; 4978 } 4979 4980 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { 4981 left_path = ocfs2_new_path(path_root_bh(path), 4982 path_root_el(path)); 4983 if (!left_path) { 4984 ret = -ENOMEM; 4985 mlog_errno(ret); 4986 goto out; 4987 } 4988 4989 ret = ocfs2_find_path(inode, left_path, left_cpos); 4990 if (ret) { 4991 mlog_errno(ret); 4992 goto out; 4993 } 4994 } 4995 } 4996 4997 ret = ocfs2_extend_rotate_transaction(handle, 0, 4998 handle->h_buffer_credits, 4999 path); 5000 if (ret) { 5001 mlog_errno(ret); 5002 goto out; 5003 } 5004 5005 ret = ocfs2_journal_access_path(inode, handle, path); 5006 if (ret) { 5007 mlog_errno(ret); 5008 goto out; 5009 } 5010 5011 ret = ocfs2_journal_access_path(inode, handle, left_path); 5012 if (ret) { 5013 mlog_errno(ret); 5014 goto out; 5015 } 5016 5017 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 5018 trunc_range = cpos + len; 5019 5020 if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) { 5021 int next_free; 5022 5023 memset(rec, 0, sizeof(*rec)); 5024 ocfs2_cleanup_merge(el, index); 5025 wants_rotate = 1; 5026 5027 next_free = le16_to_cpu(el->l_next_free_rec); 5028 if (is_rightmost_tree_rec && next_free > 1) { 5029 /* 5030 * We skip the edge update if this path will 5031 * be deleted by the rotate code. 5032 */ 5033 rec = &el->l_recs[next_free - 1]; 5034 ocfs2_adjust_rightmost_records(inode, handle, path, 5035 rec); 5036 } 5037 } else if (le32_to_cpu(rec->e_cpos) == cpos) { 5038 /* Remove leftmost portion of the record. */ 5039 le32_add_cpu(&rec->e_cpos, len); 5040 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len)); 5041 le16_add_cpu(&rec->e_leaf_clusters, -len); 5042 } else if (rec_range == trunc_range) { 5043 /* Remove rightmost portion of the record */ 5044 le16_add_cpu(&rec->e_leaf_clusters, -len); 5045 if (is_rightmost_tree_rec) 5046 ocfs2_adjust_rightmost_records(inode, handle, path, rec); 5047 } else { 5048 /* Caller should have trapped this. */ 5049 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) " 5050 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, 5051 le32_to_cpu(rec->e_cpos), 5052 le16_to_cpu(rec->e_leaf_clusters), cpos, len); 5053 BUG(); 5054 } 5055 5056 if (left_path) { 5057 int subtree_index; 5058 5059 subtree_index = ocfs2_find_subtree_root(inode, left_path, path); 5060 ocfs2_complete_edge_insert(inode, handle, left_path, path, 5061 subtree_index); 5062 } 5063 5064 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 5065 5066 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 5067 if (ret) { 5068 mlog_errno(ret); 5069 goto out; 5070 } 5071 5072 out: 5073 ocfs2_free_path(left_path); 5074 return ret; 5075 } 5076 5077 int ocfs2_remove_extent(struct inode *inode, struct buffer_head *root_bh, 5078 u32 cpos, u32 len, handle_t *handle, 5079 struct ocfs2_alloc_context *meta_ac, 5080 struct ocfs2_cached_dealloc_ctxt *dealloc, 5081 enum ocfs2_extent_tree_type et_type, 5082 void *private) 5083 { 5084 int ret, index; 5085 u32 rec_range, trunc_range; 5086 struct ocfs2_extent_rec *rec; 5087 struct ocfs2_extent_list *el; 5088 struct ocfs2_path *path = NULL; 5089 struct ocfs2_extent_tree *et = NULL; 5090 5091 et = ocfs2_new_extent_tree(root_bh, et_type, private); 5092 if (!et) { 5093 ret = -ENOMEM; 5094 mlog_errno(ret); 5095 goto out; 5096 } 5097 5098 ocfs2_extent_map_trunc(inode, 0); 5099 5100 path = ocfs2_new_path(et->root_bh, et->root_el); 5101 if (!path) { 5102 ret = -ENOMEM; 5103 mlog_errno(ret); 5104 goto out; 5105 } 5106 5107 ret = ocfs2_find_path(inode, path, cpos); 5108 if (ret) { 5109 mlog_errno(ret); 5110 goto out; 5111 } 5112 5113 el = path_leaf_el(path); 5114 index = ocfs2_search_extent_list(el, cpos); 5115 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5116 ocfs2_error(inode->i_sb, 5117 "Inode %llu has an extent at cpos %u which can no " 5118 "longer be found.\n", 5119 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 5120 ret = -EROFS; 5121 goto out; 5122 } 5123 5124 /* 5125 * We have 3 cases of extent removal: 5126 * 1) Range covers the entire extent rec 5127 * 2) Range begins or ends on one edge of the extent rec 5128 * 3) Range is in the middle of the extent rec (no shared edges) 5129 * 5130 * For case 1 we remove the extent rec and left rotate to 5131 * fill the hole. 5132 * 5133 * For case 2 we just shrink the existing extent rec, with a 5134 * tree update if the shrinking edge is also the edge of an 5135 * extent block. 5136 * 5137 * For case 3 we do a right split to turn the extent rec into 5138 * something case 2 can handle. 5139 */ 5140 rec = &el->l_recs[index]; 5141 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 5142 trunc_range = cpos + len; 5143 5144 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); 5145 5146 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d " 5147 "(cpos %u, len %u)\n", 5148 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index, 5149 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); 5150 5151 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5152 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5153 cpos, len, et); 5154 if (ret) { 5155 mlog_errno(ret); 5156 goto out; 5157 } 5158 } else { 5159 ret = ocfs2_split_tree(inode, et, handle, path, index, 5160 trunc_range, meta_ac); 5161 if (ret) { 5162 mlog_errno(ret); 5163 goto out; 5164 } 5165 5166 /* 5167 * The split could have manipulated the tree enough to 5168 * move the record location, so we have to look for it again. 5169 */ 5170 ocfs2_reinit_path(path, 1); 5171 5172 ret = ocfs2_find_path(inode, path, cpos); 5173 if (ret) { 5174 mlog_errno(ret); 5175 goto out; 5176 } 5177 5178 el = path_leaf_el(path); 5179 index = ocfs2_search_extent_list(el, cpos); 5180 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5181 ocfs2_error(inode->i_sb, 5182 "Inode %llu: split at cpos %u lost record.", 5183 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5184 cpos); 5185 ret = -EROFS; 5186 goto out; 5187 } 5188 5189 /* 5190 * Double check our values here. If anything is fishy, 5191 * it's easier to catch it at the top level. 5192 */ 5193 rec = &el->l_recs[index]; 5194 rec_range = le32_to_cpu(rec->e_cpos) + 5195 ocfs2_rec_clusters(el, rec); 5196 if (rec_range != trunc_range) { 5197 ocfs2_error(inode->i_sb, 5198 "Inode %llu: error after split at cpos %u" 5199 "trunc len %u, existing record is (%u,%u)", 5200 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5201 cpos, len, le32_to_cpu(rec->e_cpos), 5202 ocfs2_rec_clusters(el, rec)); 5203 ret = -EROFS; 5204 goto out; 5205 } 5206 5207 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5208 cpos, len, et); 5209 if (ret) { 5210 mlog_errno(ret); 5211 goto out; 5212 } 5213 } 5214 5215 out: 5216 ocfs2_free_path(path); 5217 if (et) 5218 ocfs2_free_extent_tree(et); 5219 return ret; 5220 } 5221 5222 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 5223 { 5224 struct buffer_head *tl_bh = osb->osb_tl_bh; 5225 struct ocfs2_dinode *di; 5226 struct ocfs2_truncate_log *tl; 5227 5228 di = (struct ocfs2_dinode *) tl_bh->b_data; 5229 tl = &di->id2.i_dealloc; 5230 5231 mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), 5232 "slot %d, invalid truncate log parameters: used = " 5233 "%u, count = %u\n", osb->slot_num, 5234 le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); 5235 return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); 5236 } 5237 5238 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, 5239 unsigned int new_start) 5240 { 5241 unsigned int tail_index; 5242 unsigned int current_tail; 5243 5244 /* No records, nothing to coalesce */ 5245 if (!le16_to_cpu(tl->tl_used)) 5246 return 0; 5247 5248 tail_index = le16_to_cpu(tl->tl_used) - 1; 5249 current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); 5250 current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); 5251 5252 return current_tail == new_start; 5253 } 5254 5255 int ocfs2_truncate_log_append(struct ocfs2_super *osb, 5256 handle_t *handle, 5257 u64 start_blk, 5258 unsigned int num_clusters) 5259 { 5260 int status, index; 5261 unsigned int start_cluster, tl_count; 5262 struct inode *tl_inode = osb->osb_tl_inode; 5263 struct buffer_head *tl_bh = osb->osb_tl_bh; 5264 struct ocfs2_dinode *di; 5265 struct ocfs2_truncate_log *tl; 5266 5267 mlog_entry("start_blk = %llu, num_clusters = %u\n", 5268 (unsigned long long)start_blk, num_clusters); 5269 5270 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5271 5272 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 5273 5274 di = (struct ocfs2_dinode *) tl_bh->b_data; 5275 tl = &di->id2.i_dealloc; 5276 if (!OCFS2_IS_VALID_DINODE(di)) { 5277 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); 5278 status = -EIO; 5279 goto bail; 5280 } 5281 5282 tl_count = le16_to_cpu(tl->tl_count); 5283 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || 5284 tl_count == 0, 5285 "Truncate record count on #%llu invalid " 5286 "wanted %u, actual %u\n", 5287 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, 5288 ocfs2_truncate_recs_per_inode(osb->sb), 5289 le16_to_cpu(tl->tl_count)); 5290 5291 /* Caller should have known to flush before calling us. */ 5292 index = le16_to_cpu(tl->tl_used); 5293 if (index >= tl_count) { 5294 status = -ENOSPC; 5295 mlog_errno(status); 5296 goto bail; 5297 } 5298 5299 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5300 OCFS2_JOURNAL_ACCESS_WRITE); 5301 if (status < 0) { 5302 mlog_errno(status); 5303 goto bail; 5304 } 5305 5306 mlog(0, "Log truncate of %u clusters starting at cluster %u to " 5307 "%llu (index = %d)\n", num_clusters, start_cluster, 5308 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index); 5309 5310 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { 5311 /* 5312 * Move index back to the record we are coalescing with. 5313 * ocfs2_truncate_log_can_coalesce() guarantees nonzero 5314 */ 5315 index--; 5316 5317 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); 5318 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", 5319 index, le32_to_cpu(tl->tl_recs[index].t_start), 5320 num_clusters); 5321 } else { 5322 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); 5323 tl->tl_used = cpu_to_le16(index + 1); 5324 } 5325 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5326 5327 status = ocfs2_journal_dirty(handle, tl_bh); 5328 if (status < 0) { 5329 mlog_errno(status); 5330 goto bail; 5331 } 5332 5333 bail: 5334 mlog_exit(status); 5335 return status; 5336 } 5337 5338 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, 5339 handle_t *handle, 5340 struct inode *data_alloc_inode, 5341 struct buffer_head *data_alloc_bh) 5342 { 5343 int status = 0; 5344 int i; 5345 unsigned int num_clusters; 5346 u64 start_blk; 5347 struct ocfs2_truncate_rec rec; 5348 struct ocfs2_dinode *di; 5349 struct ocfs2_truncate_log *tl; 5350 struct inode *tl_inode = osb->osb_tl_inode; 5351 struct buffer_head *tl_bh = osb->osb_tl_bh; 5352 5353 mlog_entry_void(); 5354 5355 di = (struct ocfs2_dinode *) tl_bh->b_data; 5356 tl = &di->id2.i_dealloc; 5357 i = le16_to_cpu(tl->tl_used) - 1; 5358 while (i >= 0) { 5359 /* Caller has given us at least enough credits to 5360 * update the truncate log dinode */ 5361 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5362 OCFS2_JOURNAL_ACCESS_WRITE); 5363 if (status < 0) { 5364 mlog_errno(status); 5365 goto bail; 5366 } 5367 5368 tl->tl_used = cpu_to_le16(i); 5369 5370 status = ocfs2_journal_dirty(handle, tl_bh); 5371 if (status < 0) { 5372 mlog_errno(status); 5373 goto bail; 5374 } 5375 5376 /* TODO: Perhaps we can calculate the bulk of the 5377 * credits up front rather than extending like 5378 * this. */ 5379 status = ocfs2_extend_trans(handle, 5380 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); 5381 if (status < 0) { 5382 mlog_errno(status); 5383 goto bail; 5384 } 5385 5386 rec = tl->tl_recs[i]; 5387 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 5388 le32_to_cpu(rec.t_start)); 5389 num_clusters = le32_to_cpu(rec.t_clusters); 5390 5391 /* if start_blk is not set, we ignore the record as 5392 * invalid. */ 5393 if (start_blk) { 5394 mlog(0, "free record %d, start = %u, clusters = %u\n", 5395 i, le32_to_cpu(rec.t_start), num_clusters); 5396 5397 status = ocfs2_free_clusters(handle, data_alloc_inode, 5398 data_alloc_bh, start_blk, 5399 num_clusters); 5400 if (status < 0) { 5401 mlog_errno(status); 5402 goto bail; 5403 } 5404 } 5405 i--; 5406 } 5407 5408 bail: 5409 mlog_exit(status); 5410 return status; 5411 } 5412 5413 /* Expects you to already be holding tl_inode->i_mutex */ 5414 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 5415 { 5416 int status; 5417 unsigned int num_to_flush; 5418 handle_t *handle; 5419 struct inode *tl_inode = osb->osb_tl_inode; 5420 struct inode *data_alloc_inode = NULL; 5421 struct buffer_head *tl_bh = osb->osb_tl_bh; 5422 struct buffer_head *data_alloc_bh = NULL; 5423 struct ocfs2_dinode *di; 5424 struct ocfs2_truncate_log *tl; 5425 5426 mlog_entry_void(); 5427 5428 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5429 5430 di = (struct ocfs2_dinode *) tl_bh->b_data; 5431 tl = &di->id2.i_dealloc; 5432 if (!OCFS2_IS_VALID_DINODE(di)) { 5433 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); 5434 status = -EIO; 5435 goto out; 5436 } 5437 5438 num_to_flush = le16_to_cpu(tl->tl_used); 5439 mlog(0, "Flush %u records from truncate log #%llu\n", 5440 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 5441 if (!num_to_flush) { 5442 status = 0; 5443 goto out; 5444 } 5445 5446 data_alloc_inode = ocfs2_get_system_file_inode(osb, 5447 GLOBAL_BITMAP_SYSTEM_INODE, 5448 OCFS2_INVALID_SLOT); 5449 if (!data_alloc_inode) { 5450 status = -EINVAL; 5451 mlog(ML_ERROR, "Could not get bitmap inode!\n"); 5452 goto out; 5453 } 5454 5455 mutex_lock(&data_alloc_inode->i_mutex); 5456 5457 status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); 5458 if (status < 0) { 5459 mlog_errno(status); 5460 goto out_mutex; 5461 } 5462 5463 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 5464 if (IS_ERR(handle)) { 5465 status = PTR_ERR(handle); 5466 mlog_errno(status); 5467 goto out_unlock; 5468 } 5469 5470 status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, 5471 data_alloc_bh); 5472 if (status < 0) 5473 mlog_errno(status); 5474 5475 ocfs2_commit_trans(osb, handle); 5476 5477 out_unlock: 5478 brelse(data_alloc_bh); 5479 ocfs2_inode_unlock(data_alloc_inode, 1); 5480 5481 out_mutex: 5482 mutex_unlock(&data_alloc_inode->i_mutex); 5483 iput(data_alloc_inode); 5484 5485 out: 5486 mlog_exit(status); 5487 return status; 5488 } 5489 5490 int ocfs2_flush_truncate_log(struct ocfs2_super *osb) 5491 { 5492 int status; 5493 struct inode *tl_inode = osb->osb_tl_inode; 5494 5495 mutex_lock(&tl_inode->i_mutex); 5496 status = __ocfs2_flush_truncate_log(osb); 5497 mutex_unlock(&tl_inode->i_mutex); 5498 5499 return status; 5500 } 5501 5502 static void ocfs2_truncate_log_worker(struct work_struct *work) 5503 { 5504 int status; 5505 struct ocfs2_super *osb = 5506 container_of(work, struct ocfs2_super, 5507 osb_truncate_log_wq.work); 5508 5509 mlog_entry_void(); 5510 5511 status = ocfs2_flush_truncate_log(osb); 5512 if (status < 0) 5513 mlog_errno(status); 5514 else 5515 ocfs2_init_inode_steal_slot(osb); 5516 5517 mlog_exit(status); 5518 } 5519 5520 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) 5521 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, 5522 int cancel) 5523 { 5524 if (osb->osb_tl_inode) { 5525 /* We want to push off log flushes while truncates are 5526 * still running. */ 5527 if (cancel) 5528 cancel_delayed_work(&osb->osb_truncate_log_wq); 5529 5530 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 5531 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 5532 } 5533 } 5534 5535 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, 5536 int slot_num, 5537 struct inode **tl_inode, 5538 struct buffer_head **tl_bh) 5539 { 5540 int status; 5541 struct inode *inode = NULL; 5542 struct buffer_head *bh = NULL; 5543 5544 inode = ocfs2_get_system_file_inode(osb, 5545 TRUNCATE_LOG_SYSTEM_INODE, 5546 slot_num); 5547 if (!inode) { 5548 status = -EINVAL; 5549 mlog(ML_ERROR, "Could not get load truncate log inode!\n"); 5550 goto bail; 5551 } 5552 5553 status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 5554 OCFS2_BH_CACHED, inode); 5555 if (status < 0) { 5556 iput(inode); 5557 mlog_errno(status); 5558 goto bail; 5559 } 5560 5561 *tl_inode = inode; 5562 *tl_bh = bh; 5563 bail: 5564 mlog_exit(status); 5565 return status; 5566 } 5567 5568 /* called during the 1st stage of node recovery. we stamp a clean 5569 * truncate log and pass back a copy for processing later. if the 5570 * truncate log does not require processing, a *tl_copy is set to 5571 * NULL. */ 5572 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, 5573 int slot_num, 5574 struct ocfs2_dinode **tl_copy) 5575 { 5576 int status; 5577 struct inode *tl_inode = NULL; 5578 struct buffer_head *tl_bh = NULL; 5579 struct ocfs2_dinode *di; 5580 struct ocfs2_truncate_log *tl; 5581 5582 *tl_copy = NULL; 5583 5584 mlog(0, "recover truncate log from slot %d\n", slot_num); 5585 5586 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); 5587 if (status < 0) { 5588 mlog_errno(status); 5589 goto bail; 5590 } 5591 5592 di = (struct ocfs2_dinode *) tl_bh->b_data; 5593 tl = &di->id2.i_dealloc; 5594 if (!OCFS2_IS_VALID_DINODE(di)) { 5595 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); 5596 status = -EIO; 5597 goto bail; 5598 } 5599 5600 if (le16_to_cpu(tl->tl_used)) { 5601 mlog(0, "We'll have %u logs to recover\n", 5602 le16_to_cpu(tl->tl_used)); 5603 5604 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); 5605 if (!(*tl_copy)) { 5606 status = -ENOMEM; 5607 mlog_errno(status); 5608 goto bail; 5609 } 5610 5611 /* Assuming the write-out below goes well, this copy 5612 * will be passed back to recovery for processing. */ 5613 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); 5614 5615 /* All we need to do to clear the truncate log is set 5616 * tl_used. */ 5617 tl->tl_used = 0; 5618 5619 status = ocfs2_write_block(osb, tl_bh, tl_inode); 5620 if (status < 0) { 5621 mlog_errno(status); 5622 goto bail; 5623 } 5624 } 5625 5626 bail: 5627 if (tl_inode) 5628 iput(tl_inode); 5629 if (tl_bh) 5630 brelse(tl_bh); 5631 5632 if (status < 0 && (*tl_copy)) { 5633 kfree(*tl_copy); 5634 *tl_copy = NULL; 5635 } 5636 5637 mlog_exit(status); 5638 return status; 5639 } 5640 5641 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 5642 struct ocfs2_dinode *tl_copy) 5643 { 5644 int status = 0; 5645 int i; 5646 unsigned int clusters, num_recs, start_cluster; 5647 u64 start_blk; 5648 handle_t *handle; 5649 struct inode *tl_inode = osb->osb_tl_inode; 5650 struct ocfs2_truncate_log *tl; 5651 5652 mlog_entry_void(); 5653 5654 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { 5655 mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); 5656 return -EINVAL; 5657 } 5658 5659 tl = &tl_copy->id2.i_dealloc; 5660 num_recs = le16_to_cpu(tl->tl_used); 5661 mlog(0, "cleanup %u records from %llu\n", num_recs, 5662 (unsigned long long)le64_to_cpu(tl_copy->i_blkno)); 5663 5664 mutex_lock(&tl_inode->i_mutex); 5665 for(i = 0; i < num_recs; i++) { 5666 if (ocfs2_truncate_log_needs_flush(osb)) { 5667 status = __ocfs2_flush_truncate_log(osb); 5668 if (status < 0) { 5669 mlog_errno(status); 5670 goto bail_up; 5671 } 5672 } 5673 5674 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 5675 if (IS_ERR(handle)) { 5676 status = PTR_ERR(handle); 5677 mlog_errno(status); 5678 goto bail_up; 5679 } 5680 5681 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); 5682 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); 5683 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); 5684 5685 status = ocfs2_truncate_log_append(osb, handle, 5686 start_blk, clusters); 5687 ocfs2_commit_trans(osb, handle); 5688 if (status < 0) { 5689 mlog_errno(status); 5690 goto bail_up; 5691 } 5692 } 5693 5694 bail_up: 5695 mutex_unlock(&tl_inode->i_mutex); 5696 5697 mlog_exit(status); 5698 return status; 5699 } 5700 5701 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) 5702 { 5703 int status; 5704 struct inode *tl_inode = osb->osb_tl_inode; 5705 5706 mlog_entry_void(); 5707 5708 if (tl_inode) { 5709 cancel_delayed_work(&osb->osb_truncate_log_wq); 5710 flush_workqueue(ocfs2_wq); 5711 5712 status = ocfs2_flush_truncate_log(osb); 5713 if (status < 0) 5714 mlog_errno(status); 5715 5716 brelse(osb->osb_tl_bh); 5717 iput(osb->osb_tl_inode); 5718 } 5719 5720 mlog_exit_void(); 5721 } 5722 5723 int ocfs2_truncate_log_init(struct ocfs2_super *osb) 5724 { 5725 int status; 5726 struct inode *tl_inode = NULL; 5727 struct buffer_head *tl_bh = NULL; 5728 5729 mlog_entry_void(); 5730 5731 status = ocfs2_get_truncate_log_info(osb, 5732 osb->slot_num, 5733 &tl_inode, 5734 &tl_bh); 5735 if (status < 0) 5736 mlog_errno(status); 5737 5738 /* ocfs2_truncate_log_shutdown keys on the existence of 5739 * osb->osb_tl_inode so we don't set any of the osb variables 5740 * until we're sure all is well. */ 5741 INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, 5742 ocfs2_truncate_log_worker); 5743 osb->osb_tl_bh = tl_bh; 5744 osb->osb_tl_inode = tl_inode; 5745 5746 mlog_exit(status); 5747 return status; 5748 } 5749 5750 /* 5751 * Delayed de-allocation of suballocator blocks. 5752 * 5753 * Some sets of block de-allocations might involve multiple suballocator inodes. 5754 * 5755 * The locking for this can get extremely complicated, especially when 5756 * the suballocator inodes to delete from aren't known until deep 5757 * within an unrelated codepath. 5758 * 5759 * ocfs2_extent_block structures are a good example of this - an inode 5760 * btree could have been grown by any number of nodes each allocating 5761 * out of their own suballoc inode. 5762 * 5763 * These structures allow the delay of block de-allocation until a 5764 * later time, when locking of multiple cluster inodes won't cause 5765 * deadlock. 5766 */ 5767 5768 /* 5769 * Describes a single block free from a suballocator 5770 */ 5771 struct ocfs2_cached_block_free { 5772 struct ocfs2_cached_block_free *free_next; 5773 u64 free_blk; 5774 unsigned int free_bit; 5775 }; 5776 5777 struct ocfs2_per_slot_free_list { 5778 struct ocfs2_per_slot_free_list *f_next_suballocator; 5779 int f_inode_type; 5780 int f_slot; 5781 struct ocfs2_cached_block_free *f_first; 5782 }; 5783 5784 static int ocfs2_free_cached_items(struct ocfs2_super *osb, 5785 int sysfile_type, 5786 int slot, 5787 struct ocfs2_cached_block_free *head) 5788 { 5789 int ret; 5790 u64 bg_blkno; 5791 handle_t *handle; 5792 struct inode *inode; 5793 struct buffer_head *di_bh = NULL; 5794 struct ocfs2_cached_block_free *tmp; 5795 5796 inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot); 5797 if (!inode) { 5798 ret = -EINVAL; 5799 mlog_errno(ret); 5800 goto out; 5801 } 5802 5803 mutex_lock(&inode->i_mutex); 5804 5805 ret = ocfs2_inode_lock(inode, &di_bh, 1); 5806 if (ret) { 5807 mlog_errno(ret); 5808 goto out_mutex; 5809 } 5810 5811 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); 5812 if (IS_ERR(handle)) { 5813 ret = PTR_ERR(handle); 5814 mlog_errno(ret); 5815 goto out_unlock; 5816 } 5817 5818 while (head) { 5819 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 5820 head->free_bit); 5821 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 5822 head->free_bit, (unsigned long long)head->free_blk); 5823 5824 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, 5825 head->free_bit, bg_blkno, 1); 5826 if (ret) { 5827 mlog_errno(ret); 5828 goto out_journal; 5829 } 5830 5831 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); 5832 if (ret) { 5833 mlog_errno(ret); 5834 goto out_journal; 5835 } 5836 5837 tmp = head; 5838 head = head->free_next; 5839 kfree(tmp); 5840 } 5841 5842 out_journal: 5843 ocfs2_commit_trans(osb, handle); 5844 5845 out_unlock: 5846 ocfs2_inode_unlock(inode, 1); 5847 brelse(di_bh); 5848 out_mutex: 5849 mutex_unlock(&inode->i_mutex); 5850 iput(inode); 5851 out: 5852 while(head) { 5853 /* Premature exit may have left some dangling items. */ 5854 tmp = head; 5855 head = head->free_next; 5856 kfree(tmp); 5857 } 5858 5859 return ret; 5860 } 5861 5862 int ocfs2_run_deallocs(struct ocfs2_super *osb, 5863 struct ocfs2_cached_dealloc_ctxt *ctxt) 5864 { 5865 int ret = 0, ret2; 5866 struct ocfs2_per_slot_free_list *fl; 5867 5868 if (!ctxt) 5869 return 0; 5870 5871 while (ctxt->c_first_suballocator) { 5872 fl = ctxt->c_first_suballocator; 5873 5874 if (fl->f_first) { 5875 mlog(0, "Free items: (type %u, slot %d)\n", 5876 fl->f_inode_type, fl->f_slot); 5877 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, 5878 fl->f_slot, fl->f_first); 5879 if (ret2) 5880 mlog_errno(ret2); 5881 if (!ret) 5882 ret = ret2; 5883 } 5884 5885 ctxt->c_first_suballocator = fl->f_next_suballocator; 5886 kfree(fl); 5887 } 5888 5889 return ret; 5890 } 5891 5892 static struct ocfs2_per_slot_free_list * 5893 ocfs2_find_per_slot_free_list(int type, 5894 int slot, 5895 struct ocfs2_cached_dealloc_ctxt *ctxt) 5896 { 5897 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; 5898 5899 while (fl) { 5900 if (fl->f_inode_type == type && fl->f_slot == slot) 5901 return fl; 5902 5903 fl = fl->f_next_suballocator; 5904 } 5905 5906 fl = kmalloc(sizeof(*fl), GFP_NOFS); 5907 if (fl) { 5908 fl->f_inode_type = type; 5909 fl->f_slot = slot; 5910 fl->f_first = NULL; 5911 fl->f_next_suballocator = ctxt->c_first_suballocator; 5912 5913 ctxt->c_first_suballocator = fl; 5914 } 5915 return fl; 5916 } 5917 5918 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 5919 int type, int slot, u64 blkno, 5920 unsigned int bit) 5921 { 5922 int ret; 5923 struct ocfs2_per_slot_free_list *fl; 5924 struct ocfs2_cached_block_free *item; 5925 5926 fl = ocfs2_find_per_slot_free_list(type, slot, ctxt); 5927 if (fl == NULL) { 5928 ret = -ENOMEM; 5929 mlog_errno(ret); 5930 goto out; 5931 } 5932 5933 item = kmalloc(sizeof(*item), GFP_NOFS); 5934 if (item == NULL) { 5935 ret = -ENOMEM; 5936 mlog_errno(ret); 5937 goto out; 5938 } 5939 5940 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 5941 type, slot, bit, (unsigned long long)blkno); 5942 5943 item->free_blk = blkno; 5944 item->free_bit = bit; 5945 item->free_next = fl->f_first; 5946 5947 fl->f_first = item; 5948 5949 ret = 0; 5950 out: 5951 return ret; 5952 } 5953 5954 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 5955 struct ocfs2_extent_block *eb) 5956 { 5957 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 5958 le16_to_cpu(eb->h_suballoc_slot), 5959 le64_to_cpu(eb->h_blkno), 5960 le16_to_cpu(eb->h_suballoc_bit)); 5961 } 5962 5963 /* This function will figure out whether the currently last extent 5964 * block will be deleted, and if it will, what the new last extent 5965 * block will be so we can update his h_next_leaf_blk field, as well 5966 * as the dinodes i_last_eb_blk */ 5967 static int ocfs2_find_new_last_ext_blk(struct inode *inode, 5968 unsigned int clusters_to_del, 5969 struct ocfs2_path *path, 5970 struct buffer_head **new_last_eb) 5971 { 5972 int next_free, ret = 0; 5973 u32 cpos; 5974 struct ocfs2_extent_rec *rec; 5975 struct ocfs2_extent_block *eb; 5976 struct ocfs2_extent_list *el; 5977 struct buffer_head *bh = NULL; 5978 5979 *new_last_eb = NULL; 5980 5981 /* we have no tree, so of course, no last_eb. */ 5982 if (!path->p_tree_depth) 5983 goto out; 5984 5985 /* trunc to zero special case - this makes tree_depth = 0 5986 * regardless of what it is. */ 5987 if (OCFS2_I(inode)->ip_clusters == clusters_to_del) 5988 goto out; 5989 5990 el = path_leaf_el(path); 5991 BUG_ON(!el->l_next_free_rec); 5992 5993 /* 5994 * Make sure that this extent list will actually be empty 5995 * after we clear away the data. We can shortcut out if 5996 * there's more than one non-empty extent in the 5997 * list. Otherwise, a check of the remaining extent is 5998 * necessary. 5999 */ 6000 next_free = le16_to_cpu(el->l_next_free_rec); 6001 rec = NULL; 6002 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 6003 if (next_free > 2) 6004 goto out; 6005 6006 /* We may have a valid extent in index 1, check it. */ 6007 if (next_free == 2) 6008 rec = &el->l_recs[1]; 6009 6010 /* 6011 * Fall through - no more nonempty extents, so we want 6012 * to delete this leaf. 6013 */ 6014 } else { 6015 if (next_free > 1) 6016 goto out; 6017 6018 rec = &el->l_recs[0]; 6019 } 6020 6021 if (rec) { 6022 /* 6023 * Check it we'll only be trimming off the end of this 6024 * cluster. 6025 */ 6026 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) 6027 goto out; 6028 } 6029 6030 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); 6031 if (ret) { 6032 mlog_errno(ret); 6033 goto out; 6034 } 6035 6036 ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); 6037 if (ret) { 6038 mlog_errno(ret); 6039 goto out; 6040 } 6041 6042 eb = (struct ocfs2_extent_block *) bh->b_data; 6043 el = &eb->h_list; 6044 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 6045 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 6046 ret = -EROFS; 6047 goto out; 6048 } 6049 6050 *new_last_eb = bh; 6051 get_bh(*new_last_eb); 6052 mlog(0, "returning block %llu, (cpos: %u)\n", 6053 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); 6054 out: 6055 brelse(bh); 6056 6057 return ret; 6058 } 6059 6060 /* 6061 * Trim some clusters off the rightmost edge of a tree. Only called 6062 * during truncate. 6063 * 6064 * The caller needs to: 6065 * - start journaling of each path component. 6066 * - compute and fully set up any new last ext block 6067 */ 6068 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, 6069 handle_t *handle, struct ocfs2_truncate_context *tc, 6070 u32 clusters_to_del, u64 *delete_start) 6071 { 6072 int ret, i, index = path->p_tree_depth; 6073 u32 new_edge = 0; 6074 u64 deleted_eb = 0; 6075 struct buffer_head *bh; 6076 struct ocfs2_extent_list *el; 6077 struct ocfs2_extent_rec *rec; 6078 6079 *delete_start = 0; 6080 6081 while (index >= 0) { 6082 bh = path->p_node[index].bh; 6083 el = path->p_node[index].el; 6084 6085 mlog(0, "traveling tree (index = %d, block = %llu)\n", 6086 index, (unsigned long long)bh->b_blocknr); 6087 6088 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 6089 6090 if (index != 6091 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { 6092 ocfs2_error(inode->i_sb, 6093 "Inode %lu has invalid ext. block %llu", 6094 inode->i_ino, 6095 (unsigned long long)bh->b_blocknr); 6096 ret = -EROFS; 6097 goto out; 6098 } 6099 6100 find_tail_record: 6101 i = le16_to_cpu(el->l_next_free_rec) - 1; 6102 rec = &el->l_recs[i]; 6103 6104 mlog(0, "Extent list before: record %d: (%u, %u, %llu), " 6105 "next = %u\n", i, le32_to_cpu(rec->e_cpos), 6106 ocfs2_rec_clusters(el, rec), 6107 (unsigned long long)le64_to_cpu(rec->e_blkno), 6108 le16_to_cpu(el->l_next_free_rec)); 6109 6110 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); 6111 6112 if (le16_to_cpu(el->l_tree_depth) == 0) { 6113 /* 6114 * If the leaf block contains a single empty 6115 * extent and no records, we can just remove 6116 * the block. 6117 */ 6118 if (i == 0 && ocfs2_is_empty_extent(rec)) { 6119 memset(rec, 0, 6120 sizeof(struct ocfs2_extent_rec)); 6121 el->l_next_free_rec = cpu_to_le16(0); 6122 6123 goto delete; 6124 } 6125 6126 /* 6127 * Remove any empty extents by shifting things 6128 * left. That should make life much easier on 6129 * the code below. This condition is rare 6130 * enough that we shouldn't see a performance 6131 * hit. 6132 */ 6133 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 6134 le16_add_cpu(&el->l_next_free_rec, -1); 6135 6136 for(i = 0; 6137 i < le16_to_cpu(el->l_next_free_rec); i++) 6138 el->l_recs[i] = el->l_recs[i + 1]; 6139 6140 memset(&el->l_recs[i], 0, 6141 sizeof(struct ocfs2_extent_rec)); 6142 6143 /* 6144 * We've modified our extent list. The 6145 * simplest way to handle this change 6146 * is to being the search from the 6147 * start again. 6148 */ 6149 goto find_tail_record; 6150 } 6151 6152 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); 6153 6154 /* 6155 * We'll use "new_edge" on our way back up the 6156 * tree to know what our rightmost cpos is. 6157 */ 6158 new_edge = le16_to_cpu(rec->e_leaf_clusters); 6159 new_edge += le32_to_cpu(rec->e_cpos); 6160 6161 /* 6162 * The caller will use this to delete data blocks. 6163 */ 6164 *delete_start = le64_to_cpu(rec->e_blkno) 6165 + ocfs2_clusters_to_blocks(inode->i_sb, 6166 le16_to_cpu(rec->e_leaf_clusters)); 6167 6168 /* 6169 * If it's now empty, remove this record. 6170 */ 6171 if (le16_to_cpu(rec->e_leaf_clusters) == 0) { 6172 memset(rec, 0, 6173 sizeof(struct ocfs2_extent_rec)); 6174 le16_add_cpu(&el->l_next_free_rec, -1); 6175 } 6176 } else { 6177 if (le64_to_cpu(rec->e_blkno) == deleted_eb) { 6178 memset(rec, 0, 6179 sizeof(struct ocfs2_extent_rec)); 6180 le16_add_cpu(&el->l_next_free_rec, -1); 6181 6182 goto delete; 6183 } 6184 6185 /* Can this actually happen? */ 6186 if (le16_to_cpu(el->l_next_free_rec) == 0) 6187 goto delete; 6188 6189 /* 6190 * We never actually deleted any clusters 6191 * because our leaf was empty. There's no 6192 * reason to adjust the rightmost edge then. 6193 */ 6194 if (new_edge == 0) 6195 goto delete; 6196 6197 rec->e_int_clusters = cpu_to_le32(new_edge); 6198 le32_add_cpu(&rec->e_int_clusters, 6199 -le32_to_cpu(rec->e_cpos)); 6200 6201 /* 6202 * A deleted child record should have been 6203 * caught above. 6204 */ 6205 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); 6206 } 6207 6208 delete: 6209 ret = ocfs2_journal_dirty(handle, bh); 6210 if (ret) { 6211 mlog_errno(ret); 6212 goto out; 6213 } 6214 6215 mlog(0, "extent list container %llu, after: record %d: " 6216 "(%u, %u, %llu), next = %u.\n", 6217 (unsigned long long)bh->b_blocknr, i, 6218 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), 6219 (unsigned long long)le64_to_cpu(rec->e_blkno), 6220 le16_to_cpu(el->l_next_free_rec)); 6221 6222 /* 6223 * We must be careful to only attempt delete of an 6224 * extent block (and not the root inode block). 6225 */ 6226 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { 6227 struct ocfs2_extent_block *eb = 6228 (struct ocfs2_extent_block *)bh->b_data; 6229 6230 /* 6231 * Save this for use when processing the 6232 * parent block. 6233 */ 6234 deleted_eb = le64_to_cpu(eb->h_blkno); 6235 6236 mlog(0, "deleting this extent block.\n"); 6237 6238 ocfs2_remove_from_cache(inode, bh); 6239 6240 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); 6241 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 6242 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); 6243 6244 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb); 6245 /* An error here is not fatal. */ 6246 if (ret < 0) 6247 mlog_errno(ret); 6248 } else { 6249 deleted_eb = 0; 6250 } 6251 6252 index--; 6253 } 6254 6255 ret = 0; 6256 out: 6257 return ret; 6258 } 6259 6260 static int ocfs2_do_truncate(struct ocfs2_super *osb, 6261 unsigned int clusters_to_del, 6262 struct inode *inode, 6263 struct buffer_head *fe_bh, 6264 handle_t *handle, 6265 struct ocfs2_truncate_context *tc, 6266 struct ocfs2_path *path) 6267 { 6268 int status; 6269 struct ocfs2_dinode *fe; 6270 struct ocfs2_extent_block *last_eb = NULL; 6271 struct ocfs2_extent_list *el; 6272 struct buffer_head *last_eb_bh = NULL; 6273 u64 delete_blk = 0; 6274 6275 fe = (struct ocfs2_dinode *) fe_bh->b_data; 6276 6277 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, 6278 path, &last_eb_bh); 6279 if (status < 0) { 6280 mlog_errno(status); 6281 goto bail; 6282 } 6283 6284 /* 6285 * Each component will be touched, so we might as well journal 6286 * here to avoid having to handle errors later. 6287 */ 6288 status = ocfs2_journal_access_path(inode, handle, path); 6289 if (status < 0) { 6290 mlog_errno(status); 6291 goto bail; 6292 } 6293 6294 if (last_eb_bh) { 6295 status = ocfs2_journal_access(handle, inode, last_eb_bh, 6296 OCFS2_JOURNAL_ACCESS_WRITE); 6297 if (status < 0) { 6298 mlog_errno(status); 6299 goto bail; 6300 } 6301 6302 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 6303 } 6304 6305 el = &(fe->id2.i_list); 6306 6307 /* 6308 * Lower levels depend on this never happening, but it's best 6309 * to check it up here before changing the tree. 6310 */ 6311 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { 6312 ocfs2_error(inode->i_sb, 6313 "Inode %lu has an empty extent record, depth %u\n", 6314 inode->i_ino, le16_to_cpu(el->l_tree_depth)); 6315 status = -EROFS; 6316 goto bail; 6317 } 6318 6319 spin_lock(&OCFS2_I(inode)->ip_lock); 6320 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6321 clusters_to_del; 6322 spin_unlock(&OCFS2_I(inode)->ip_lock); 6323 le32_add_cpu(&fe->i_clusters, -clusters_to_del); 6324 inode->i_blocks = ocfs2_inode_sector_count(inode); 6325 6326 status = ocfs2_trim_tree(inode, path, handle, tc, 6327 clusters_to_del, &delete_blk); 6328 if (status) { 6329 mlog_errno(status); 6330 goto bail; 6331 } 6332 6333 if (le32_to_cpu(fe->i_clusters) == 0) { 6334 /* trunc to zero is a special case. */ 6335 el->l_tree_depth = 0; 6336 fe->i_last_eb_blk = 0; 6337 } else if (last_eb) 6338 fe->i_last_eb_blk = last_eb->h_blkno; 6339 6340 status = ocfs2_journal_dirty(handle, fe_bh); 6341 if (status < 0) { 6342 mlog_errno(status); 6343 goto bail; 6344 } 6345 6346 if (last_eb) { 6347 /* If there will be a new last extent block, then by 6348 * definition, there cannot be any leaves to the right of 6349 * him. */ 6350 last_eb->h_next_leaf_blk = 0; 6351 status = ocfs2_journal_dirty(handle, last_eb_bh); 6352 if (status < 0) { 6353 mlog_errno(status); 6354 goto bail; 6355 } 6356 } 6357 6358 if (delete_blk) { 6359 status = ocfs2_truncate_log_append(osb, handle, delete_blk, 6360 clusters_to_del); 6361 if (status < 0) { 6362 mlog_errno(status); 6363 goto bail; 6364 } 6365 } 6366 status = 0; 6367 bail: 6368 6369 mlog_exit(status); 6370 return status; 6371 } 6372 6373 static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) 6374 { 6375 set_buffer_uptodate(bh); 6376 mark_buffer_dirty(bh); 6377 return 0; 6378 } 6379 6380 static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) 6381 { 6382 set_buffer_uptodate(bh); 6383 mark_buffer_dirty(bh); 6384 return ocfs2_journal_dirty_data(handle, bh); 6385 } 6386 6387 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, 6388 unsigned int from, unsigned int to, 6389 struct page *page, int zero, u64 *phys) 6390 { 6391 int ret, partial = 0; 6392 6393 ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); 6394 if (ret) 6395 mlog_errno(ret); 6396 6397 if (zero) 6398 zero_user_segment(page, from, to); 6399 6400 /* 6401 * Need to set the buffers we zero'd into uptodate 6402 * here if they aren't - ocfs2_map_page_blocks() 6403 * might've skipped some 6404 */ 6405 if (ocfs2_should_order_data(inode)) { 6406 ret = walk_page_buffers(handle, 6407 page_buffers(page), 6408 from, to, &partial, 6409 ocfs2_ordered_zero_func); 6410 if (ret < 0) 6411 mlog_errno(ret); 6412 } else { 6413 ret = walk_page_buffers(handle, page_buffers(page), 6414 from, to, &partial, 6415 ocfs2_writeback_zero_func); 6416 if (ret < 0) 6417 mlog_errno(ret); 6418 } 6419 6420 if (!partial) 6421 SetPageUptodate(page); 6422 6423 flush_dcache_page(page); 6424 } 6425 6426 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, 6427 loff_t end, struct page **pages, 6428 int numpages, u64 phys, handle_t *handle) 6429 { 6430 int i; 6431 struct page *page; 6432 unsigned int from, to = PAGE_CACHE_SIZE; 6433 struct super_block *sb = inode->i_sb; 6434 6435 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 6436 6437 if (numpages == 0) 6438 goto out; 6439 6440 to = PAGE_CACHE_SIZE; 6441 for(i = 0; i < numpages; i++) { 6442 page = pages[i]; 6443 6444 from = start & (PAGE_CACHE_SIZE - 1); 6445 if ((end >> PAGE_CACHE_SHIFT) == page->index) 6446 to = end & (PAGE_CACHE_SIZE - 1); 6447 6448 BUG_ON(from > PAGE_CACHE_SIZE); 6449 BUG_ON(to > PAGE_CACHE_SIZE); 6450 6451 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, 6452 &phys); 6453 6454 start = (page->index + 1) << PAGE_CACHE_SHIFT; 6455 } 6456 out: 6457 if (pages) 6458 ocfs2_unlock_and_free_pages(pages, numpages); 6459 } 6460 6461 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, 6462 struct page **pages, int *num) 6463 { 6464 int numpages, ret = 0; 6465 struct super_block *sb = inode->i_sb; 6466 struct address_space *mapping = inode->i_mapping; 6467 unsigned long index; 6468 loff_t last_page_bytes; 6469 6470 BUG_ON(start > end); 6471 6472 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != 6473 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); 6474 6475 numpages = 0; 6476 last_page_bytes = PAGE_ALIGN(end); 6477 index = start >> PAGE_CACHE_SHIFT; 6478 do { 6479 pages[numpages] = grab_cache_page(mapping, index); 6480 if (!pages[numpages]) { 6481 ret = -ENOMEM; 6482 mlog_errno(ret); 6483 goto out; 6484 } 6485 6486 numpages++; 6487 index++; 6488 } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); 6489 6490 out: 6491 if (ret != 0) { 6492 if (pages) 6493 ocfs2_unlock_and_free_pages(pages, numpages); 6494 numpages = 0; 6495 } 6496 6497 *num = numpages; 6498 6499 return ret; 6500 } 6501 6502 /* 6503 * Zero the area past i_size but still within an allocated 6504 * cluster. This avoids exposing nonzero data on subsequent file 6505 * extends. 6506 * 6507 * We need to call this before i_size is updated on the inode because 6508 * otherwise block_write_full_page() will skip writeout of pages past 6509 * i_size. The new_i_size parameter is passed for this reason. 6510 */ 6511 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, 6512 u64 range_start, u64 range_end) 6513 { 6514 int ret = 0, numpages; 6515 struct page **pages = NULL; 6516 u64 phys; 6517 unsigned int ext_flags; 6518 struct super_block *sb = inode->i_sb; 6519 6520 /* 6521 * File systems which don't support sparse files zero on every 6522 * extend. 6523 */ 6524 if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) 6525 return 0; 6526 6527 pages = kcalloc(ocfs2_pages_per_cluster(sb), 6528 sizeof(struct page *), GFP_NOFS); 6529 if (pages == NULL) { 6530 ret = -ENOMEM; 6531 mlog_errno(ret); 6532 goto out; 6533 } 6534 6535 if (range_start == range_end) 6536 goto out; 6537 6538 ret = ocfs2_extent_map_get_blocks(inode, 6539 range_start >> sb->s_blocksize_bits, 6540 &phys, NULL, &ext_flags); 6541 if (ret) { 6542 mlog_errno(ret); 6543 goto out; 6544 } 6545 6546 /* 6547 * Tail is a hole, or is marked unwritten. In either case, we 6548 * can count on read and write to return/push zero's. 6549 */ 6550 if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN) 6551 goto out; 6552 6553 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, 6554 &numpages); 6555 if (ret) { 6556 mlog_errno(ret); 6557 goto out; 6558 } 6559 6560 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, 6561 numpages, phys, handle); 6562 6563 /* 6564 * Initiate writeout of the pages we zero'd here. We don't 6565 * wait on them - the truncate_inode_pages() call later will 6566 * do that for us. 6567 */ 6568 ret = do_sync_mapping_range(inode->i_mapping, range_start, 6569 range_end - 1, SYNC_FILE_RANGE_WRITE); 6570 if (ret) 6571 mlog_errno(ret); 6572 6573 out: 6574 if (pages) 6575 kfree(pages); 6576 6577 return ret; 6578 } 6579 6580 static void ocfs2_zero_dinode_id2(struct inode *inode, struct ocfs2_dinode *di) 6581 { 6582 unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; 6583 6584 memset(&di->id2, 0, blocksize - offsetof(struct ocfs2_dinode, id2)); 6585 } 6586 6587 void ocfs2_dinode_new_extent_list(struct inode *inode, 6588 struct ocfs2_dinode *di) 6589 { 6590 ocfs2_zero_dinode_id2(inode, di); 6591 di->id2.i_list.l_tree_depth = 0; 6592 di->id2.i_list.l_next_free_rec = 0; 6593 di->id2.i_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_inode(inode->i_sb)); 6594 } 6595 6596 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) 6597 { 6598 struct ocfs2_inode_info *oi = OCFS2_I(inode); 6599 struct ocfs2_inline_data *idata = &di->id2.i_data; 6600 6601 spin_lock(&oi->ip_lock); 6602 oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL; 6603 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 6604 spin_unlock(&oi->ip_lock); 6605 6606 /* 6607 * We clear the entire i_data structure here so that all 6608 * fields can be properly initialized. 6609 */ 6610 ocfs2_zero_dinode_id2(inode, di); 6611 6612 idata->id_count = cpu_to_le16(ocfs2_max_inline_data(inode->i_sb)); 6613 } 6614 6615 int ocfs2_convert_inline_data_to_extents(struct inode *inode, 6616 struct buffer_head *di_bh) 6617 { 6618 int ret, i, has_data, num_pages = 0; 6619 handle_t *handle; 6620 u64 uninitialized_var(block); 6621 struct ocfs2_inode_info *oi = OCFS2_I(inode); 6622 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6623 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 6624 struct ocfs2_alloc_context *data_ac = NULL; 6625 struct page **pages = NULL; 6626 loff_t end = osb->s_clustersize; 6627 6628 has_data = i_size_read(inode) ? 1 : 0; 6629 6630 if (has_data) { 6631 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb), 6632 sizeof(struct page *), GFP_NOFS); 6633 if (pages == NULL) { 6634 ret = -ENOMEM; 6635 mlog_errno(ret); 6636 goto out; 6637 } 6638 6639 ret = ocfs2_reserve_clusters(osb, 1, &data_ac); 6640 if (ret) { 6641 mlog_errno(ret); 6642 goto out; 6643 } 6644 } 6645 6646 handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS); 6647 if (IS_ERR(handle)) { 6648 ret = PTR_ERR(handle); 6649 mlog_errno(ret); 6650 goto out_unlock; 6651 } 6652 6653 ret = ocfs2_journal_access(handle, inode, di_bh, 6654 OCFS2_JOURNAL_ACCESS_WRITE); 6655 if (ret) { 6656 mlog_errno(ret); 6657 goto out_commit; 6658 } 6659 6660 if (has_data) { 6661 u32 bit_off, num; 6662 unsigned int page_end; 6663 u64 phys; 6664 6665 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6666 &num); 6667 if (ret) { 6668 mlog_errno(ret); 6669 goto out_commit; 6670 } 6671 6672 /* 6673 * Save two copies, one for insert, and one that can 6674 * be changed by ocfs2_map_and_dirty_page() below. 6675 */ 6676 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); 6677 6678 /* 6679 * Non sparse file systems zero on extend, so no need 6680 * to do that now. 6681 */ 6682 if (!ocfs2_sparse_alloc(osb) && 6683 PAGE_CACHE_SIZE < osb->s_clustersize) 6684 end = PAGE_CACHE_SIZE; 6685 6686 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); 6687 if (ret) { 6688 mlog_errno(ret); 6689 goto out_commit; 6690 } 6691 6692 /* 6693 * This should populate the 1st page for us and mark 6694 * it up to date. 6695 */ 6696 ret = ocfs2_read_inline_data(inode, pages[0], di_bh); 6697 if (ret) { 6698 mlog_errno(ret); 6699 goto out_commit; 6700 } 6701 6702 page_end = PAGE_CACHE_SIZE; 6703 if (PAGE_CACHE_SIZE > osb->s_clustersize) 6704 page_end = osb->s_clustersize; 6705 6706 for (i = 0; i < num_pages; i++) 6707 ocfs2_map_and_dirty_page(inode, handle, 0, page_end, 6708 pages[i], i > 0, &phys); 6709 } 6710 6711 spin_lock(&oi->ip_lock); 6712 oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL; 6713 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 6714 spin_unlock(&oi->ip_lock); 6715 6716 ocfs2_dinode_new_extent_list(inode, di); 6717 6718 ocfs2_journal_dirty(handle, di_bh); 6719 6720 if (has_data) { 6721 /* 6722 * An error at this point should be extremely rare. If 6723 * this proves to be false, we could always re-build 6724 * the in-inode data from our pages. 6725 */ 6726 ret = ocfs2_dinode_insert_extent(osb, handle, inode, di_bh, 6727 0, block, 1, 0, NULL); 6728 if (ret) { 6729 mlog_errno(ret); 6730 goto out_commit; 6731 } 6732 6733 inode->i_blocks = ocfs2_inode_sector_count(inode); 6734 } 6735 6736 out_commit: 6737 ocfs2_commit_trans(osb, handle); 6738 6739 out_unlock: 6740 if (data_ac) 6741 ocfs2_free_alloc_context(data_ac); 6742 6743 out: 6744 if (pages) { 6745 ocfs2_unlock_and_free_pages(pages, num_pages); 6746 kfree(pages); 6747 } 6748 6749 return ret; 6750 } 6751 6752 /* 6753 * It is expected, that by the time you call this function, 6754 * inode->i_size and fe->i_size have been adjusted. 6755 * 6756 * WARNING: This will kfree the truncate context 6757 */ 6758 int ocfs2_commit_truncate(struct ocfs2_super *osb, 6759 struct inode *inode, 6760 struct buffer_head *fe_bh, 6761 struct ocfs2_truncate_context *tc) 6762 { 6763 int status, i, credits, tl_sem = 0; 6764 u32 clusters_to_del, new_highest_cpos, range; 6765 struct ocfs2_extent_list *el; 6766 handle_t *handle = NULL; 6767 struct inode *tl_inode = osb->osb_tl_inode; 6768 struct ocfs2_path *path = NULL; 6769 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 6770 6771 mlog_entry_void(); 6772 6773 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 6774 i_size_read(inode)); 6775 6776 path = ocfs2_new_path(fe_bh, &di->id2.i_list); 6777 if (!path) { 6778 status = -ENOMEM; 6779 mlog_errno(status); 6780 goto bail; 6781 } 6782 6783 ocfs2_extent_map_trunc(inode, new_highest_cpos); 6784 6785 start: 6786 /* 6787 * Check that we still have allocation to delete. 6788 */ 6789 if (OCFS2_I(inode)->ip_clusters == 0) { 6790 status = 0; 6791 goto bail; 6792 } 6793 6794 /* 6795 * Truncate always works against the rightmost tree branch. 6796 */ 6797 status = ocfs2_find_path(inode, path, UINT_MAX); 6798 if (status) { 6799 mlog_errno(status); 6800 goto bail; 6801 } 6802 6803 mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", 6804 OCFS2_I(inode)->ip_clusters, path->p_tree_depth); 6805 6806 /* 6807 * By now, el will point to the extent list on the bottom most 6808 * portion of this tree. Only the tail record is considered in 6809 * each pass. 6810 * 6811 * We handle the following cases, in order: 6812 * - empty extent: delete the remaining branch 6813 * - remove the entire record 6814 * - remove a partial record 6815 * - no record needs to be removed (truncate has completed) 6816 */ 6817 el = path_leaf_el(path); 6818 if (le16_to_cpu(el->l_next_free_rec) == 0) { 6819 ocfs2_error(inode->i_sb, 6820 "Inode %llu has empty extent block at %llu\n", 6821 (unsigned long long)OCFS2_I(inode)->ip_blkno, 6822 (unsigned long long)path_leaf_bh(path)->b_blocknr); 6823 status = -EROFS; 6824 goto bail; 6825 } 6826 6827 i = le16_to_cpu(el->l_next_free_rec) - 1; 6828 range = le32_to_cpu(el->l_recs[i].e_cpos) + 6829 ocfs2_rec_clusters(el, &el->l_recs[i]); 6830 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 6831 clusters_to_del = 0; 6832 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 6833 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 6834 } else if (range > new_highest_cpos) { 6835 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 6836 le32_to_cpu(el->l_recs[i].e_cpos)) - 6837 new_highest_cpos; 6838 } else { 6839 status = 0; 6840 goto bail; 6841 } 6842 6843 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 6844 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); 6845 6846 mutex_lock(&tl_inode->i_mutex); 6847 tl_sem = 1; 6848 /* ocfs2_truncate_log_needs_flush guarantees us at least one 6849 * record is free for use. If there isn't any, we flush to get 6850 * an empty truncate log. */ 6851 if (ocfs2_truncate_log_needs_flush(osb)) { 6852 status = __ocfs2_flush_truncate_log(osb); 6853 if (status < 0) { 6854 mlog_errno(status); 6855 goto bail; 6856 } 6857 } 6858 6859 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 6860 (struct ocfs2_dinode *)fe_bh->b_data, 6861 el); 6862 handle = ocfs2_start_trans(osb, credits); 6863 if (IS_ERR(handle)) { 6864 status = PTR_ERR(handle); 6865 handle = NULL; 6866 mlog_errno(status); 6867 goto bail; 6868 } 6869 6870 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, 6871 tc, path); 6872 if (status < 0) { 6873 mlog_errno(status); 6874 goto bail; 6875 } 6876 6877 mutex_unlock(&tl_inode->i_mutex); 6878 tl_sem = 0; 6879 6880 ocfs2_commit_trans(osb, handle); 6881 handle = NULL; 6882 6883 ocfs2_reinit_path(path, 1); 6884 6885 /* 6886 * The check above will catch the case where we've truncated 6887 * away all allocation. 6888 */ 6889 goto start; 6890 6891 bail: 6892 6893 ocfs2_schedule_truncate_log_flush(osb, 1); 6894 6895 if (tl_sem) 6896 mutex_unlock(&tl_inode->i_mutex); 6897 6898 if (handle) 6899 ocfs2_commit_trans(osb, handle); 6900 6901 ocfs2_run_deallocs(osb, &tc->tc_dealloc); 6902 6903 ocfs2_free_path(path); 6904 6905 /* This will drop the ext_alloc cluster lock for us */ 6906 ocfs2_free_truncate_context(tc); 6907 6908 mlog_exit(status); 6909 return status; 6910 } 6911 6912 /* 6913 * Expects the inode to already be locked. 6914 */ 6915 int ocfs2_prepare_truncate(struct ocfs2_super *osb, 6916 struct inode *inode, 6917 struct buffer_head *fe_bh, 6918 struct ocfs2_truncate_context **tc) 6919 { 6920 int status; 6921 unsigned int new_i_clusters; 6922 struct ocfs2_dinode *fe; 6923 struct ocfs2_extent_block *eb; 6924 struct buffer_head *last_eb_bh = NULL; 6925 6926 mlog_entry_void(); 6927 6928 *tc = NULL; 6929 6930 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, 6931 i_size_read(inode)); 6932 fe = (struct ocfs2_dinode *) fe_bh->b_data; 6933 6934 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" 6935 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters, 6936 (unsigned long long)le64_to_cpu(fe->i_size)); 6937 6938 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); 6939 if (!(*tc)) { 6940 status = -ENOMEM; 6941 mlog_errno(status); 6942 goto bail; 6943 } 6944 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 6945 6946 if (fe->id2.i_list.l_tree_depth) { 6947 status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 6948 &last_eb_bh, OCFS2_BH_CACHED, inode); 6949 if (status < 0) { 6950 mlog_errno(status); 6951 goto bail; 6952 } 6953 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 6954 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 6955 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 6956 6957 brelse(last_eb_bh); 6958 status = -EIO; 6959 goto bail; 6960 } 6961 } 6962 6963 (*tc)->tc_last_eb_bh = last_eb_bh; 6964 6965 status = 0; 6966 bail: 6967 if (status < 0) { 6968 if (*tc) 6969 ocfs2_free_truncate_context(*tc); 6970 *tc = NULL; 6971 } 6972 mlog_exit_void(); 6973 return status; 6974 } 6975 6976 /* 6977 * 'start' is inclusive, 'end' is not. 6978 */ 6979 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 6980 unsigned int start, unsigned int end, int trunc) 6981 { 6982 int ret; 6983 unsigned int numbytes; 6984 handle_t *handle; 6985 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6986 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 6987 struct ocfs2_inline_data *idata = &di->id2.i_data; 6988 6989 if (end > i_size_read(inode)) 6990 end = i_size_read(inode); 6991 6992 BUG_ON(start >= end); 6993 6994 if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || 6995 !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || 6996 !ocfs2_supports_inline_data(osb)) { 6997 ocfs2_error(inode->i_sb, 6998 "Inline data flags for inode %llu don't agree! " 6999 "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", 7000 (unsigned long long)OCFS2_I(inode)->ip_blkno, 7001 le16_to_cpu(di->i_dyn_features), 7002 OCFS2_I(inode)->ip_dyn_features, 7003 osb->s_feature_incompat); 7004 ret = -EROFS; 7005 goto out; 7006 } 7007 7008 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 7009 if (IS_ERR(handle)) { 7010 ret = PTR_ERR(handle); 7011 mlog_errno(ret); 7012 goto out; 7013 } 7014 7015 ret = ocfs2_journal_access(handle, inode, di_bh, 7016 OCFS2_JOURNAL_ACCESS_WRITE); 7017 if (ret) { 7018 mlog_errno(ret); 7019 goto out_commit; 7020 } 7021 7022 numbytes = end - start; 7023 memset(idata->id_data + start, 0, numbytes); 7024 7025 /* 7026 * No need to worry about the data page here - it's been 7027 * truncated already and inline data doesn't need it for 7028 * pushing zero's to disk, so we'll let readpage pick it up 7029 * later. 7030 */ 7031 if (trunc) { 7032 i_size_write(inode, start); 7033 di->i_size = cpu_to_le64(start); 7034 } 7035 7036 inode->i_blocks = ocfs2_inode_sector_count(inode); 7037 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 7038 7039 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 7040 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 7041 7042 ocfs2_journal_dirty(handle, di_bh); 7043 7044 out_commit: 7045 ocfs2_commit_trans(osb, handle); 7046 7047 out: 7048 return ret; 7049 } 7050 7051 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 7052 { 7053 /* 7054 * The caller is responsible for completing deallocation 7055 * before freeing the context. 7056 */ 7057 if (tc->tc_dealloc.c_first_suballocator != NULL) 7058 mlog(ML_NOTICE, 7059 "Truncate completion has non-empty dealloc context\n"); 7060 7061 if (tc->tc_last_eb_bh) 7062 brelse(tc->tc_last_eb_bh); 7063 7064 kfree(tc); 7065 } 7066