1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * alloc.c 5 * 6 * Extent allocs and frees 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/fs.h> 27 #include <linux/types.h> 28 #include <linux/slab.h> 29 #include <linux/highmem.h> 30 #include <linux/swap.h> 31 32 #define MLOG_MASK_PREFIX ML_DISK_ALLOC 33 #include <cluster/masklog.h> 34 35 #include "ocfs2.h" 36 37 #include "alloc.h" 38 #include "aops.h" 39 #include "dlmglue.h" 40 #include "extent_map.h" 41 #include "inode.h" 42 #include "journal.h" 43 #include "localalloc.h" 44 #include "suballoc.h" 45 #include "sysfile.h" 46 #include "file.h" 47 #include "super.h" 48 #include "uptodate.h" 49 50 #include "buffer_head_io.h" 51 52 53 /* 54 * Operations for a specific extent tree type. 55 * 56 * To implement an on-disk btree (extent tree) type in ocfs2, add 57 * an ocfs2_extent_tree_operations structure and the matching 58 * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it 59 * for the allocation portion of the extent tree. 60 */ 61 struct ocfs2_extent_tree_operations { 62 /* 63 * last_eb_blk is the block number of the right most leaf extent 64 * block. Most on-disk structures containing an extent tree store 65 * this value for fast access. The ->eo_set_last_eb_blk() and 66 * ->eo_get_last_eb_blk() operations access this value. They are 67 * both required. 68 */ 69 void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et, 70 u64 blkno); 71 u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et); 72 73 /* 74 * The on-disk structure usually keeps track of how many total 75 * clusters are stored in this extent tree. This function updates 76 * that value. new_clusters is the delta, and must be 77 * added to the total. Required. 78 */ 79 void (*eo_update_clusters)(struct inode *inode, 80 struct ocfs2_extent_tree *et, 81 u32 new_clusters); 82 83 /* 84 * If ->eo_insert_check() exists, it is called before rec is 85 * inserted into the extent tree. It is optional. 86 */ 87 int (*eo_insert_check)(struct inode *inode, 88 struct ocfs2_extent_tree *et, 89 struct ocfs2_extent_rec *rec); 90 int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et); 91 92 /* 93 * -------------------------------------------------------------- 94 * The remaining are internal to ocfs2_extent_tree and don't have 95 * accessor functions 96 */ 97 98 /* 99 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el. 100 * It is required. 101 */ 102 void (*eo_fill_root_el)(struct ocfs2_extent_tree *et); 103 104 /* 105 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if 106 * it exists. If it does not, et->et_max_leaf_clusters is set 107 * to 0 (unlimited). Optional. 108 */ 109 void (*eo_fill_max_leaf_clusters)(struct inode *inode, 110 struct ocfs2_extent_tree *et); 111 }; 112 113 114 /* 115 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check 116 * in the methods. 117 */ 118 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); 119 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, 120 u64 blkno); 121 static void ocfs2_dinode_update_clusters(struct inode *inode, 122 struct ocfs2_extent_tree *et, 123 u32 clusters); 124 static int ocfs2_dinode_insert_check(struct inode *inode, 125 struct ocfs2_extent_tree *et, 126 struct ocfs2_extent_rec *rec); 127 static int ocfs2_dinode_sanity_check(struct inode *inode, 128 struct ocfs2_extent_tree *et); 129 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); 130 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { 131 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, 132 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, 133 .eo_update_clusters = ocfs2_dinode_update_clusters, 134 .eo_insert_check = ocfs2_dinode_insert_check, 135 .eo_sanity_check = ocfs2_dinode_sanity_check, 136 .eo_fill_root_el = ocfs2_dinode_fill_root_el, 137 }; 138 139 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, 140 u64 blkno) 141 { 142 struct ocfs2_dinode *di = et->et_object; 143 144 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); 145 di->i_last_eb_blk = cpu_to_le64(blkno); 146 } 147 148 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) 149 { 150 struct ocfs2_dinode *di = et->et_object; 151 152 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); 153 return le64_to_cpu(di->i_last_eb_blk); 154 } 155 156 static void ocfs2_dinode_update_clusters(struct inode *inode, 157 struct ocfs2_extent_tree *et, 158 u32 clusters) 159 { 160 struct ocfs2_dinode *di = et->et_object; 161 162 le32_add_cpu(&di->i_clusters, clusters); 163 spin_lock(&OCFS2_I(inode)->ip_lock); 164 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); 165 spin_unlock(&OCFS2_I(inode)->ip_lock); 166 } 167 168 static int ocfs2_dinode_insert_check(struct inode *inode, 169 struct ocfs2_extent_tree *et, 170 struct ocfs2_extent_rec *rec) 171 { 172 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 173 174 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); 175 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 176 (OCFS2_I(inode)->ip_clusters != rec->e_cpos), 177 "Device %s, asking for sparse allocation: inode %llu, " 178 "cpos %u, clusters %u\n", 179 osb->dev_str, 180 (unsigned long long)OCFS2_I(inode)->ip_blkno, 181 rec->e_cpos, 182 OCFS2_I(inode)->ip_clusters); 183 184 return 0; 185 } 186 187 static int ocfs2_dinode_sanity_check(struct inode *inode, 188 struct ocfs2_extent_tree *et) 189 { 190 int ret = 0; 191 struct ocfs2_dinode *di; 192 193 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); 194 195 di = et->et_object; 196 if (!OCFS2_IS_VALID_DINODE(di)) { 197 ret = -EIO; 198 ocfs2_error(inode->i_sb, 199 "Inode %llu has invalid path root", 200 (unsigned long long)OCFS2_I(inode)->ip_blkno); 201 } 202 203 return ret; 204 } 205 206 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) 207 { 208 struct ocfs2_dinode *di = et->et_object; 209 210 et->et_root_el = &di->id2.i_list; 211 } 212 213 214 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) 215 { 216 struct ocfs2_xattr_value_root *xv = et->et_object; 217 218 et->et_root_el = &xv->xr_list; 219 } 220 221 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, 222 u64 blkno) 223 { 224 struct ocfs2_xattr_value_root *xv = 225 (struct ocfs2_xattr_value_root *)et->et_object; 226 227 xv->xr_last_eb_blk = cpu_to_le64(blkno); 228 } 229 230 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) 231 { 232 struct ocfs2_xattr_value_root *xv = 233 (struct ocfs2_xattr_value_root *) et->et_object; 234 235 return le64_to_cpu(xv->xr_last_eb_blk); 236 } 237 238 static void ocfs2_xattr_value_update_clusters(struct inode *inode, 239 struct ocfs2_extent_tree *et, 240 u32 clusters) 241 { 242 struct ocfs2_xattr_value_root *xv = 243 (struct ocfs2_xattr_value_root *)et->et_object; 244 245 le32_add_cpu(&xv->xr_clusters, clusters); 246 } 247 248 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { 249 .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, 250 .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, 251 .eo_update_clusters = ocfs2_xattr_value_update_clusters, 252 .eo_fill_root_el = ocfs2_xattr_value_fill_root_el, 253 }; 254 255 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) 256 { 257 struct ocfs2_xattr_block *xb = et->et_object; 258 259 et->et_root_el = &xb->xb_attrs.xb_root.xt_list; 260 } 261 262 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode, 263 struct ocfs2_extent_tree *et) 264 { 265 et->et_max_leaf_clusters = 266 ocfs2_clusters_for_bytes(inode->i_sb, 267 OCFS2_MAX_XATTR_TREE_LEAF_SIZE); 268 } 269 270 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, 271 u64 blkno) 272 { 273 struct ocfs2_xattr_block *xb = et->et_object; 274 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; 275 276 xt->xt_last_eb_blk = cpu_to_le64(blkno); 277 } 278 279 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) 280 { 281 struct ocfs2_xattr_block *xb = et->et_object; 282 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; 283 284 return le64_to_cpu(xt->xt_last_eb_blk); 285 } 286 287 static void ocfs2_xattr_tree_update_clusters(struct inode *inode, 288 struct ocfs2_extent_tree *et, 289 u32 clusters) 290 { 291 struct ocfs2_xattr_block *xb = et->et_object; 292 293 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); 294 } 295 296 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { 297 .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, 298 .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, 299 .eo_update_clusters = ocfs2_xattr_tree_update_clusters, 300 .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el, 301 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, 302 }; 303 304 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 305 struct inode *inode, 306 struct buffer_head *bh, 307 void *obj, 308 struct ocfs2_extent_tree_operations *ops) 309 { 310 et->et_ops = ops; 311 et->et_root_bh = bh; 312 if (!obj) 313 obj = (void *)bh->b_data; 314 et->et_object = obj; 315 316 et->et_ops->eo_fill_root_el(et); 317 if (!et->et_ops->eo_fill_max_leaf_clusters) 318 et->et_max_leaf_clusters = 0; 319 else 320 et->et_ops->eo_fill_max_leaf_clusters(inode, et); 321 } 322 323 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, 324 struct inode *inode, 325 struct buffer_head *bh) 326 { 327 __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); 328 } 329 330 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 331 struct inode *inode, 332 struct buffer_head *bh) 333 { 334 __ocfs2_init_extent_tree(et, inode, bh, NULL, 335 &ocfs2_xattr_tree_et_ops); 336 } 337 338 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 339 struct inode *inode, 340 struct buffer_head *bh, 341 struct ocfs2_xattr_value_root *xv) 342 { 343 __ocfs2_init_extent_tree(et, inode, bh, xv, 344 &ocfs2_xattr_value_et_ops); 345 } 346 347 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 348 u64 new_last_eb_blk) 349 { 350 et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk); 351 } 352 353 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) 354 { 355 return et->et_ops->eo_get_last_eb_blk(et); 356 } 357 358 static inline void ocfs2_et_update_clusters(struct inode *inode, 359 struct ocfs2_extent_tree *et, 360 u32 clusters) 361 { 362 et->et_ops->eo_update_clusters(inode, et, clusters); 363 } 364 365 static inline int ocfs2_et_insert_check(struct inode *inode, 366 struct ocfs2_extent_tree *et, 367 struct ocfs2_extent_rec *rec) 368 { 369 int ret = 0; 370 371 if (et->et_ops->eo_insert_check) 372 ret = et->et_ops->eo_insert_check(inode, et, rec); 373 return ret; 374 } 375 376 static inline int ocfs2_et_sanity_check(struct inode *inode, 377 struct ocfs2_extent_tree *et) 378 { 379 int ret = 0; 380 381 if (et->et_ops->eo_sanity_check) 382 ret = et->et_ops->eo_sanity_check(inode, et); 383 return ret; 384 } 385 386 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 387 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 388 struct ocfs2_extent_block *eb); 389 390 /* 391 * Structures which describe a path through a btree, and functions to 392 * manipulate them. 393 * 394 * The idea here is to be as generic as possible with the tree 395 * manipulation code. 396 */ 397 struct ocfs2_path_item { 398 struct buffer_head *bh; 399 struct ocfs2_extent_list *el; 400 }; 401 402 #define OCFS2_MAX_PATH_DEPTH 5 403 404 struct ocfs2_path { 405 int p_tree_depth; 406 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; 407 }; 408 409 #define path_root_bh(_path) ((_path)->p_node[0].bh) 410 #define path_root_el(_path) ((_path)->p_node[0].el) 411 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) 412 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) 413 #define path_num_items(_path) ((_path)->p_tree_depth + 1) 414 415 /* 416 * Reset the actual path elements so that we can re-use the structure 417 * to build another path. Generally, this involves freeing the buffer 418 * heads. 419 */ 420 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) 421 { 422 int i, start = 0, depth = 0; 423 struct ocfs2_path_item *node; 424 425 if (keep_root) 426 start = 1; 427 428 for(i = start; i < path_num_items(path); i++) { 429 node = &path->p_node[i]; 430 431 brelse(node->bh); 432 node->bh = NULL; 433 node->el = NULL; 434 } 435 436 /* 437 * Tree depth may change during truncate, or insert. If we're 438 * keeping the root extent list, then make sure that our path 439 * structure reflects the proper depth. 440 */ 441 if (keep_root) 442 depth = le16_to_cpu(path_root_el(path)->l_tree_depth); 443 444 path->p_tree_depth = depth; 445 } 446 447 static void ocfs2_free_path(struct ocfs2_path *path) 448 { 449 if (path) { 450 ocfs2_reinit_path(path, 0); 451 kfree(path); 452 } 453 } 454 455 /* 456 * All the elements of src into dest. After this call, src could be freed 457 * without affecting dest. 458 * 459 * Both paths should have the same root. Any non-root elements of dest 460 * will be freed. 461 */ 462 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) 463 { 464 int i; 465 466 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 467 BUG_ON(path_root_el(dest) != path_root_el(src)); 468 469 ocfs2_reinit_path(dest, 1); 470 471 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 472 dest->p_node[i].bh = src->p_node[i].bh; 473 dest->p_node[i].el = src->p_node[i].el; 474 475 if (dest->p_node[i].bh) 476 get_bh(dest->p_node[i].bh); 477 } 478 } 479 480 /* 481 * Make the *dest path the same as src and re-initialize src path to 482 * have a root only. 483 */ 484 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) 485 { 486 int i; 487 488 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 489 490 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 491 brelse(dest->p_node[i].bh); 492 493 dest->p_node[i].bh = src->p_node[i].bh; 494 dest->p_node[i].el = src->p_node[i].el; 495 496 src->p_node[i].bh = NULL; 497 src->p_node[i].el = NULL; 498 } 499 } 500 501 /* 502 * Insert an extent block at given index. 503 * 504 * This will not take an additional reference on eb_bh. 505 */ 506 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, 507 struct buffer_head *eb_bh) 508 { 509 struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; 510 511 /* 512 * Right now, no root bh is an extent block, so this helps 513 * catch code errors with dinode trees. The assertion can be 514 * safely removed if we ever need to insert extent block 515 * structures at the root. 516 */ 517 BUG_ON(index == 0); 518 519 path->p_node[index].bh = eb_bh; 520 path->p_node[index].el = &eb->h_list; 521 } 522 523 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, 524 struct ocfs2_extent_list *root_el) 525 { 526 struct ocfs2_path *path; 527 528 BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); 529 530 path = kzalloc(sizeof(*path), GFP_NOFS); 531 if (path) { 532 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); 533 get_bh(root_bh); 534 path_root_bh(path) = root_bh; 535 path_root_el(path) = root_el; 536 } 537 538 return path; 539 } 540 541 /* 542 * Convenience function to journal all components in a path. 543 */ 544 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 545 struct ocfs2_path *path) 546 { 547 int i, ret = 0; 548 549 if (!path) 550 goto out; 551 552 for(i = 0; i < path_num_items(path); i++) { 553 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, 554 OCFS2_JOURNAL_ACCESS_WRITE); 555 if (ret < 0) { 556 mlog_errno(ret); 557 goto out; 558 } 559 } 560 561 out: 562 return ret; 563 } 564 565 /* 566 * Return the index of the extent record which contains cluster #v_cluster. 567 * -1 is returned if it was not found. 568 * 569 * Should work fine on interior and exterior nodes. 570 */ 571 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) 572 { 573 int ret = -1; 574 int i; 575 struct ocfs2_extent_rec *rec; 576 u32 rec_end, rec_start, clusters; 577 578 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 579 rec = &el->l_recs[i]; 580 581 rec_start = le32_to_cpu(rec->e_cpos); 582 clusters = ocfs2_rec_clusters(el, rec); 583 584 rec_end = rec_start + clusters; 585 586 if (v_cluster >= rec_start && v_cluster < rec_end) { 587 ret = i; 588 break; 589 } 590 } 591 592 return ret; 593 } 594 595 enum ocfs2_contig_type { 596 CONTIG_NONE = 0, 597 CONTIG_LEFT, 598 CONTIG_RIGHT, 599 CONTIG_LEFTRIGHT, 600 }; 601 602 603 /* 604 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and 605 * ocfs2_extent_contig only work properly against leaf nodes! 606 */ 607 static int ocfs2_block_extent_contig(struct super_block *sb, 608 struct ocfs2_extent_rec *ext, 609 u64 blkno) 610 { 611 u64 blk_end = le64_to_cpu(ext->e_blkno); 612 613 blk_end += ocfs2_clusters_to_blocks(sb, 614 le16_to_cpu(ext->e_leaf_clusters)); 615 616 return blkno == blk_end; 617 } 618 619 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, 620 struct ocfs2_extent_rec *right) 621 { 622 u32 left_range; 623 624 left_range = le32_to_cpu(left->e_cpos) + 625 le16_to_cpu(left->e_leaf_clusters); 626 627 return (left_range == le32_to_cpu(right->e_cpos)); 628 } 629 630 static enum ocfs2_contig_type 631 ocfs2_extent_contig(struct inode *inode, 632 struct ocfs2_extent_rec *ext, 633 struct ocfs2_extent_rec *insert_rec) 634 { 635 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 636 637 /* 638 * Refuse to coalesce extent records with different flag 639 * fields - we don't want to mix unwritten extents with user 640 * data. 641 */ 642 if (ext->e_flags != insert_rec->e_flags) 643 return CONTIG_NONE; 644 645 if (ocfs2_extents_adjacent(ext, insert_rec) && 646 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 647 return CONTIG_RIGHT; 648 649 blkno = le64_to_cpu(ext->e_blkno); 650 if (ocfs2_extents_adjacent(insert_rec, ext) && 651 ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) 652 return CONTIG_LEFT; 653 654 return CONTIG_NONE; 655 } 656 657 /* 658 * NOTE: We can have pretty much any combination of contiguousness and 659 * appending. 660 * 661 * The usefulness of APPEND_TAIL is more in that it lets us know that 662 * we'll have to update the path to that leaf. 663 */ 664 enum ocfs2_append_type { 665 APPEND_NONE = 0, 666 APPEND_TAIL, 667 }; 668 669 enum ocfs2_split_type { 670 SPLIT_NONE = 0, 671 SPLIT_LEFT, 672 SPLIT_RIGHT, 673 }; 674 675 struct ocfs2_insert_type { 676 enum ocfs2_split_type ins_split; 677 enum ocfs2_append_type ins_appending; 678 enum ocfs2_contig_type ins_contig; 679 int ins_contig_index; 680 int ins_tree_depth; 681 }; 682 683 struct ocfs2_merge_ctxt { 684 enum ocfs2_contig_type c_contig_type; 685 int c_has_empty_extent; 686 int c_split_covers_rec; 687 }; 688 689 /* 690 * How many free extents have we got before we need more meta data? 691 */ 692 int ocfs2_num_free_extents(struct ocfs2_super *osb, 693 struct inode *inode, 694 struct ocfs2_extent_tree *et) 695 { 696 int retval; 697 struct ocfs2_extent_list *el = NULL; 698 struct ocfs2_extent_block *eb; 699 struct buffer_head *eb_bh = NULL; 700 u64 last_eb_blk = 0; 701 702 mlog_entry_void(); 703 704 el = et->et_root_el; 705 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 706 707 if (last_eb_blk) { 708 retval = ocfs2_read_block(inode, last_eb_blk, 709 &eb_bh); 710 if (retval < 0) { 711 mlog_errno(retval); 712 goto bail; 713 } 714 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 715 el = &eb->h_list; 716 } 717 718 BUG_ON(el->l_tree_depth != 0); 719 720 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); 721 bail: 722 brelse(eb_bh); 723 724 mlog_exit(retval); 725 return retval; 726 } 727 728 /* expects array to already be allocated 729 * 730 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and 731 * l_count for you 732 */ 733 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 734 handle_t *handle, 735 struct inode *inode, 736 int wanted, 737 struct ocfs2_alloc_context *meta_ac, 738 struct buffer_head *bhs[]) 739 { 740 int count, status, i; 741 u16 suballoc_bit_start; 742 u32 num_got; 743 u64 first_blkno; 744 struct ocfs2_extent_block *eb; 745 746 mlog_entry_void(); 747 748 count = 0; 749 while (count < wanted) { 750 status = ocfs2_claim_metadata(osb, 751 handle, 752 meta_ac, 753 wanted - count, 754 &suballoc_bit_start, 755 &num_got, 756 &first_blkno); 757 if (status < 0) { 758 mlog_errno(status); 759 goto bail; 760 } 761 762 for(i = count; i < (num_got + count); i++) { 763 bhs[i] = sb_getblk(osb->sb, first_blkno); 764 if (bhs[i] == NULL) { 765 status = -EIO; 766 mlog_errno(status); 767 goto bail; 768 } 769 ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 770 771 status = ocfs2_journal_access(handle, inode, bhs[i], 772 OCFS2_JOURNAL_ACCESS_CREATE); 773 if (status < 0) { 774 mlog_errno(status); 775 goto bail; 776 } 777 778 memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); 779 eb = (struct ocfs2_extent_block *) bhs[i]->b_data; 780 /* Ok, setup the minimal stuff here. */ 781 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 782 eb->h_blkno = cpu_to_le64(first_blkno); 783 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 784 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 785 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 786 eb->h_list.l_count = 787 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 788 789 suballoc_bit_start++; 790 first_blkno++; 791 792 /* We'll also be dirtied by the caller, so 793 * this isn't absolutely necessary. */ 794 status = ocfs2_journal_dirty(handle, bhs[i]); 795 if (status < 0) { 796 mlog_errno(status); 797 goto bail; 798 } 799 } 800 801 count += num_got; 802 } 803 804 status = 0; 805 bail: 806 if (status < 0) { 807 for(i = 0; i < wanted; i++) { 808 brelse(bhs[i]); 809 bhs[i] = NULL; 810 } 811 } 812 mlog_exit(status); 813 return status; 814 } 815 816 /* 817 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). 818 * 819 * Returns the sum of the rightmost extent rec logical offset and 820 * cluster count. 821 * 822 * ocfs2_add_branch() uses this to determine what logical cluster 823 * value should be populated into the leftmost new branch records. 824 * 825 * ocfs2_shift_tree_depth() uses this to determine the # clusters 826 * value for the new topmost tree record. 827 */ 828 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) 829 { 830 int i; 831 832 i = le16_to_cpu(el->l_next_free_rec) - 1; 833 834 return le32_to_cpu(el->l_recs[i].e_cpos) + 835 ocfs2_rec_clusters(el, &el->l_recs[i]); 836 } 837 838 /* 839 * Add an entire tree branch to our inode. eb_bh is the extent block 840 * to start at, if we don't want to start the branch at the dinode 841 * structure. 842 * 843 * last_eb_bh is required as we have to update it's next_leaf pointer 844 * for the new last extent block. 845 * 846 * the new branch will be 'empty' in the sense that every block will 847 * contain a single record with cluster count == 0. 848 */ 849 static int ocfs2_add_branch(struct ocfs2_super *osb, 850 handle_t *handle, 851 struct inode *inode, 852 struct ocfs2_extent_tree *et, 853 struct buffer_head *eb_bh, 854 struct buffer_head **last_eb_bh, 855 struct ocfs2_alloc_context *meta_ac) 856 { 857 int status, new_blocks, i; 858 u64 next_blkno, new_last_eb_blk; 859 struct buffer_head *bh; 860 struct buffer_head **new_eb_bhs = NULL; 861 struct ocfs2_extent_block *eb; 862 struct ocfs2_extent_list *eb_el; 863 struct ocfs2_extent_list *el; 864 u32 new_cpos; 865 866 mlog_entry_void(); 867 868 BUG_ON(!last_eb_bh || !*last_eb_bh); 869 870 if (eb_bh) { 871 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 872 el = &eb->h_list; 873 } else 874 el = et->et_root_el; 875 876 /* we never add a branch to a leaf. */ 877 BUG_ON(!el->l_tree_depth); 878 879 new_blocks = le16_to_cpu(el->l_tree_depth); 880 881 /* allocate the number of new eb blocks we need */ 882 new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), 883 GFP_KERNEL); 884 if (!new_eb_bhs) { 885 status = -ENOMEM; 886 mlog_errno(status); 887 goto bail; 888 } 889 890 status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, 891 meta_ac, new_eb_bhs); 892 if (status < 0) { 893 mlog_errno(status); 894 goto bail; 895 } 896 897 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; 898 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 899 900 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 901 * linked with the rest of the tree. 902 * conversly, new_eb_bhs[0] is the new bottommost leaf. 903 * 904 * when we leave the loop, new_last_eb_blk will point to the 905 * newest leaf, and next_blkno will point to the topmost extent 906 * block. */ 907 next_blkno = new_last_eb_blk = 0; 908 for(i = 0; i < new_blocks; i++) { 909 bh = new_eb_bhs[i]; 910 eb = (struct ocfs2_extent_block *) bh->b_data; 911 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 912 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 913 status = -EIO; 914 goto bail; 915 } 916 eb_el = &eb->h_list; 917 918 status = ocfs2_journal_access(handle, inode, bh, 919 OCFS2_JOURNAL_ACCESS_CREATE); 920 if (status < 0) { 921 mlog_errno(status); 922 goto bail; 923 } 924 925 eb->h_next_leaf_blk = 0; 926 eb_el->l_tree_depth = cpu_to_le16(i); 927 eb_el->l_next_free_rec = cpu_to_le16(1); 928 /* 929 * This actually counts as an empty extent as 930 * c_clusters == 0 931 */ 932 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); 933 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); 934 /* 935 * eb_el isn't always an interior node, but even leaf 936 * nodes want a zero'd flags and reserved field so 937 * this gets the whole 32 bits regardless of use. 938 */ 939 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); 940 if (!eb_el->l_tree_depth) 941 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 942 943 status = ocfs2_journal_dirty(handle, bh); 944 if (status < 0) { 945 mlog_errno(status); 946 goto bail; 947 } 948 949 next_blkno = le64_to_cpu(eb->h_blkno); 950 } 951 952 /* This is a bit hairy. We want to update up to three blocks 953 * here without leaving any of them in an inconsistent state 954 * in case of error. We don't have to worry about 955 * journal_dirty erroring as it won't unless we've aborted the 956 * handle (in which case we would never be here) so reserving 957 * the write with journal_access is all we need to do. */ 958 status = ocfs2_journal_access(handle, inode, *last_eb_bh, 959 OCFS2_JOURNAL_ACCESS_WRITE); 960 if (status < 0) { 961 mlog_errno(status); 962 goto bail; 963 } 964 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 965 OCFS2_JOURNAL_ACCESS_WRITE); 966 if (status < 0) { 967 mlog_errno(status); 968 goto bail; 969 } 970 if (eb_bh) { 971 status = ocfs2_journal_access(handle, inode, eb_bh, 972 OCFS2_JOURNAL_ACCESS_WRITE); 973 if (status < 0) { 974 mlog_errno(status); 975 goto bail; 976 } 977 } 978 979 /* Link the new branch into the rest of the tree (el will 980 * either be on the root_bh, or the extent block passed in. */ 981 i = le16_to_cpu(el->l_next_free_rec); 982 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 983 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 984 el->l_recs[i].e_int_clusters = 0; 985 le16_add_cpu(&el->l_next_free_rec, 1); 986 987 /* fe needs a new last extent block pointer, as does the 988 * next_leaf on the previously last-extent-block. */ 989 ocfs2_et_set_last_eb_blk(et, new_last_eb_blk); 990 991 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 992 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 993 994 status = ocfs2_journal_dirty(handle, *last_eb_bh); 995 if (status < 0) 996 mlog_errno(status); 997 status = ocfs2_journal_dirty(handle, et->et_root_bh); 998 if (status < 0) 999 mlog_errno(status); 1000 if (eb_bh) { 1001 status = ocfs2_journal_dirty(handle, eb_bh); 1002 if (status < 0) 1003 mlog_errno(status); 1004 } 1005 1006 /* 1007 * Some callers want to track the rightmost leaf so pass it 1008 * back here. 1009 */ 1010 brelse(*last_eb_bh); 1011 get_bh(new_eb_bhs[0]); 1012 *last_eb_bh = new_eb_bhs[0]; 1013 1014 status = 0; 1015 bail: 1016 if (new_eb_bhs) { 1017 for (i = 0; i < new_blocks; i++) 1018 brelse(new_eb_bhs[i]); 1019 kfree(new_eb_bhs); 1020 } 1021 1022 mlog_exit(status); 1023 return status; 1024 } 1025 1026 /* 1027 * adds another level to the allocation tree. 1028 * returns back the new extent block so you can add a branch to it 1029 * after this call. 1030 */ 1031 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 1032 handle_t *handle, 1033 struct inode *inode, 1034 struct ocfs2_extent_tree *et, 1035 struct ocfs2_alloc_context *meta_ac, 1036 struct buffer_head **ret_new_eb_bh) 1037 { 1038 int status, i; 1039 u32 new_clusters; 1040 struct buffer_head *new_eb_bh = NULL; 1041 struct ocfs2_extent_block *eb; 1042 struct ocfs2_extent_list *root_el; 1043 struct ocfs2_extent_list *eb_el; 1044 1045 mlog_entry_void(); 1046 1047 status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, 1048 &new_eb_bh); 1049 if (status < 0) { 1050 mlog_errno(status); 1051 goto bail; 1052 } 1053 1054 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; 1055 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1056 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1057 status = -EIO; 1058 goto bail; 1059 } 1060 1061 eb_el = &eb->h_list; 1062 root_el = et->et_root_el; 1063 1064 status = ocfs2_journal_access(handle, inode, new_eb_bh, 1065 OCFS2_JOURNAL_ACCESS_CREATE); 1066 if (status < 0) { 1067 mlog_errno(status); 1068 goto bail; 1069 } 1070 1071 /* copy the root extent list data into the new extent block */ 1072 eb_el->l_tree_depth = root_el->l_tree_depth; 1073 eb_el->l_next_free_rec = root_el->l_next_free_rec; 1074 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1075 eb_el->l_recs[i] = root_el->l_recs[i]; 1076 1077 status = ocfs2_journal_dirty(handle, new_eb_bh); 1078 if (status < 0) { 1079 mlog_errno(status); 1080 goto bail; 1081 } 1082 1083 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 1084 OCFS2_JOURNAL_ACCESS_WRITE); 1085 if (status < 0) { 1086 mlog_errno(status); 1087 goto bail; 1088 } 1089 1090 new_clusters = ocfs2_sum_rightmost_rec(eb_el); 1091 1092 /* update root_bh now */ 1093 le16_add_cpu(&root_el->l_tree_depth, 1); 1094 root_el->l_recs[0].e_cpos = 0; 1095 root_el->l_recs[0].e_blkno = eb->h_blkno; 1096 root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); 1097 for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1098 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 1099 root_el->l_next_free_rec = cpu_to_le16(1); 1100 1101 /* If this is our 1st tree depth shift, then last_eb_blk 1102 * becomes the allocated extent block */ 1103 if (root_el->l_tree_depth == cpu_to_le16(1)) 1104 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 1105 1106 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1107 if (status < 0) { 1108 mlog_errno(status); 1109 goto bail; 1110 } 1111 1112 *ret_new_eb_bh = new_eb_bh; 1113 new_eb_bh = NULL; 1114 status = 0; 1115 bail: 1116 brelse(new_eb_bh); 1117 1118 mlog_exit(status); 1119 return status; 1120 } 1121 1122 /* 1123 * Should only be called when there is no space left in any of the 1124 * leaf nodes. What we want to do is find the lowest tree depth 1125 * non-leaf extent block with room for new records. There are three 1126 * valid results of this search: 1127 * 1128 * 1) a lowest extent block is found, then we pass it back in 1129 * *lowest_eb_bh and return '0' 1130 * 1131 * 2) the search fails to find anything, but the root_el has room. We 1132 * pass NULL back in *lowest_eb_bh, but still return '0' 1133 * 1134 * 3) the search fails to find anything AND the root_el is full, in 1135 * which case we return > 0 1136 * 1137 * return status < 0 indicates an error. 1138 */ 1139 static int ocfs2_find_branch_target(struct ocfs2_super *osb, 1140 struct inode *inode, 1141 struct ocfs2_extent_tree *et, 1142 struct buffer_head **target_bh) 1143 { 1144 int status = 0, i; 1145 u64 blkno; 1146 struct ocfs2_extent_block *eb; 1147 struct ocfs2_extent_list *el; 1148 struct buffer_head *bh = NULL; 1149 struct buffer_head *lowest_bh = NULL; 1150 1151 mlog_entry_void(); 1152 1153 *target_bh = NULL; 1154 1155 el = et->et_root_el; 1156 1157 while(le16_to_cpu(el->l_tree_depth) > 1) { 1158 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1159 ocfs2_error(inode->i_sb, "Dinode %llu has empty " 1160 "extent list (next_free_rec == 0)", 1161 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1162 status = -EIO; 1163 goto bail; 1164 } 1165 i = le16_to_cpu(el->l_next_free_rec) - 1; 1166 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1167 if (!blkno) { 1168 ocfs2_error(inode->i_sb, "Dinode %llu has extent " 1169 "list where extent # %d has no physical " 1170 "block start", 1171 (unsigned long long)OCFS2_I(inode)->ip_blkno, i); 1172 status = -EIO; 1173 goto bail; 1174 } 1175 1176 brelse(bh); 1177 bh = NULL; 1178 1179 status = ocfs2_read_block(inode, blkno, &bh); 1180 if (status < 0) { 1181 mlog_errno(status); 1182 goto bail; 1183 } 1184 1185 eb = (struct ocfs2_extent_block *) bh->b_data; 1186 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1187 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1188 status = -EIO; 1189 goto bail; 1190 } 1191 el = &eb->h_list; 1192 1193 if (le16_to_cpu(el->l_next_free_rec) < 1194 le16_to_cpu(el->l_count)) { 1195 brelse(lowest_bh); 1196 lowest_bh = bh; 1197 get_bh(lowest_bh); 1198 } 1199 } 1200 1201 /* If we didn't find one and the fe doesn't have any room, 1202 * then return '1' */ 1203 el = et->et_root_el; 1204 if (!lowest_bh && (el->l_next_free_rec == el->l_count)) 1205 status = 1; 1206 1207 *target_bh = lowest_bh; 1208 bail: 1209 brelse(bh); 1210 1211 mlog_exit(status); 1212 return status; 1213 } 1214 1215 /* 1216 * Grow a b-tree so that it has more records. 1217 * 1218 * We might shift the tree depth in which case existing paths should 1219 * be considered invalid. 1220 * 1221 * Tree depth after the grow is returned via *final_depth. 1222 * 1223 * *last_eb_bh will be updated by ocfs2_add_branch(). 1224 */ 1225 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, 1226 struct ocfs2_extent_tree *et, int *final_depth, 1227 struct buffer_head **last_eb_bh, 1228 struct ocfs2_alloc_context *meta_ac) 1229 { 1230 int ret, shift; 1231 struct ocfs2_extent_list *el = et->et_root_el; 1232 int depth = le16_to_cpu(el->l_tree_depth); 1233 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1234 struct buffer_head *bh = NULL; 1235 1236 BUG_ON(meta_ac == NULL); 1237 1238 shift = ocfs2_find_branch_target(osb, inode, et, &bh); 1239 if (shift < 0) { 1240 ret = shift; 1241 mlog_errno(ret); 1242 goto out; 1243 } 1244 1245 /* We traveled all the way to the bottom of the allocation tree 1246 * and didn't find room for any more extents - we need to add 1247 * another tree level */ 1248 if (shift) { 1249 BUG_ON(bh); 1250 mlog(0, "need to shift tree depth (current = %d)\n", depth); 1251 1252 /* ocfs2_shift_tree_depth will return us a buffer with 1253 * the new extent block (so we can pass that to 1254 * ocfs2_add_branch). */ 1255 ret = ocfs2_shift_tree_depth(osb, handle, inode, et, 1256 meta_ac, &bh); 1257 if (ret < 0) { 1258 mlog_errno(ret); 1259 goto out; 1260 } 1261 depth++; 1262 if (depth == 1) { 1263 /* 1264 * Special case: we have room now if we shifted from 1265 * tree_depth 0, so no more work needs to be done. 1266 * 1267 * We won't be calling add_branch, so pass 1268 * back *last_eb_bh as the new leaf. At depth 1269 * zero, it should always be null so there's 1270 * no reason to brelse. 1271 */ 1272 BUG_ON(*last_eb_bh); 1273 get_bh(bh); 1274 *last_eb_bh = bh; 1275 goto out; 1276 } 1277 } 1278 1279 /* call ocfs2_add_branch to add the final part of the tree with 1280 * the new data. */ 1281 mlog(0, "add branch. bh = %p\n", bh); 1282 ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, 1283 meta_ac); 1284 if (ret < 0) { 1285 mlog_errno(ret); 1286 goto out; 1287 } 1288 1289 out: 1290 if (final_depth) 1291 *final_depth = depth; 1292 brelse(bh); 1293 return ret; 1294 } 1295 1296 /* 1297 * This function will discard the rightmost extent record. 1298 */ 1299 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) 1300 { 1301 int next_free = le16_to_cpu(el->l_next_free_rec); 1302 int count = le16_to_cpu(el->l_count); 1303 unsigned int num_bytes; 1304 1305 BUG_ON(!next_free); 1306 /* This will cause us to go off the end of our extent list. */ 1307 BUG_ON(next_free >= count); 1308 1309 num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; 1310 1311 memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); 1312 } 1313 1314 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, 1315 struct ocfs2_extent_rec *insert_rec) 1316 { 1317 int i, insert_index, next_free, has_empty, num_bytes; 1318 u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); 1319 struct ocfs2_extent_rec *rec; 1320 1321 next_free = le16_to_cpu(el->l_next_free_rec); 1322 has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); 1323 1324 BUG_ON(!next_free); 1325 1326 /* The tree code before us didn't allow enough room in the leaf. */ 1327 BUG_ON(el->l_next_free_rec == el->l_count && !has_empty); 1328 1329 /* 1330 * The easiest way to approach this is to just remove the 1331 * empty extent and temporarily decrement next_free. 1332 */ 1333 if (has_empty) { 1334 /* 1335 * If next_free was 1 (only an empty extent), this 1336 * loop won't execute, which is fine. We still want 1337 * the decrement above to happen. 1338 */ 1339 for(i = 0; i < (next_free - 1); i++) 1340 el->l_recs[i] = el->l_recs[i+1]; 1341 1342 next_free--; 1343 } 1344 1345 /* 1346 * Figure out what the new record index should be. 1347 */ 1348 for(i = 0; i < next_free; i++) { 1349 rec = &el->l_recs[i]; 1350 1351 if (insert_cpos < le32_to_cpu(rec->e_cpos)) 1352 break; 1353 } 1354 insert_index = i; 1355 1356 mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", 1357 insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); 1358 1359 BUG_ON(insert_index < 0); 1360 BUG_ON(insert_index >= le16_to_cpu(el->l_count)); 1361 BUG_ON(insert_index > next_free); 1362 1363 /* 1364 * No need to memmove if we're just adding to the tail. 1365 */ 1366 if (insert_index != next_free) { 1367 BUG_ON(next_free >= le16_to_cpu(el->l_count)); 1368 1369 num_bytes = next_free - insert_index; 1370 num_bytes *= sizeof(struct ocfs2_extent_rec); 1371 memmove(&el->l_recs[insert_index + 1], 1372 &el->l_recs[insert_index], 1373 num_bytes); 1374 } 1375 1376 /* 1377 * Either we had an empty extent, and need to re-increment or 1378 * there was no empty extent on a non full rightmost leaf node, 1379 * in which case we still need to increment. 1380 */ 1381 next_free++; 1382 el->l_next_free_rec = cpu_to_le16(next_free); 1383 /* 1384 * Make sure none of the math above just messed up our tree. 1385 */ 1386 BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); 1387 1388 el->l_recs[insert_index] = *insert_rec; 1389 1390 } 1391 1392 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el) 1393 { 1394 int size, num_recs = le16_to_cpu(el->l_next_free_rec); 1395 1396 BUG_ON(num_recs == 0); 1397 1398 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 1399 num_recs--; 1400 size = num_recs * sizeof(struct ocfs2_extent_rec); 1401 memmove(&el->l_recs[0], &el->l_recs[1], size); 1402 memset(&el->l_recs[num_recs], 0, 1403 sizeof(struct ocfs2_extent_rec)); 1404 el->l_next_free_rec = cpu_to_le16(num_recs); 1405 } 1406 } 1407 1408 /* 1409 * Create an empty extent record . 1410 * 1411 * l_next_free_rec may be updated. 1412 * 1413 * If an empty extent already exists do nothing. 1414 */ 1415 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) 1416 { 1417 int next_free = le16_to_cpu(el->l_next_free_rec); 1418 1419 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 1420 1421 if (next_free == 0) 1422 goto set_and_inc; 1423 1424 if (ocfs2_is_empty_extent(&el->l_recs[0])) 1425 return; 1426 1427 mlog_bug_on_msg(el->l_count == el->l_next_free_rec, 1428 "Asked to create an empty extent in a full list:\n" 1429 "count = %u, tree depth = %u", 1430 le16_to_cpu(el->l_count), 1431 le16_to_cpu(el->l_tree_depth)); 1432 1433 ocfs2_shift_records_right(el); 1434 1435 set_and_inc: 1436 le16_add_cpu(&el->l_next_free_rec, 1); 1437 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 1438 } 1439 1440 /* 1441 * For a rotation which involves two leaf nodes, the "root node" is 1442 * the lowest level tree node which contains a path to both leafs. This 1443 * resulting set of information can be used to form a complete "subtree" 1444 * 1445 * This function is passed two full paths from the dinode down to a 1446 * pair of adjacent leaves. It's task is to figure out which path 1447 * index contains the subtree root - this can be the root index itself 1448 * in a worst-case rotation. 1449 * 1450 * The array index of the subtree root is passed back. 1451 */ 1452 static int ocfs2_find_subtree_root(struct inode *inode, 1453 struct ocfs2_path *left, 1454 struct ocfs2_path *right) 1455 { 1456 int i = 0; 1457 1458 /* 1459 * Check that the caller passed in two paths from the same tree. 1460 */ 1461 BUG_ON(path_root_bh(left) != path_root_bh(right)); 1462 1463 do { 1464 i++; 1465 1466 /* 1467 * The caller didn't pass two adjacent paths. 1468 */ 1469 mlog_bug_on_msg(i > left->p_tree_depth, 1470 "Inode %lu, left depth %u, right depth %u\n" 1471 "left leaf blk %llu, right leaf blk %llu\n", 1472 inode->i_ino, left->p_tree_depth, 1473 right->p_tree_depth, 1474 (unsigned long long)path_leaf_bh(left)->b_blocknr, 1475 (unsigned long long)path_leaf_bh(right)->b_blocknr); 1476 } while (left->p_node[i].bh->b_blocknr == 1477 right->p_node[i].bh->b_blocknr); 1478 1479 return i - 1; 1480 } 1481 1482 typedef void (path_insert_t)(void *, struct buffer_head *); 1483 1484 /* 1485 * Traverse a btree path in search of cpos, starting at root_el. 1486 * 1487 * This code can be called with a cpos larger than the tree, in which 1488 * case it will return the rightmost path. 1489 */ 1490 static int __ocfs2_find_path(struct inode *inode, 1491 struct ocfs2_extent_list *root_el, u32 cpos, 1492 path_insert_t *func, void *data) 1493 { 1494 int i, ret = 0; 1495 u32 range; 1496 u64 blkno; 1497 struct buffer_head *bh = NULL; 1498 struct ocfs2_extent_block *eb; 1499 struct ocfs2_extent_list *el; 1500 struct ocfs2_extent_rec *rec; 1501 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1502 1503 el = root_el; 1504 while (el->l_tree_depth) { 1505 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1506 ocfs2_error(inode->i_sb, 1507 "Inode %llu has empty extent list at " 1508 "depth %u\n", 1509 (unsigned long long)oi->ip_blkno, 1510 le16_to_cpu(el->l_tree_depth)); 1511 ret = -EROFS; 1512 goto out; 1513 1514 } 1515 1516 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { 1517 rec = &el->l_recs[i]; 1518 1519 /* 1520 * In the case that cpos is off the allocation 1521 * tree, this should just wind up returning the 1522 * rightmost record. 1523 */ 1524 range = le32_to_cpu(rec->e_cpos) + 1525 ocfs2_rec_clusters(el, rec); 1526 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 1527 break; 1528 } 1529 1530 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1531 if (blkno == 0) { 1532 ocfs2_error(inode->i_sb, 1533 "Inode %llu has bad blkno in extent list " 1534 "at depth %u (index %d)\n", 1535 (unsigned long long)oi->ip_blkno, 1536 le16_to_cpu(el->l_tree_depth), i); 1537 ret = -EROFS; 1538 goto out; 1539 } 1540 1541 brelse(bh); 1542 bh = NULL; 1543 ret = ocfs2_read_block(inode, blkno, &bh); 1544 if (ret) { 1545 mlog_errno(ret); 1546 goto out; 1547 } 1548 1549 eb = (struct ocfs2_extent_block *) bh->b_data; 1550 el = &eb->h_list; 1551 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1552 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1553 ret = -EIO; 1554 goto out; 1555 } 1556 1557 if (le16_to_cpu(el->l_next_free_rec) > 1558 le16_to_cpu(el->l_count)) { 1559 ocfs2_error(inode->i_sb, 1560 "Inode %llu has bad count in extent list " 1561 "at block %llu (next free=%u, count=%u)\n", 1562 (unsigned long long)oi->ip_blkno, 1563 (unsigned long long)bh->b_blocknr, 1564 le16_to_cpu(el->l_next_free_rec), 1565 le16_to_cpu(el->l_count)); 1566 ret = -EROFS; 1567 goto out; 1568 } 1569 1570 if (func) 1571 func(data, bh); 1572 } 1573 1574 out: 1575 /* 1576 * Catch any trailing bh that the loop didn't handle. 1577 */ 1578 brelse(bh); 1579 1580 return ret; 1581 } 1582 1583 /* 1584 * Given an initialized path (that is, it has a valid root extent 1585 * list), this function will traverse the btree in search of the path 1586 * which would contain cpos. 1587 * 1588 * The path traveled is recorded in the path structure. 1589 * 1590 * Note that this will not do any comparisons on leaf node extent 1591 * records, so it will work fine in the case that we just added a tree 1592 * branch. 1593 */ 1594 struct find_path_data { 1595 int index; 1596 struct ocfs2_path *path; 1597 }; 1598 static void find_path_ins(void *data, struct buffer_head *bh) 1599 { 1600 struct find_path_data *fp = data; 1601 1602 get_bh(bh); 1603 ocfs2_path_insert_eb(fp->path, fp->index, bh); 1604 fp->index++; 1605 } 1606 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, 1607 u32 cpos) 1608 { 1609 struct find_path_data data; 1610 1611 data.index = 1; 1612 data.path = path; 1613 return __ocfs2_find_path(inode, path_root_el(path), cpos, 1614 find_path_ins, &data); 1615 } 1616 1617 static void find_leaf_ins(void *data, struct buffer_head *bh) 1618 { 1619 struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; 1620 struct ocfs2_extent_list *el = &eb->h_list; 1621 struct buffer_head **ret = data; 1622 1623 /* We want to retain only the leaf block. */ 1624 if (le16_to_cpu(el->l_tree_depth) == 0) { 1625 get_bh(bh); 1626 *ret = bh; 1627 } 1628 } 1629 /* 1630 * Find the leaf block in the tree which would contain cpos. No 1631 * checking of the actual leaf is done. 1632 * 1633 * Some paths want to call this instead of allocating a path structure 1634 * and calling ocfs2_find_path(). 1635 * 1636 * This function doesn't handle non btree extent lists. 1637 */ 1638 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 1639 u32 cpos, struct buffer_head **leaf_bh) 1640 { 1641 int ret; 1642 struct buffer_head *bh = NULL; 1643 1644 ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); 1645 if (ret) { 1646 mlog_errno(ret); 1647 goto out; 1648 } 1649 1650 *leaf_bh = bh; 1651 out: 1652 return ret; 1653 } 1654 1655 /* 1656 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. 1657 * 1658 * Basically, we've moved stuff around at the bottom of the tree and 1659 * we need to fix up the extent records above the changes to reflect 1660 * the new changes. 1661 * 1662 * left_rec: the record on the left. 1663 * left_child_el: is the child list pointed to by left_rec 1664 * right_rec: the record to the right of left_rec 1665 * right_child_el: is the child list pointed to by right_rec 1666 * 1667 * By definition, this only works on interior nodes. 1668 */ 1669 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, 1670 struct ocfs2_extent_list *left_child_el, 1671 struct ocfs2_extent_rec *right_rec, 1672 struct ocfs2_extent_list *right_child_el) 1673 { 1674 u32 left_clusters, right_end; 1675 1676 /* 1677 * Interior nodes never have holes. Their cpos is the cpos of 1678 * the leftmost record in their child list. Their cluster 1679 * count covers the full theoretical range of their child list 1680 * - the range between their cpos and the cpos of the record 1681 * immediately to their right. 1682 */ 1683 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1684 if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { 1685 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); 1686 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); 1687 } 1688 left_clusters -= le32_to_cpu(left_rec->e_cpos); 1689 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1690 1691 /* 1692 * Calculate the rightmost cluster count boundary before 1693 * moving cpos - we will need to adjust clusters after 1694 * updating e_cpos to keep the same highest cluster count. 1695 */ 1696 right_end = le32_to_cpu(right_rec->e_cpos); 1697 right_end += le32_to_cpu(right_rec->e_int_clusters); 1698 1699 right_rec->e_cpos = left_rec->e_cpos; 1700 le32_add_cpu(&right_rec->e_cpos, left_clusters); 1701 1702 right_end -= le32_to_cpu(right_rec->e_cpos); 1703 right_rec->e_int_clusters = cpu_to_le32(right_end); 1704 } 1705 1706 /* 1707 * Adjust the adjacent root node records involved in a 1708 * rotation. left_el_blkno is passed in as a key so that we can easily 1709 * find it's index in the root list. 1710 */ 1711 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, 1712 struct ocfs2_extent_list *left_el, 1713 struct ocfs2_extent_list *right_el, 1714 u64 left_el_blkno) 1715 { 1716 int i; 1717 1718 BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= 1719 le16_to_cpu(left_el->l_tree_depth)); 1720 1721 for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { 1722 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) 1723 break; 1724 } 1725 1726 /* 1727 * The path walking code should have never returned a root and 1728 * two paths which are not adjacent. 1729 */ 1730 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); 1731 1732 ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, 1733 &root_el->l_recs[i + 1], right_el); 1734 } 1735 1736 /* 1737 * We've changed a leaf block (in right_path) and need to reflect that 1738 * change back up the subtree. 1739 * 1740 * This happens in multiple places: 1741 * - When we've moved an extent record from the left path leaf to the right 1742 * path leaf to make room for an empty extent in the left path leaf. 1743 * - When our insert into the right path leaf is at the leftmost edge 1744 * and requires an update of the path immediately to it's left. This 1745 * can occur at the end of some types of rotation and appending inserts. 1746 * - When we've adjusted the last extent record in the left path leaf and the 1747 * 1st extent record in the right path leaf during cross extent block merge. 1748 */ 1749 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 1750 struct ocfs2_path *left_path, 1751 struct ocfs2_path *right_path, 1752 int subtree_index) 1753 { 1754 int ret, i, idx; 1755 struct ocfs2_extent_list *el, *left_el, *right_el; 1756 struct ocfs2_extent_rec *left_rec, *right_rec; 1757 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 1758 1759 /* 1760 * Update the counts and position values within all the 1761 * interior nodes to reflect the leaf rotation we just did. 1762 * 1763 * The root node is handled below the loop. 1764 * 1765 * We begin the loop with right_el and left_el pointing to the 1766 * leaf lists and work our way up. 1767 * 1768 * NOTE: within this loop, left_el and right_el always refer 1769 * to the *child* lists. 1770 */ 1771 left_el = path_leaf_el(left_path); 1772 right_el = path_leaf_el(right_path); 1773 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { 1774 mlog(0, "Adjust records at index %u\n", i); 1775 1776 /* 1777 * One nice property of knowing that all of these 1778 * nodes are below the root is that we only deal with 1779 * the leftmost right node record and the rightmost 1780 * left node record. 1781 */ 1782 el = left_path->p_node[i].el; 1783 idx = le16_to_cpu(left_el->l_next_free_rec) - 1; 1784 left_rec = &el->l_recs[idx]; 1785 1786 el = right_path->p_node[i].el; 1787 right_rec = &el->l_recs[0]; 1788 1789 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 1790 right_el); 1791 1792 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 1793 if (ret) 1794 mlog_errno(ret); 1795 1796 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); 1797 if (ret) 1798 mlog_errno(ret); 1799 1800 /* 1801 * Setup our list pointers now so that the current 1802 * parents become children in the next iteration. 1803 */ 1804 left_el = left_path->p_node[i].el; 1805 right_el = right_path->p_node[i].el; 1806 } 1807 1808 /* 1809 * At the root node, adjust the two adjacent records which 1810 * begin our path to the leaves. 1811 */ 1812 1813 el = left_path->p_node[subtree_index].el; 1814 left_el = left_path->p_node[subtree_index + 1].el; 1815 right_el = right_path->p_node[subtree_index + 1].el; 1816 1817 ocfs2_adjust_root_records(el, left_el, right_el, 1818 left_path->p_node[subtree_index + 1].bh->b_blocknr); 1819 1820 root_bh = left_path->p_node[subtree_index].bh; 1821 1822 ret = ocfs2_journal_dirty(handle, root_bh); 1823 if (ret) 1824 mlog_errno(ret); 1825 } 1826 1827 static int ocfs2_rotate_subtree_right(struct inode *inode, 1828 handle_t *handle, 1829 struct ocfs2_path *left_path, 1830 struct ocfs2_path *right_path, 1831 int subtree_index) 1832 { 1833 int ret, i; 1834 struct buffer_head *right_leaf_bh; 1835 struct buffer_head *left_leaf_bh = NULL; 1836 struct buffer_head *root_bh; 1837 struct ocfs2_extent_list *right_el, *left_el; 1838 struct ocfs2_extent_rec move_rec; 1839 1840 left_leaf_bh = path_leaf_bh(left_path); 1841 left_el = path_leaf_el(left_path); 1842 1843 if (left_el->l_next_free_rec != left_el->l_count) { 1844 ocfs2_error(inode->i_sb, 1845 "Inode %llu has non-full interior leaf node %llu" 1846 "(next free = %u)", 1847 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1848 (unsigned long long)left_leaf_bh->b_blocknr, 1849 le16_to_cpu(left_el->l_next_free_rec)); 1850 return -EROFS; 1851 } 1852 1853 /* 1854 * This extent block may already have an empty record, so we 1855 * return early if so. 1856 */ 1857 if (ocfs2_is_empty_extent(&left_el->l_recs[0])) 1858 return 0; 1859 1860 root_bh = left_path->p_node[subtree_index].bh; 1861 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 1862 1863 ret = ocfs2_journal_access(handle, inode, root_bh, 1864 OCFS2_JOURNAL_ACCESS_WRITE); 1865 if (ret) { 1866 mlog_errno(ret); 1867 goto out; 1868 } 1869 1870 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 1871 ret = ocfs2_journal_access(handle, inode, 1872 right_path->p_node[i].bh, 1873 OCFS2_JOURNAL_ACCESS_WRITE); 1874 if (ret) { 1875 mlog_errno(ret); 1876 goto out; 1877 } 1878 1879 ret = ocfs2_journal_access(handle, inode, 1880 left_path->p_node[i].bh, 1881 OCFS2_JOURNAL_ACCESS_WRITE); 1882 if (ret) { 1883 mlog_errno(ret); 1884 goto out; 1885 } 1886 } 1887 1888 right_leaf_bh = path_leaf_bh(right_path); 1889 right_el = path_leaf_el(right_path); 1890 1891 /* This is a code error, not a disk corruption. */ 1892 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " 1893 "because rightmost leaf block %llu is empty\n", 1894 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1895 (unsigned long long)right_leaf_bh->b_blocknr); 1896 1897 ocfs2_create_empty_extent(right_el); 1898 1899 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 1900 if (ret) { 1901 mlog_errno(ret); 1902 goto out; 1903 } 1904 1905 /* Do the copy now. */ 1906 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 1907 move_rec = left_el->l_recs[i]; 1908 right_el->l_recs[0] = move_rec; 1909 1910 /* 1911 * Clear out the record we just copied and shift everything 1912 * over, leaving an empty extent in the left leaf. 1913 * 1914 * We temporarily subtract from next_free_rec so that the 1915 * shift will lose the tail record (which is now defunct). 1916 */ 1917 le16_add_cpu(&left_el->l_next_free_rec, -1); 1918 ocfs2_shift_records_right(left_el); 1919 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 1920 le16_add_cpu(&left_el->l_next_free_rec, 1); 1921 1922 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 1923 if (ret) { 1924 mlog_errno(ret); 1925 goto out; 1926 } 1927 1928 ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 1929 subtree_index); 1930 1931 out: 1932 return ret; 1933 } 1934 1935 /* 1936 * Given a full path, determine what cpos value would return us a path 1937 * containing the leaf immediately to the left of the current one. 1938 * 1939 * Will return zero if the path passed in is already the leftmost path. 1940 */ 1941 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 1942 struct ocfs2_path *path, u32 *cpos) 1943 { 1944 int i, j, ret = 0; 1945 u64 blkno; 1946 struct ocfs2_extent_list *el; 1947 1948 BUG_ON(path->p_tree_depth == 0); 1949 1950 *cpos = 0; 1951 1952 blkno = path_leaf_bh(path)->b_blocknr; 1953 1954 /* Start at the tree node just above the leaf and work our way up. */ 1955 i = path->p_tree_depth - 1; 1956 while (i >= 0) { 1957 el = path->p_node[i].el; 1958 1959 /* 1960 * Find the extent record just before the one in our 1961 * path. 1962 */ 1963 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { 1964 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { 1965 if (j == 0) { 1966 if (i == 0) { 1967 /* 1968 * We've determined that the 1969 * path specified is already 1970 * the leftmost one - return a 1971 * cpos of zero. 1972 */ 1973 goto out; 1974 } 1975 /* 1976 * The leftmost record points to our 1977 * leaf - we need to travel up the 1978 * tree one level. 1979 */ 1980 goto next_node; 1981 } 1982 1983 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); 1984 *cpos = *cpos + ocfs2_rec_clusters(el, 1985 &el->l_recs[j - 1]); 1986 *cpos = *cpos - 1; 1987 goto out; 1988 } 1989 } 1990 1991 /* 1992 * If we got here, we never found a valid node where 1993 * the tree indicated one should be. 1994 */ 1995 ocfs2_error(sb, 1996 "Invalid extent tree at extent block %llu\n", 1997 (unsigned long long)blkno); 1998 ret = -EROFS; 1999 goto out; 2000 2001 next_node: 2002 blkno = path->p_node[i].bh->b_blocknr; 2003 i--; 2004 } 2005 2006 out: 2007 return ret; 2008 } 2009 2010 /* 2011 * Extend the transaction by enough credits to complete the rotation, 2012 * and still leave at least the original number of credits allocated 2013 * to this transaction. 2014 */ 2015 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 2016 int op_credits, 2017 struct ocfs2_path *path) 2018 { 2019 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2020 2021 if (handle->h_buffer_credits < credits) 2022 return ocfs2_extend_trans(handle, credits); 2023 2024 return 0; 2025 } 2026 2027 /* 2028 * Trap the case where we're inserting into the theoretical range past 2029 * the _actual_ left leaf range. Otherwise, we'll rotate a record 2030 * whose cpos is less than ours into the right leaf. 2031 * 2032 * It's only necessary to look at the rightmost record of the left 2033 * leaf because the logic that calls us should ensure that the 2034 * theoretical ranges in the path components above the leaves are 2035 * correct. 2036 */ 2037 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, 2038 u32 insert_cpos) 2039 { 2040 struct ocfs2_extent_list *left_el; 2041 struct ocfs2_extent_rec *rec; 2042 int next_free; 2043 2044 left_el = path_leaf_el(left_path); 2045 next_free = le16_to_cpu(left_el->l_next_free_rec); 2046 rec = &left_el->l_recs[next_free - 1]; 2047 2048 if (insert_cpos > le32_to_cpu(rec->e_cpos)) 2049 return 1; 2050 return 0; 2051 } 2052 2053 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) 2054 { 2055 int next_free = le16_to_cpu(el->l_next_free_rec); 2056 unsigned int range; 2057 struct ocfs2_extent_rec *rec; 2058 2059 if (next_free == 0) 2060 return 0; 2061 2062 rec = &el->l_recs[0]; 2063 if (ocfs2_is_empty_extent(rec)) { 2064 /* Empty list. */ 2065 if (next_free == 1) 2066 return 0; 2067 rec = &el->l_recs[1]; 2068 } 2069 2070 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 2071 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 2072 return 1; 2073 return 0; 2074 } 2075 2076 /* 2077 * Rotate all the records in a btree right one record, starting at insert_cpos. 2078 * 2079 * The path to the rightmost leaf should be passed in. 2080 * 2081 * The array is assumed to be large enough to hold an entire path (tree depth). 2082 * 2083 * Upon succesful return from this function: 2084 * 2085 * - The 'right_path' array will contain a path to the leaf block 2086 * whose range contains e_cpos. 2087 * - That leaf block will have a single empty extent in list index 0. 2088 * - In the case that the rotation requires a post-insert update, 2089 * *ret_left_path will contain a valid path which can be passed to 2090 * ocfs2_insert_path(). 2091 */ 2092 static int ocfs2_rotate_tree_right(struct inode *inode, 2093 handle_t *handle, 2094 enum ocfs2_split_type split, 2095 u32 insert_cpos, 2096 struct ocfs2_path *right_path, 2097 struct ocfs2_path **ret_left_path) 2098 { 2099 int ret, start, orig_credits = handle->h_buffer_credits; 2100 u32 cpos; 2101 struct ocfs2_path *left_path = NULL; 2102 2103 *ret_left_path = NULL; 2104 2105 left_path = ocfs2_new_path(path_root_bh(right_path), 2106 path_root_el(right_path)); 2107 if (!left_path) { 2108 ret = -ENOMEM; 2109 mlog_errno(ret); 2110 goto out; 2111 } 2112 2113 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); 2114 if (ret) { 2115 mlog_errno(ret); 2116 goto out; 2117 } 2118 2119 mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); 2120 2121 /* 2122 * What we want to do here is: 2123 * 2124 * 1) Start with the rightmost path. 2125 * 2126 * 2) Determine a path to the leaf block directly to the left 2127 * of that leaf. 2128 * 2129 * 3) Determine the 'subtree root' - the lowest level tree node 2130 * which contains a path to both leaves. 2131 * 2132 * 4) Rotate the subtree. 2133 * 2134 * 5) Find the next subtree by considering the left path to be 2135 * the new right path. 2136 * 2137 * The check at the top of this while loop also accepts 2138 * insert_cpos == cpos because cpos is only a _theoretical_ 2139 * value to get us the left path - insert_cpos might very well 2140 * be filling that hole. 2141 * 2142 * Stop at a cpos of '0' because we either started at the 2143 * leftmost branch (i.e., a tree with one branch and a 2144 * rotation inside of it), or we've gone as far as we can in 2145 * rotating subtrees. 2146 */ 2147 while (cpos && insert_cpos <= cpos) { 2148 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", 2149 insert_cpos, cpos); 2150 2151 ret = ocfs2_find_path(inode, left_path, cpos); 2152 if (ret) { 2153 mlog_errno(ret); 2154 goto out; 2155 } 2156 2157 mlog_bug_on_msg(path_leaf_bh(left_path) == 2158 path_leaf_bh(right_path), 2159 "Inode %lu: error during insert of %u " 2160 "(left path cpos %u) results in two identical " 2161 "paths ending at %llu\n", 2162 inode->i_ino, insert_cpos, cpos, 2163 (unsigned long long) 2164 path_leaf_bh(left_path)->b_blocknr); 2165 2166 if (split == SPLIT_NONE && 2167 ocfs2_rotate_requires_path_adjustment(left_path, 2168 insert_cpos)) { 2169 2170 /* 2171 * We've rotated the tree as much as we 2172 * should. The rest is up to 2173 * ocfs2_insert_path() to complete, after the 2174 * record insertion. We indicate this 2175 * situation by returning the left path. 2176 * 2177 * The reason we don't adjust the records here 2178 * before the record insert is that an error 2179 * later might break the rule where a parent 2180 * record e_cpos will reflect the actual 2181 * e_cpos of the 1st nonempty record of the 2182 * child list. 2183 */ 2184 *ret_left_path = left_path; 2185 goto out_ret_path; 2186 } 2187 2188 start = ocfs2_find_subtree_root(inode, left_path, right_path); 2189 2190 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2191 start, 2192 (unsigned long long) right_path->p_node[start].bh->b_blocknr, 2193 right_path->p_tree_depth); 2194 2195 ret = ocfs2_extend_rotate_transaction(handle, start, 2196 orig_credits, right_path); 2197 if (ret) { 2198 mlog_errno(ret); 2199 goto out; 2200 } 2201 2202 ret = ocfs2_rotate_subtree_right(inode, handle, left_path, 2203 right_path, start); 2204 if (ret) { 2205 mlog_errno(ret); 2206 goto out; 2207 } 2208 2209 if (split != SPLIT_NONE && 2210 ocfs2_leftmost_rec_contains(path_leaf_el(right_path), 2211 insert_cpos)) { 2212 /* 2213 * A rotate moves the rightmost left leaf 2214 * record over to the leftmost right leaf 2215 * slot. If we're doing an extent split 2216 * instead of a real insert, then we have to 2217 * check that the extent to be split wasn't 2218 * just moved over. If it was, then we can 2219 * exit here, passing left_path back - 2220 * ocfs2_split_extent() is smart enough to 2221 * search both leaves. 2222 */ 2223 *ret_left_path = left_path; 2224 goto out_ret_path; 2225 } 2226 2227 /* 2228 * There is no need to re-read the next right path 2229 * as we know that it'll be our current left 2230 * path. Optimize by copying values instead. 2231 */ 2232 ocfs2_mv_path(right_path, left_path); 2233 2234 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 2235 &cpos); 2236 if (ret) { 2237 mlog_errno(ret); 2238 goto out; 2239 } 2240 } 2241 2242 out: 2243 ocfs2_free_path(left_path); 2244 2245 out_ret_path: 2246 return ret; 2247 } 2248 2249 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, 2250 struct ocfs2_path *path) 2251 { 2252 int i, idx; 2253 struct ocfs2_extent_rec *rec; 2254 struct ocfs2_extent_list *el; 2255 struct ocfs2_extent_block *eb; 2256 u32 range; 2257 2258 /* Path should always be rightmost. */ 2259 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 2260 BUG_ON(eb->h_next_leaf_blk != 0ULL); 2261 2262 el = &eb->h_list; 2263 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 2264 idx = le16_to_cpu(el->l_next_free_rec) - 1; 2265 rec = &el->l_recs[idx]; 2266 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 2267 2268 for (i = 0; i < path->p_tree_depth; i++) { 2269 el = path->p_node[i].el; 2270 idx = le16_to_cpu(el->l_next_free_rec) - 1; 2271 rec = &el->l_recs[idx]; 2272 2273 rec->e_int_clusters = cpu_to_le32(range); 2274 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); 2275 2276 ocfs2_journal_dirty(handle, path->p_node[i].bh); 2277 } 2278 } 2279 2280 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, 2281 struct ocfs2_cached_dealloc_ctxt *dealloc, 2282 struct ocfs2_path *path, int unlink_start) 2283 { 2284 int ret, i; 2285 struct ocfs2_extent_block *eb; 2286 struct ocfs2_extent_list *el; 2287 struct buffer_head *bh; 2288 2289 for(i = unlink_start; i < path_num_items(path); i++) { 2290 bh = path->p_node[i].bh; 2291 2292 eb = (struct ocfs2_extent_block *)bh->b_data; 2293 /* 2294 * Not all nodes might have had their final count 2295 * decremented by the caller - handle this here. 2296 */ 2297 el = &eb->h_list; 2298 if (le16_to_cpu(el->l_next_free_rec) > 1) { 2299 mlog(ML_ERROR, 2300 "Inode %llu, attempted to remove extent block " 2301 "%llu with %u records\n", 2302 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2303 (unsigned long long)le64_to_cpu(eb->h_blkno), 2304 le16_to_cpu(el->l_next_free_rec)); 2305 2306 ocfs2_journal_dirty(handle, bh); 2307 ocfs2_remove_from_cache(inode, bh); 2308 continue; 2309 } 2310 2311 el->l_next_free_rec = 0; 2312 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2313 2314 ocfs2_journal_dirty(handle, bh); 2315 2316 ret = ocfs2_cache_extent_block_free(dealloc, eb); 2317 if (ret) 2318 mlog_errno(ret); 2319 2320 ocfs2_remove_from_cache(inode, bh); 2321 } 2322 } 2323 2324 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, 2325 struct ocfs2_path *left_path, 2326 struct ocfs2_path *right_path, 2327 int subtree_index, 2328 struct ocfs2_cached_dealloc_ctxt *dealloc) 2329 { 2330 int i; 2331 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2332 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; 2333 struct ocfs2_extent_list *el; 2334 struct ocfs2_extent_block *eb; 2335 2336 el = path_leaf_el(left_path); 2337 2338 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; 2339 2340 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) 2341 if (root_el->l_recs[i].e_blkno == eb->h_blkno) 2342 break; 2343 2344 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec)); 2345 2346 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 2347 le16_add_cpu(&root_el->l_next_free_rec, -1); 2348 2349 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2350 eb->h_next_leaf_blk = 0; 2351 2352 ocfs2_journal_dirty(handle, root_bh); 2353 ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2354 2355 ocfs2_unlink_path(inode, handle, dealloc, right_path, 2356 subtree_index + 1); 2357 } 2358 2359 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, 2360 struct ocfs2_path *left_path, 2361 struct ocfs2_path *right_path, 2362 int subtree_index, 2363 struct ocfs2_cached_dealloc_ctxt *dealloc, 2364 int *deleted, 2365 struct ocfs2_extent_tree *et) 2366 { 2367 int ret, i, del_right_subtree = 0, right_has_empty = 0; 2368 struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); 2369 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; 2370 struct ocfs2_extent_block *eb; 2371 2372 *deleted = 0; 2373 2374 right_leaf_el = path_leaf_el(right_path); 2375 left_leaf_el = path_leaf_el(left_path); 2376 root_bh = left_path->p_node[subtree_index].bh; 2377 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 2378 2379 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0])) 2380 return 0; 2381 2382 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data; 2383 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) { 2384 /* 2385 * It's legal for us to proceed if the right leaf is 2386 * the rightmost one and it has an empty extent. There 2387 * are two cases to handle - whether the leaf will be 2388 * empty after removal or not. If the leaf isn't empty 2389 * then just remove the empty extent up front. The 2390 * next block will handle empty leaves by flagging 2391 * them for unlink. 2392 * 2393 * Non rightmost leaves will throw -EAGAIN and the 2394 * caller can manually move the subtree and retry. 2395 */ 2396 2397 if (eb->h_next_leaf_blk != 0ULL) 2398 return -EAGAIN; 2399 2400 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { 2401 ret = ocfs2_journal_access(handle, inode, 2402 path_leaf_bh(right_path), 2403 OCFS2_JOURNAL_ACCESS_WRITE); 2404 if (ret) { 2405 mlog_errno(ret); 2406 goto out; 2407 } 2408 2409 ocfs2_remove_empty_extent(right_leaf_el); 2410 } else 2411 right_has_empty = 1; 2412 } 2413 2414 if (eb->h_next_leaf_blk == 0ULL && 2415 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) { 2416 /* 2417 * We have to update i_last_eb_blk during the meta 2418 * data delete. 2419 */ 2420 ret = ocfs2_journal_access(handle, inode, et_root_bh, 2421 OCFS2_JOURNAL_ACCESS_WRITE); 2422 if (ret) { 2423 mlog_errno(ret); 2424 goto out; 2425 } 2426 2427 del_right_subtree = 1; 2428 } 2429 2430 /* 2431 * Getting here with an empty extent in the right path implies 2432 * that it's the rightmost path and will be deleted. 2433 */ 2434 BUG_ON(right_has_empty && !del_right_subtree); 2435 2436 ret = ocfs2_journal_access(handle, inode, root_bh, 2437 OCFS2_JOURNAL_ACCESS_WRITE); 2438 if (ret) { 2439 mlog_errno(ret); 2440 goto out; 2441 } 2442 2443 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2444 ret = ocfs2_journal_access(handle, inode, 2445 right_path->p_node[i].bh, 2446 OCFS2_JOURNAL_ACCESS_WRITE); 2447 if (ret) { 2448 mlog_errno(ret); 2449 goto out; 2450 } 2451 2452 ret = ocfs2_journal_access(handle, inode, 2453 left_path->p_node[i].bh, 2454 OCFS2_JOURNAL_ACCESS_WRITE); 2455 if (ret) { 2456 mlog_errno(ret); 2457 goto out; 2458 } 2459 } 2460 2461 if (!right_has_empty) { 2462 /* 2463 * Only do this if we're moving a real 2464 * record. Otherwise, the action is delayed until 2465 * after removal of the right path in which case we 2466 * can do a simple shift to remove the empty extent. 2467 */ 2468 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]); 2469 memset(&right_leaf_el->l_recs[0], 0, 2470 sizeof(struct ocfs2_extent_rec)); 2471 } 2472 if (eb->h_next_leaf_blk == 0ULL) { 2473 /* 2474 * Move recs over to get rid of empty extent, decrease 2475 * next_free. This is allowed to remove the last 2476 * extent in our leaf (setting l_next_free_rec to 2477 * zero) - the delete code below won't care. 2478 */ 2479 ocfs2_remove_empty_extent(right_leaf_el); 2480 } 2481 2482 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2483 if (ret) 2484 mlog_errno(ret); 2485 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 2486 if (ret) 2487 mlog_errno(ret); 2488 2489 if (del_right_subtree) { 2490 ocfs2_unlink_subtree(inode, handle, left_path, right_path, 2491 subtree_index, dealloc); 2492 ocfs2_update_edge_lengths(inode, handle, left_path); 2493 2494 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2495 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 2496 2497 /* 2498 * Removal of the extent in the left leaf was skipped 2499 * above so we could delete the right path 2500 * 1st. 2501 */ 2502 if (right_has_empty) 2503 ocfs2_remove_empty_extent(left_leaf_el); 2504 2505 ret = ocfs2_journal_dirty(handle, et_root_bh); 2506 if (ret) 2507 mlog_errno(ret); 2508 2509 *deleted = 1; 2510 } else 2511 ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 2512 subtree_index); 2513 2514 out: 2515 return ret; 2516 } 2517 2518 /* 2519 * Given a full path, determine what cpos value would return us a path 2520 * containing the leaf immediately to the right of the current one. 2521 * 2522 * Will return zero if the path passed in is already the rightmost path. 2523 * 2524 * This looks similar, but is subtly different to 2525 * ocfs2_find_cpos_for_left_leaf(). 2526 */ 2527 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 2528 struct ocfs2_path *path, u32 *cpos) 2529 { 2530 int i, j, ret = 0; 2531 u64 blkno; 2532 struct ocfs2_extent_list *el; 2533 2534 *cpos = 0; 2535 2536 if (path->p_tree_depth == 0) 2537 return 0; 2538 2539 blkno = path_leaf_bh(path)->b_blocknr; 2540 2541 /* Start at the tree node just above the leaf and work our way up. */ 2542 i = path->p_tree_depth - 1; 2543 while (i >= 0) { 2544 int next_free; 2545 2546 el = path->p_node[i].el; 2547 2548 /* 2549 * Find the extent record just after the one in our 2550 * path. 2551 */ 2552 next_free = le16_to_cpu(el->l_next_free_rec); 2553 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { 2554 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { 2555 if (j == (next_free - 1)) { 2556 if (i == 0) { 2557 /* 2558 * We've determined that the 2559 * path specified is already 2560 * the rightmost one - return a 2561 * cpos of zero. 2562 */ 2563 goto out; 2564 } 2565 /* 2566 * The rightmost record points to our 2567 * leaf - we need to travel up the 2568 * tree one level. 2569 */ 2570 goto next_node; 2571 } 2572 2573 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos); 2574 goto out; 2575 } 2576 } 2577 2578 /* 2579 * If we got here, we never found a valid node where 2580 * the tree indicated one should be. 2581 */ 2582 ocfs2_error(sb, 2583 "Invalid extent tree at extent block %llu\n", 2584 (unsigned long long)blkno); 2585 ret = -EROFS; 2586 goto out; 2587 2588 next_node: 2589 blkno = path->p_node[i].bh->b_blocknr; 2590 i--; 2591 } 2592 2593 out: 2594 return ret; 2595 } 2596 2597 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, 2598 handle_t *handle, 2599 struct buffer_head *bh, 2600 struct ocfs2_extent_list *el) 2601 { 2602 int ret; 2603 2604 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2605 return 0; 2606 2607 ret = ocfs2_journal_access(handle, inode, bh, 2608 OCFS2_JOURNAL_ACCESS_WRITE); 2609 if (ret) { 2610 mlog_errno(ret); 2611 goto out; 2612 } 2613 2614 ocfs2_remove_empty_extent(el); 2615 2616 ret = ocfs2_journal_dirty(handle, bh); 2617 if (ret) 2618 mlog_errno(ret); 2619 2620 out: 2621 return ret; 2622 } 2623 2624 static int __ocfs2_rotate_tree_left(struct inode *inode, 2625 handle_t *handle, int orig_credits, 2626 struct ocfs2_path *path, 2627 struct ocfs2_cached_dealloc_ctxt *dealloc, 2628 struct ocfs2_path **empty_extent_path, 2629 struct ocfs2_extent_tree *et) 2630 { 2631 int ret, subtree_root, deleted; 2632 u32 right_cpos; 2633 struct ocfs2_path *left_path = NULL; 2634 struct ocfs2_path *right_path = NULL; 2635 2636 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); 2637 2638 *empty_extent_path = NULL; 2639 2640 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path, 2641 &right_cpos); 2642 if (ret) { 2643 mlog_errno(ret); 2644 goto out; 2645 } 2646 2647 left_path = ocfs2_new_path(path_root_bh(path), 2648 path_root_el(path)); 2649 if (!left_path) { 2650 ret = -ENOMEM; 2651 mlog_errno(ret); 2652 goto out; 2653 } 2654 2655 ocfs2_cp_path(left_path, path); 2656 2657 right_path = ocfs2_new_path(path_root_bh(path), 2658 path_root_el(path)); 2659 if (!right_path) { 2660 ret = -ENOMEM; 2661 mlog_errno(ret); 2662 goto out; 2663 } 2664 2665 while (right_cpos) { 2666 ret = ocfs2_find_path(inode, right_path, right_cpos); 2667 if (ret) { 2668 mlog_errno(ret); 2669 goto out; 2670 } 2671 2672 subtree_root = ocfs2_find_subtree_root(inode, left_path, 2673 right_path); 2674 2675 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2676 subtree_root, 2677 (unsigned long long) 2678 right_path->p_node[subtree_root].bh->b_blocknr, 2679 right_path->p_tree_depth); 2680 2681 ret = ocfs2_extend_rotate_transaction(handle, subtree_root, 2682 orig_credits, left_path); 2683 if (ret) { 2684 mlog_errno(ret); 2685 goto out; 2686 } 2687 2688 /* 2689 * Caller might still want to make changes to the 2690 * tree root, so re-add it to the journal here. 2691 */ 2692 ret = ocfs2_journal_access(handle, inode, 2693 path_root_bh(left_path), 2694 OCFS2_JOURNAL_ACCESS_WRITE); 2695 if (ret) { 2696 mlog_errno(ret); 2697 goto out; 2698 } 2699 2700 ret = ocfs2_rotate_subtree_left(inode, handle, left_path, 2701 right_path, subtree_root, 2702 dealloc, &deleted, et); 2703 if (ret == -EAGAIN) { 2704 /* 2705 * The rotation has to temporarily stop due to 2706 * the right subtree having an empty 2707 * extent. Pass it back to the caller for a 2708 * fixup. 2709 */ 2710 *empty_extent_path = right_path; 2711 right_path = NULL; 2712 goto out; 2713 } 2714 if (ret) { 2715 mlog_errno(ret); 2716 goto out; 2717 } 2718 2719 /* 2720 * The subtree rotate might have removed records on 2721 * the rightmost edge. If so, then rotation is 2722 * complete. 2723 */ 2724 if (deleted) 2725 break; 2726 2727 ocfs2_mv_path(left_path, right_path); 2728 2729 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, 2730 &right_cpos); 2731 if (ret) { 2732 mlog_errno(ret); 2733 goto out; 2734 } 2735 } 2736 2737 out: 2738 ocfs2_free_path(right_path); 2739 ocfs2_free_path(left_path); 2740 2741 return ret; 2742 } 2743 2744 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, 2745 struct ocfs2_path *path, 2746 struct ocfs2_cached_dealloc_ctxt *dealloc, 2747 struct ocfs2_extent_tree *et) 2748 { 2749 int ret, subtree_index; 2750 u32 cpos; 2751 struct ocfs2_path *left_path = NULL; 2752 struct ocfs2_extent_block *eb; 2753 struct ocfs2_extent_list *el; 2754 2755 2756 ret = ocfs2_et_sanity_check(inode, et); 2757 if (ret) 2758 goto out; 2759 /* 2760 * There's two ways we handle this depending on 2761 * whether path is the only existing one. 2762 */ 2763 ret = ocfs2_extend_rotate_transaction(handle, 0, 2764 handle->h_buffer_credits, 2765 path); 2766 if (ret) { 2767 mlog_errno(ret); 2768 goto out; 2769 } 2770 2771 ret = ocfs2_journal_access_path(inode, handle, path); 2772 if (ret) { 2773 mlog_errno(ret); 2774 goto out; 2775 } 2776 2777 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); 2778 if (ret) { 2779 mlog_errno(ret); 2780 goto out; 2781 } 2782 2783 if (cpos) { 2784 /* 2785 * We have a path to the left of this one - it needs 2786 * an update too. 2787 */ 2788 left_path = ocfs2_new_path(path_root_bh(path), 2789 path_root_el(path)); 2790 if (!left_path) { 2791 ret = -ENOMEM; 2792 mlog_errno(ret); 2793 goto out; 2794 } 2795 2796 ret = ocfs2_find_path(inode, left_path, cpos); 2797 if (ret) { 2798 mlog_errno(ret); 2799 goto out; 2800 } 2801 2802 ret = ocfs2_journal_access_path(inode, handle, left_path); 2803 if (ret) { 2804 mlog_errno(ret); 2805 goto out; 2806 } 2807 2808 subtree_index = ocfs2_find_subtree_root(inode, left_path, path); 2809 2810 ocfs2_unlink_subtree(inode, handle, left_path, path, 2811 subtree_index, dealloc); 2812 ocfs2_update_edge_lengths(inode, handle, left_path); 2813 2814 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2815 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 2816 } else { 2817 /* 2818 * 'path' is also the leftmost path which 2819 * means it must be the only one. This gets 2820 * handled differently because we want to 2821 * revert the inode back to having extents 2822 * in-line. 2823 */ 2824 ocfs2_unlink_path(inode, handle, dealloc, path, 1); 2825 2826 el = et->et_root_el; 2827 el->l_tree_depth = 0; 2828 el->l_next_free_rec = 0; 2829 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2830 2831 ocfs2_et_set_last_eb_blk(et, 0); 2832 } 2833 2834 ocfs2_journal_dirty(handle, path_root_bh(path)); 2835 2836 out: 2837 ocfs2_free_path(left_path); 2838 return ret; 2839 } 2840 2841 /* 2842 * Left rotation of btree records. 2843 * 2844 * In many ways, this is (unsurprisingly) the opposite of right 2845 * rotation. We start at some non-rightmost path containing an empty 2846 * extent in the leaf block. The code works its way to the rightmost 2847 * path by rotating records to the left in every subtree. 2848 * 2849 * This is used by any code which reduces the number of extent records 2850 * in a leaf. After removal, an empty record should be placed in the 2851 * leftmost list position. 2852 * 2853 * This won't handle a length update of the rightmost path records if 2854 * the rightmost tree leaf record is removed so the caller is 2855 * responsible for detecting and correcting that. 2856 */ 2857 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, 2858 struct ocfs2_path *path, 2859 struct ocfs2_cached_dealloc_ctxt *dealloc, 2860 struct ocfs2_extent_tree *et) 2861 { 2862 int ret, orig_credits = handle->h_buffer_credits; 2863 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; 2864 struct ocfs2_extent_block *eb; 2865 struct ocfs2_extent_list *el; 2866 2867 el = path_leaf_el(path); 2868 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2869 return 0; 2870 2871 if (path->p_tree_depth == 0) { 2872 rightmost_no_delete: 2873 /* 2874 * Inline extents. This is trivially handled, so do 2875 * it up front. 2876 */ 2877 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 2878 path_leaf_bh(path), 2879 path_leaf_el(path)); 2880 if (ret) 2881 mlog_errno(ret); 2882 goto out; 2883 } 2884 2885 /* 2886 * Handle rightmost branch now. There's several cases: 2887 * 1) simple rotation leaving records in there. That's trivial. 2888 * 2) rotation requiring a branch delete - there's no more 2889 * records left. Two cases of this: 2890 * a) There are branches to the left. 2891 * b) This is also the leftmost (the only) branch. 2892 * 2893 * 1) is handled via ocfs2_rotate_rightmost_leaf_left() 2894 * 2a) we need the left branch so that we can update it with the unlink 2895 * 2b) we need to bring the inode back to inline extents. 2896 */ 2897 2898 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 2899 el = &eb->h_list; 2900 if (eb->h_next_leaf_blk == 0) { 2901 /* 2902 * This gets a bit tricky if we're going to delete the 2903 * rightmost path. Get the other cases out of the way 2904 * 1st. 2905 */ 2906 if (le16_to_cpu(el->l_next_free_rec) > 1) 2907 goto rightmost_no_delete; 2908 2909 if (le16_to_cpu(el->l_next_free_rec) == 0) { 2910 ret = -EIO; 2911 ocfs2_error(inode->i_sb, 2912 "Inode %llu has empty extent block at %llu", 2913 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2914 (unsigned long long)le64_to_cpu(eb->h_blkno)); 2915 goto out; 2916 } 2917 2918 /* 2919 * XXX: The caller can not trust "path" any more after 2920 * this as it will have been deleted. What do we do? 2921 * 2922 * In theory the rotate-for-merge code will never get 2923 * here because it'll always ask for a rotate in a 2924 * nonempty list. 2925 */ 2926 2927 ret = ocfs2_remove_rightmost_path(inode, handle, path, 2928 dealloc, et); 2929 if (ret) 2930 mlog_errno(ret); 2931 goto out; 2932 } 2933 2934 /* 2935 * Now we can loop, remembering the path we get from -EAGAIN 2936 * and restarting from there. 2937 */ 2938 try_rotate: 2939 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, 2940 dealloc, &restart_path, et); 2941 if (ret && ret != -EAGAIN) { 2942 mlog_errno(ret); 2943 goto out; 2944 } 2945 2946 while (ret == -EAGAIN) { 2947 tmp_path = restart_path; 2948 restart_path = NULL; 2949 2950 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, 2951 tmp_path, dealloc, 2952 &restart_path, et); 2953 if (ret && ret != -EAGAIN) { 2954 mlog_errno(ret); 2955 goto out; 2956 } 2957 2958 ocfs2_free_path(tmp_path); 2959 tmp_path = NULL; 2960 2961 if (ret == 0) 2962 goto try_rotate; 2963 } 2964 2965 out: 2966 ocfs2_free_path(tmp_path); 2967 ocfs2_free_path(restart_path); 2968 return ret; 2969 } 2970 2971 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, 2972 int index) 2973 { 2974 struct ocfs2_extent_rec *rec = &el->l_recs[index]; 2975 unsigned int size; 2976 2977 if (rec->e_leaf_clusters == 0) { 2978 /* 2979 * We consumed all of the merged-from record. An empty 2980 * extent cannot exist anywhere but the 1st array 2981 * position, so move things over if the merged-from 2982 * record doesn't occupy that position. 2983 * 2984 * This creates a new empty extent so the caller 2985 * should be smart enough to have removed any existing 2986 * ones. 2987 */ 2988 if (index > 0) { 2989 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); 2990 size = index * sizeof(struct ocfs2_extent_rec); 2991 memmove(&el->l_recs[1], &el->l_recs[0], size); 2992 } 2993 2994 /* 2995 * Always memset - the caller doesn't check whether it 2996 * created an empty extent, so there could be junk in 2997 * the other fields. 2998 */ 2999 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 3000 } 3001 } 3002 3003 static int ocfs2_get_right_path(struct inode *inode, 3004 struct ocfs2_path *left_path, 3005 struct ocfs2_path **ret_right_path) 3006 { 3007 int ret; 3008 u32 right_cpos; 3009 struct ocfs2_path *right_path = NULL; 3010 struct ocfs2_extent_list *left_el; 3011 3012 *ret_right_path = NULL; 3013 3014 /* This function shouldn't be called for non-trees. */ 3015 BUG_ON(left_path->p_tree_depth == 0); 3016 3017 left_el = path_leaf_el(left_path); 3018 BUG_ON(left_el->l_next_free_rec != left_el->l_count); 3019 3020 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, 3021 &right_cpos); 3022 if (ret) { 3023 mlog_errno(ret); 3024 goto out; 3025 } 3026 3027 /* This function shouldn't be called for the rightmost leaf. */ 3028 BUG_ON(right_cpos == 0); 3029 3030 right_path = ocfs2_new_path(path_root_bh(left_path), 3031 path_root_el(left_path)); 3032 if (!right_path) { 3033 ret = -ENOMEM; 3034 mlog_errno(ret); 3035 goto out; 3036 } 3037 3038 ret = ocfs2_find_path(inode, right_path, right_cpos); 3039 if (ret) { 3040 mlog_errno(ret); 3041 goto out; 3042 } 3043 3044 *ret_right_path = right_path; 3045 out: 3046 if (ret) 3047 ocfs2_free_path(right_path); 3048 return ret; 3049 } 3050 3051 /* 3052 * Remove split_rec clusters from the record at index and merge them 3053 * onto the beginning of the record "next" to it. 3054 * For index < l_count - 1, the next means the extent rec at index + 1. 3055 * For index == l_count - 1, the "next" means the 1st extent rec of the 3056 * next extent block. 3057 */ 3058 static int ocfs2_merge_rec_right(struct inode *inode, 3059 struct ocfs2_path *left_path, 3060 handle_t *handle, 3061 struct ocfs2_extent_rec *split_rec, 3062 int index) 3063 { 3064 int ret, next_free, i; 3065 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 3066 struct ocfs2_extent_rec *left_rec; 3067 struct ocfs2_extent_rec *right_rec; 3068 struct ocfs2_extent_list *right_el; 3069 struct ocfs2_path *right_path = NULL; 3070 int subtree_index = 0; 3071 struct ocfs2_extent_list *el = path_leaf_el(left_path); 3072 struct buffer_head *bh = path_leaf_bh(left_path); 3073 struct buffer_head *root_bh = NULL; 3074 3075 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); 3076 left_rec = &el->l_recs[index]; 3077 3078 if (index == le16_to_cpu(el->l_next_free_rec) - 1 && 3079 le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { 3080 /* we meet with a cross extent block merge. */ 3081 ret = ocfs2_get_right_path(inode, left_path, &right_path); 3082 if (ret) { 3083 mlog_errno(ret); 3084 goto out; 3085 } 3086 3087 right_el = path_leaf_el(right_path); 3088 next_free = le16_to_cpu(right_el->l_next_free_rec); 3089 BUG_ON(next_free <= 0); 3090 right_rec = &right_el->l_recs[0]; 3091 if (ocfs2_is_empty_extent(right_rec)) { 3092 BUG_ON(next_free <= 1); 3093 right_rec = &right_el->l_recs[1]; 3094 } 3095 3096 BUG_ON(le32_to_cpu(left_rec->e_cpos) + 3097 le16_to_cpu(left_rec->e_leaf_clusters) != 3098 le32_to_cpu(right_rec->e_cpos)); 3099 3100 subtree_index = ocfs2_find_subtree_root(inode, 3101 left_path, right_path); 3102 3103 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 3104 handle->h_buffer_credits, 3105 right_path); 3106 if (ret) { 3107 mlog_errno(ret); 3108 goto out; 3109 } 3110 3111 root_bh = left_path->p_node[subtree_index].bh; 3112 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3113 3114 ret = ocfs2_journal_access(handle, inode, root_bh, 3115 OCFS2_JOURNAL_ACCESS_WRITE); 3116 if (ret) { 3117 mlog_errno(ret); 3118 goto out; 3119 } 3120 3121 for (i = subtree_index + 1; 3122 i < path_num_items(right_path); i++) { 3123 ret = ocfs2_journal_access(handle, inode, 3124 right_path->p_node[i].bh, 3125 OCFS2_JOURNAL_ACCESS_WRITE); 3126 if (ret) { 3127 mlog_errno(ret); 3128 goto out; 3129 } 3130 3131 ret = ocfs2_journal_access(handle, inode, 3132 left_path->p_node[i].bh, 3133 OCFS2_JOURNAL_ACCESS_WRITE); 3134 if (ret) { 3135 mlog_errno(ret); 3136 goto out; 3137 } 3138 } 3139 3140 } else { 3141 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1); 3142 right_rec = &el->l_recs[index + 1]; 3143 } 3144 3145 ret = ocfs2_journal_access(handle, inode, bh, 3146 OCFS2_JOURNAL_ACCESS_WRITE); 3147 if (ret) { 3148 mlog_errno(ret); 3149 goto out; 3150 } 3151 3152 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters); 3153 3154 le32_add_cpu(&right_rec->e_cpos, -split_clusters); 3155 le64_add_cpu(&right_rec->e_blkno, 3156 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); 3157 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); 3158 3159 ocfs2_cleanup_merge(el, index); 3160 3161 ret = ocfs2_journal_dirty(handle, bh); 3162 if (ret) 3163 mlog_errno(ret); 3164 3165 if (right_path) { 3166 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3167 if (ret) 3168 mlog_errno(ret); 3169 3170 ocfs2_complete_edge_insert(inode, handle, left_path, 3171 right_path, subtree_index); 3172 } 3173 out: 3174 if (right_path) 3175 ocfs2_free_path(right_path); 3176 return ret; 3177 } 3178 3179 static int ocfs2_get_left_path(struct inode *inode, 3180 struct ocfs2_path *right_path, 3181 struct ocfs2_path **ret_left_path) 3182 { 3183 int ret; 3184 u32 left_cpos; 3185 struct ocfs2_path *left_path = NULL; 3186 3187 *ret_left_path = NULL; 3188 3189 /* This function shouldn't be called for non-trees. */ 3190 BUG_ON(right_path->p_tree_depth == 0); 3191 3192 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 3193 right_path, &left_cpos); 3194 if (ret) { 3195 mlog_errno(ret); 3196 goto out; 3197 } 3198 3199 /* This function shouldn't be called for the leftmost leaf. */ 3200 BUG_ON(left_cpos == 0); 3201 3202 left_path = ocfs2_new_path(path_root_bh(right_path), 3203 path_root_el(right_path)); 3204 if (!left_path) { 3205 ret = -ENOMEM; 3206 mlog_errno(ret); 3207 goto out; 3208 } 3209 3210 ret = ocfs2_find_path(inode, left_path, left_cpos); 3211 if (ret) { 3212 mlog_errno(ret); 3213 goto out; 3214 } 3215 3216 *ret_left_path = left_path; 3217 out: 3218 if (ret) 3219 ocfs2_free_path(left_path); 3220 return ret; 3221 } 3222 3223 /* 3224 * Remove split_rec clusters from the record at index and merge them 3225 * onto the tail of the record "before" it. 3226 * For index > 0, the "before" means the extent rec at index - 1. 3227 * 3228 * For index == 0, the "before" means the last record of the previous 3229 * extent block. And there is also a situation that we may need to 3230 * remove the rightmost leaf extent block in the right_path and change 3231 * the right path to indicate the new rightmost path. 3232 */ 3233 static int ocfs2_merge_rec_left(struct inode *inode, 3234 struct ocfs2_path *right_path, 3235 handle_t *handle, 3236 struct ocfs2_extent_rec *split_rec, 3237 struct ocfs2_cached_dealloc_ctxt *dealloc, 3238 struct ocfs2_extent_tree *et, 3239 int index) 3240 { 3241 int ret, i, subtree_index = 0, has_empty_extent = 0; 3242 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 3243 struct ocfs2_extent_rec *left_rec; 3244 struct ocfs2_extent_rec *right_rec; 3245 struct ocfs2_extent_list *el = path_leaf_el(right_path); 3246 struct buffer_head *bh = path_leaf_bh(right_path); 3247 struct buffer_head *root_bh = NULL; 3248 struct ocfs2_path *left_path = NULL; 3249 struct ocfs2_extent_list *left_el; 3250 3251 BUG_ON(index < 0); 3252 3253 right_rec = &el->l_recs[index]; 3254 if (index == 0) { 3255 /* we meet with a cross extent block merge. */ 3256 ret = ocfs2_get_left_path(inode, right_path, &left_path); 3257 if (ret) { 3258 mlog_errno(ret); 3259 goto out; 3260 } 3261 3262 left_el = path_leaf_el(left_path); 3263 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) != 3264 le16_to_cpu(left_el->l_count)); 3265 3266 left_rec = &left_el->l_recs[ 3267 le16_to_cpu(left_el->l_next_free_rec) - 1]; 3268 BUG_ON(le32_to_cpu(left_rec->e_cpos) + 3269 le16_to_cpu(left_rec->e_leaf_clusters) != 3270 le32_to_cpu(split_rec->e_cpos)); 3271 3272 subtree_index = ocfs2_find_subtree_root(inode, 3273 left_path, right_path); 3274 3275 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 3276 handle->h_buffer_credits, 3277 left_path); 3278 if (ret) { 3279 mlog_errno(ret); 3280 goto out; 3281 } 3282 3283 root_bh = left_path->p_node[subtree_index].bh; 3284 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3285 3286 ret = ocfs2_journal_access(handle, inode, root_bh, 3287 OCFS2_JOURNAL_ACCESS_WRITE); 3288 if (ret) { 3289 mlog_errno(ret); 3290 goto out; 3291 } 3292 3293 for (i = subtree_index + 1; 3294 i < path_num_items(right_path); i++) { 3295 ret = ocfs2_journal_access(handle, inode, 3296 right_path->p_node[i].bh, 3297 OCFS2_JOURNAL_ACCESS_WRITE); 3298 if (ret) { 3299 mlog_errno(ret); 3300 goto out; 3301 } 3302 3303 ret = ocfs2_journal_access(handle, inode, 3304 left_path->p_node[i].bh, 3305 OCFS2_JOURNAL_ACCESS_WRITE); 3306 if (ret) { 3307 mlog_errno(ret); 3308 goto out; 3309 } 3310 } 3311 } else { 3312 left_rec = &el->l_recs[index - 1]; 3313 if (ocfs2_is_empty_extent(&el->l_recs[0])) 3314 has_empty_extent = 1; 3315 } 3316 3317 ret = ocfs2_journal_access(handle, inode, bh, 3318 OCFS2_JOURNAL_ACCESS_WRITE); 3319 if (ret) { 3320 mlog_errno(ret); 3321 goto out; 3322 } 3323 3324 if (has_empty_extent && index == 1) { 3325 /* 3326 * The easy case - we can just plop the record right in. 3327 */ 3328 *left_rec = *split_rec; 3329 3330 has_empty_extent = 0; 3331 } else 3332 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); 3333 3334 le32_add_cpu(&right_rec->e_cpos, split_clusters); 3335 le64_add_cpu(&right_rec->e_blkno, 3336 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); 3337 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); 3338 3339 ocfs2_cleanup_merge(el, index); 3340 3341 ret = ocfs2_journal_dirty(handle, bh); 3342 if (ret) 3343 mlog_errno(ret); 3344 3345 if (left_path) { 3346 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3347 if (ret) 3348 mlog_errno(ret); 3349 3350 /* 3351 * In the situation that the right_rec is empty and the extent 3352 * block is empty also, ocfs2_complete_edge_insert can't handle 3353 * it and we need to delete the right extent block. 3354 */ 3355 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && 3356 le16_to_cpu(el->l_next_free_rec) == 1) { 3357 3358 ret = ocfs2_remove_rightmost_path(inode, handle, 3359 right_path, 3360 dealloc, et); 3361 if (ret) { 3362 mlog_errno(ret); 3363 goto out; 3364 } 3365 3366 /* Now the rightmost extent block has been deleted. 3367 * So we use the new rightmost path. 3368 */ 3369 ocfs2_mv_path(right_path, left_path); 3370 left_path = NULL; 3371 } else 3372 ocfs2_complete_edge_insert(inode, handle, left_path, 3373 right_path, subtree_index); 3374 } 3375 out: 3376 if (left_path) 3377 ocfs2_free_path(left_path); 3378 return ret; 3379 } 3380 3381 static int ocfs2_try_to_merge_extent(struct inode *inode, 3382 handle_t *handle, 3383 struct ocfs2_path *path, 3384 int split_index, 3385 struct ocfs2_extent_rec *split_rec, 3386 struct ocfs2_cached_dealloc_ctxt *dealloc, 3387 struct ocfs2_merge_ctxt *ctxt, 3388 struct ocfs2_extent_tree *et) 3389 3390 { 3391 int ret = 0; 3392 struct ocfs2_extent_list *el = path_leaf_el(path); 3393 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 3394 3395 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3396 3397 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { 3398 /* 3399 * The merge code will need to create an empty 3400 * extent to take the place of the newly 3401 * emptied slot. Remove any pre-existing empty 3402 * extents - having more than one in a leaf is 3403 * illegal. 3404 */ 3405 ret = ocfs2_rotate_tree_left(inode, handle, path, 3406 dealloc, et); 3407 if (ret) { 3408 mlog_errno(ret); 3409 goto out; 3410 } 3411 split_index--; 3412 rec = &el->l_recs[split_index]; 3413 } 3414 3415 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) { 3416 /* 3417 * Left-right contig implies this. 3418 */ 3419 BUG_ON(!ctxt->c_split_covers_rec); 3420 3421 /* 3422 * Since the leftright insert always covers the entire 3423 * extent, this call will delete the insert record 3424 * entirely, resulting in an empty extent record added to 3425 * the extent block. 3426 * 3427 * Since the adding of an empty extent shifts 3428 * everything back to the right, there's no need to 3429 * update split_index here. 3430 * 3431 * When the split_index is zero, we need to merge it to the 3432 * prevoius extent block. It is more efficient and easier 3433 * if we do merge_right first and merge_left later. 3434 */ 3435 ret = ocfs2_merge_rec_right(inode, path, 3436 handle, split_rec, 3437 split_index); 3438 if (ret) { 3439 mlog_errno(ret); 3440 goto out; 3441 } 3442 3443 /* 3444 * We can only get this from logic error above. 3445 */ 3446 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3447 3448 /* The merge left us with an empty extent, remove it. */ 3449 ret = ocfs2_rotate_tree_left(inode, handle, path, 3450 dealloc, et); 3451 if (ret) { 3452 mlog_errno(ret); 3453 goto out; 3454 } 3455 3456 rec = &el->l_recs[split_index]; 3457 3458 /* 3459 * Note that we don't pass split_rec here on purpose - 3460 * we've merged it into the rec already. 3461 */ 3462 ret = ocfs2_merge_rec_left(inode, path, 3463 handle, rec, 3464 dealloc, et, 3465 split_index); 3466 3467 if (ret) { 3468 mlog_errno(ret); 3469 goto out; 3470 } 3471 3472 ret = ocfs2_rotate_tree_left(inode, handle, path, 3473 dealloc, et); 3474 /* 3475 * Error from this last rotate is not critical, so 3476 * print but don't bubble it up. 3477 */ 3478 if (ret) 3479 mlog_errno(ret); 3480 ret = 0; 3481 } else { 3482 /* 3483 * Merge a record to the left or right. 3484 * 3485 * 'contig_type' is relative to the existing record, 3486 * so for example, if we're "right contig", it's to 3487 * the record on the left (hence the left merge). 3488 */ 3489 if (ctxt->c_contig_type == CONTIG_RIGHT) { 3490 ret = ocfs2_merge_rec_left(inode, 3491 path, 3492 handle, split_rec, 3493 dealloc, et, 3494 split_index); 3495 if (ret) { 3496 mlog_errno(ret); 3497 goto out; 3498 } 3499 } else { 3500 ret = ocfs2_merge_rec_right(inode, 3501 path, 3502 handle, split_rec, 3503 split_index); 3504 if (ret) { 3505 mlog_errno(ret); 3506 goto out; 3507 } 3508 } 3509 3510 if (ctxt->c_split_covers_rec) { 3511 /* 3512 * The merge may have left an empty extent in 3513 * our leaf. Try to rotate it away. 3514 */ 3515 ret = ocfs2_rotate_tree_left(inode, handle, path, 3516 dealloc, et); 3517 if (ret) 3518 mlog_errno(ret); 3519 ret = 0; 3520 } 3521 } 3522 3523 out: 3524 return ret; 3525 } 3526 3527 static void ocfs2_subtract_from_rec(struct super_block *sb, 3528 enum ocfs2_split_type split, 3529 struct ocfs2_extent_rec *rec, 3530 struct ocfs2_extent_rec *split_rec) 3531 { 3532 u64 len_blocks; 3533 3534 len_blocks = ocfs2_clusters_to_blocks(sb, 3535 le16_to_cpu(split_rec->e_leaf_clusters)); 3536 3537 if (split == SPLIT_LEFT) { 3538 /* 3539 * Region is on the left edge of the existing 3540 * record. 3541 */ 3542 le32_add_cpu(&rec->e_cpos, 3543 le16_to_cpu(split_rec->e_leaf_clusters)); 3544 le64_add_cpu(&rec->e_blkno, len_blocks); 3545 le16_add_cpu(&rec->e_leaf_clusters, 3546 -le16_to_cpu(split_rec->e_leaf_clusters)); 3547 } else { 3548 /* 3549 * Region is on the right edge of the existing 3550 * record. 3551 */ 3552 le16_add_cpu(&rec->e_leaf_clusters, 3553 -le16_to_cpu(split_rec->e_leaf_clusters)); 3554 } 3555 } 3556 3557 /* 3558 * Do the final bits of extent record insertion at the target leaf 3559 * list. If this leaf is part of an allocation tree, it is assumed 3560 * that the tree above has been prepared. 3561 */ 3562 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, 3563 struct ocfs2_extent_list *el, 3564 struct ocfs2_insert_type *insert, 3565 struct inode *inode) 3566 { 3567 int i = insert->ins_contig_index; 3568 unsigned int range; 3569 struct ocfs2_extent_rec *rec; 3570 3571 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 3572 3573 if (insert->ins_split != SPLIT_NONE) { 3574 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); 3575 BUG_ON(i == -1); 3576 rec = &el->l_recs[i]; 3577 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec, 3578 insert_rec); 3579 goto rotate; 3580 } 3581 3582 /* 3583 * Contiguous insert - either left or right. 3584 */ 3585 if (insert->ins_contig != CONTIG_NONE) { 3586 rec = &el->l_recs[i]; 3587 if (insert->ins_contig == CONTIG_LEFT) { 3588 rec->e_blkno = insert_rec->e_blkno; 3589 rec->e_cpos = insert_rec->e_cpos; 3590 } 3591 le16_add_cpu(&rec->e_leaf_clusters, 3592 le16_to_cpu(insert_rec->e_leaf_clusters)); 3593 return; 3594 } 3595 3596 /* 3597 * Handle insert into an empty leaf. 3598 */ 3599 if (le16_to_cpu(el->l_next_free_rec) == 0 || 3600 ((le16_to_cpu(el->l_next_free_rec) == 1) && 3601 ocfs2_is_empty_extent(&el->l_recs[0]))) { 3602 el->l_recs[0] = *insert_rec; 3603 el->l_next_free_rec = cpu_to_le16(1); 3604 return; 3605 } 3606 3607 /* 3608 * Appending insert. 3609 */ 3610 if (insert->ins_appending == APPEND_TAIL) { 3611 i = le16_to_cpu(el->l_next_free_rec) - 1; 3612 rec = &el->l_recs[i]; 3613 range = le32_to_cpu(rec->e_cpos) 3614 + le16_to_cpu(rec->e_leaf_clusters); 3615 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); 3616 3617 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= 3618 le16_to_cpu(el->l_count), 3619 "inode %lu, depth %u, count %u, next free %u, " 3620 "rec.cpos %u, rec.clusters %u, " 3621 "insert.cpos %u, insert.clusters %u\n", 3622 inode->i_ino, 3623 le16_to_cpu(el->l_tree_depth), 3624 le16_to_cpu(el->l_count), 3625 le16_to_cpu(el->l_next_free_rec), 3626 le32_to_cpu(el->l_recs[i].e_cpos), 3627 le16_to_cpu(el->l_recs[i].e_leaf_clusters), 3628 le32_to_cpu(insert_rec->e_cpos), 3629 le16_to_cpu(insert_rec->e_leaf_clusters)); 3630 i++; 3631 el->l_recs[i] = *insert_rec; 3632 le16_add_cpu(&el->l_next_free_rec, 1); 3633 return; 3634 } 3635 3636 rotate: 3637 /* 3638 * Ok, we have to rotate. 3639 * 3640 * At this point, it is safe to assume that inserting into an 3641 * empty leaf and appending to a leaf have both been handled 3642 * above. 3643 * 3644 * This leaf needs to have space, either by the empty 1st 3645 * extent record, or by virtue of an l_next_rec < l_count. 3646 */ 3647 ocfs2_rotate_leaf(el, insert_rec); 3648 } 3649 3650 static void ocfs2_adjust_rightmost_records(struct inode *inode, 3651 handle_t *handle, 3652 struct ocfs2_path *path, 3653 struct ocfs2_extent_rec *insert_rec) 3654 { 3655 int ret, i, next_free; 3656 struct buffer_head *bh; 3657 struct ocfs2_extent_list *el; 3658 struct ocfs2_extent_rec *rec; 3659 3660 /* 3661 * Update everything except the leaf block. 3662 */ 3663 for (i = 0; i < path->p_tree_depth; i++) { 3664 bh = path->p_node[i].bh; 3665 el = path->p_node[i].el; 3666 3667 next_free = le16_to_cpu(el->l_next_free_rec); 3668 if (next_free == 0) { 3669 ocfs2_error(inode->i_sb, 3670 "Dinode %llu has a bad extent list", 3671 (unsigned long long)OCFS2_I(inode)->ip_blkno); 3672 ret = -EIO; 3673 return; 3674 } 3675 3676 rec = &el->l_recs[next_free - 1]; 3677 3678 rec->e_int_clusters = insert_rec->e_cpos; 3679 le32_add_cpu(&rec->e_int_clusters, 3680 le16_to_cpu(insert_rec->e_leaf_clusters)); 3681 le32_add_cpu(&rec->e_int_clusters, 3682 -le32_to_cpu(rec->e_cpos)); 3683 3684 ret = ocfs2_journal_dirty(handle, bh); 3685 if (ret) 3686 mlog_errno(ret); 3687 3688 } 3689 } 3690 3691 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 3692 struct ocfs2_extent_rec *insert_rec, 3693 struct ocfs2_path *right_path, 3694 struct ocfs2_path **ret_left_path) 3695 { 3696 int ret, next_free; 3697 struct ocfs2_extent_list *el; 3698 struct ocfs2_path *left_path = NULL; 3699 3700 *ret_left_path = NULL; 3701 3702 /* 3703 * This shouldn't happen for non-trees. The extent rec cluster 3704 * count manipulation below only works for interior nodes. 3705 */ 3706 BUG_ON(right_path->p_tree_depth == 0); 3707 3708 /* 3709 * If our appending insert is at the leftmost edge of a leaf, 3710 * then we might need to update the rightmost records of the 3711 * neighboring path. 3712 */ 3713 el = path_leaf_el(right_path); 3714 next_free = le16_to_cpu(el->l_next_free_rec); 3715 if (next_free == 0 || 3716 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { 3717 u32 left_cpos; 3718 3719 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 3720 &left_cpos); 3721 if (ret) { 3722 mlog_errno(ret); 3723 goto out; 3724 } 3725 3726 mlog(0, "Append may need a left path update. cpos: %u, " 3727 "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), 3728 left_cpos); 3729 3730 /* 3731 * No need to worry if the append is already in the 3732 * leftmost leaf. 3733 */ 3734 if (left_cpos) { 3735 left_path = ocfs2_new_path(path_root_bh(right_path), 3736 path_root_el(right_path)); 3737 if (!left_path) { 3738 ret = -ENOMEM; 3739 mlog_errno(ret); 3740 goto out; 3741 } 3742 3743 ret = ocfs2_find_path(inode, left_path, left_cpos); 3744 if (ret) { 3745 mlog_errno(ret); 3746 goto out; 3747 } 3748 3749 /* 3750 * ocfs2_insert_path() will pass the left_path to the 3751 * journal for us. 3752 */ 3753 } 3754 } 3755 3756 ret = ocfs2_journal_access_path(inode, handle, right_path); 3757 if (ret) { 3758 mlog_errno(ret); 3759 goto out; 3760 } 3761 3762 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec); 3763 3764 *ret_left_path = left_path; 3765 ret = 0; 3766 out: 3767 if (ret != 0) 3768 ocfs2_free_path(left_path); 3769 3770 return ret; 3771 } 3772 3773 static void ocfs2_split_record(struct inode *inode, 3774 struct ocfs2_path *left_path, 3775 struct ocfs2_path *right_path, 3776 struct ocfs2_extent_rec *split_rec, 3777 enum ocfs2_split_type split) 3778 { 3779 int index; 3780 u32 cpos = le32_to_cpu(split_rec->e_cpos); 3781 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; 3782 struct ocfs2_extent_rec *rec, *tmprec; 3783 3784 right_el = path_leaf_el(right_path);; 3785 if (left_path) 3786 left_el = path_leaf_el(left_path); 3787 3788 el = right_el; 3789 insert_el = right_el; 3790 index = ocfs2_search_extent_list(el, cpos); 3791 if (index != -1) { 3792 if (index == 0 && left_path) { 3793 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); 3794 3795 /* 3796 * This typically means that the record 3797 * started in the left path but moved to the 3798 * right as a result of rotation. We either 3799 * move the existing record to the left, or we 3800 * do the later insert there. 3801 * 3802 * In this case, the left path should always 3803 * exist as the rotate code will have passed 3804 * it back for a post-insert update. 3805 */ 3806 3807 if (split == SPLIT_LEFT) { 3808 /* 3809 * It's a left split. Since we know 3810 * that the rotate code gave us an 3811 * empty extent in the left path, we 3812 * can just do the insert there. 3813 */ 3814 insert_el = left_el; 3815 } else { 3816 /* 3817 * Right split - we have to move the 3818 * existing record over to the left 3819 * leaf. The insert will be into the 3820 * newly created empty extent in the 3821 * right leaf. 3822 */ 3823 tmprec = &right_el->l_recs[index]; 3824 ocfs2_rotate_leaf(left_el, tmprec); 3825 el = left_el; 3826 3827 memset(tmprec, 0, sizeof(*tmprec)); 3828 index = ocfs2_search_extent_list(left_el, cpos); 3829 BUG_ON(index == -1); 3830 } 3831 } 3832 } else { 3833 BUG_ON(!left_path); 3834 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0])); 3835 /* 3836 * Left path is easy - we can just allow the insert to 3837 * happen. 3838 */ 3839 el = left_el; 3840 insert_el = left_el; 3841 index = ocfs2_search_extent_list(el, cpos); 3842 BUG_ON(index == -1); 3843 } 3844 3845 rec = &el->l_recs[index]; 3846 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec); 3847 ocfs2_rotate_leaf(insert_el, split_rec); 3848 } 3849 3850 /* 3851 * This function only does inserts on an allocation b-tree. For tree 3852 * depth = 0, ocfs2_insert_at_leaf() is called directly. 3853 * 3854 * right_path is the path we want to do the actual insert 3855 * in. left_path should only be passed in if we need to update that 3856 * portion of the tree after an edge insert. 3857 */ 3858 static int ocfs2_insert_path(struct inode *inode, 3859 handle_t *handle, 3860 struct ocfs2_path *left_path, 3861 struct ocfs2_path *right_path, 3862 struct ocfs2_extent_rec *insert_rec, 3863 struct ocfs2_insert_type *insert) 3864 { 3865 int ret, subtree_index; 3866 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 3867 3868 if (left_path) { 3869 int credits = handle->h_buffer_credits; 3870 3871 /* 3872 * There's a chance that left_path got passed back to 3873 * us without being accounted for in the 3874 * journal. Extend our transaction here to be sure we 3875 * can change those blocks. 3876 */ 3877 credits += left_path->p_tree_depth; 3878 3879 ret = ocfs2_extend_trans(handle, credits); 3880 if (ret < 0) { 3881 mlog_errno(ret); 3882 goto out; 3883 } 3884 3885 ret = ocfs2_journal_access_path(inode, handle, left_path); 3886 if (ret < 0) { 3887 mlog_errno(ret); 3888 goto out; 3889 } 3890 } 3891 3892 /* 3893 * Pass both paths to the journal. The majority of inserts 3894 * will be touching all components anyway. 3895 */ 3896 ret = ocfs2_journal_access_path(inode, handle, right_path); 3897 if (ret < 0) { 3898 mlog_errno(ret); 3899 goto out; 3900 } 3901 3902 if (insert->ins_split != SPLIT_NONE) { 3903 /* 3904 * We could call ocfs2_insert_at_leaf() for some types 3905 * of splits, but it's easier to just let one separate 3906 * function sort it all out. 3907 */ 3908 ocfs2_split_record(inode, left_path, right_path, 3909 insert_rec, insert->ins_split); 3910 3911 /* 3912 * Split might have modified either leaf and we don't 3913 * have a guarantee that the later edge insert will 3914 * dirty this for us. 3915 */ 3916 if (left_path) 3917 ret = ocfs2_journal_dirty(handle, 3918 path_leaf_bh(left_path)); 3919 if (ret) 3920 mlog_errno(ret); 3921 } else 3922 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), 3923 insert, inode); 3924 3925 ret = ocfs2_journal_dirty(handle, leaf_bh); 3926 if (ret) 3927 mlog_errno(ret); 3928 3929 if (left_path) { 3930 /* 3931 * The rotate code has indicated that we need to fix 3932 * up portions of the tree after the insert. 3933 * 3934 * XXX: Should we extend the transaction here? 3935 */ 3936 subtree_index = ocfs2_find_subtree_root(inode, left_path, 3937 right_path); 3938 ocfs2_complete_edge_insert(inode, handle, left_path, 3939 right_path, subtree_index); 3940 } 3941 3942 ret = 0; 3943 out: 3944 return ret; 3945 } 3946 3947 static int ocfs2_do_insert_extent(struct inode *inode, 3948 handle_t *handle, 3949 struct ocfs2_extent_tree *et, 3950 struct ocfs2_extent_rec *insert_rec, 3951 struct ocfs2_insert_type *type) 3952 { 3953 int ret, rotate = 0; 3954 u32 cpos; 3955 struct ocfs2_path *right_path = NULL; 3956 struct ocfs2_path *left_path = NULL; 3957 struct ocfs2_extent_list *el; 3958 3959 el = et->et_root_el; 3960 3961 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 3962 OCFS2_JOURNAL_ACCESS_WRITE); 3963 if (ret) { 3964 mlog_errno(ret); 3965 goto out; 3966 } 3967 3968 if (le16_to_cpu(el->l_tree_depth) == 0) { 3969 ocfs2_insert_at_leaf(insert_rec, el, type, inode); 3970 goto out_update_clusters; 3971 } 3972 3973 right_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 3974 if (!right_path) { 3975 ret = -ENOMEM; 3976 mlog_errno(ret); 3977 goto out; 3978 } 3979 3980 /* 3981 * Determine the path to start with. Rotations need the 3982 * rightmost path, everything else can go directly to the 3983 * target leaf. 3984 */ 3985 cpos = le32_to_cpu(insert_rec->e_cpos); 3986 if (type->ins_appending == APPEND_NONE && 3987 type->ins_contig == CONTIG_NONE) { 3988 rotate = 1; 3989 cpos = UINT_MAX; 3990 } 3991 3992 ret = ocfs2_find_path(inode, right_path, cpos); 3993 if (ret) { 3994 mlog_errno(ret); 3995 goto out; 3996 } 3997 3998 /* 3999 * Rotations and appends need special treatment - they modify 4000 * parts of the tree's above them. 4001 * 4002 * Both might pass back a path immediate to the left of the 4003 * one being inserted to. This will be cause 4004 * ocfs2_insert_path() to modify the rightmost records of 4005 * left_path to account for an edge insert. 4006 * 4007 * XXX: When modifying this code, keep in mind that an insert 4008 * can wind up skipping both of these two special cases... 4009 */ 4010 if (rotate) { 4011 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split, 4012 le32_to_cpu(insert_rec->e_cpos), 4013 right_path, &left_path); 4014 if (ret) { 4015 mlog_errno(ret); 4016 goto out; 4017 } 4018 4019 /* 4020 * ocfs2_rotate_tree_right() might have extended the 4021 * transaction without re-journaling our tree root. 4022 */ 4023 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 4024 OCFS2_JOURNAL_ACCESS_WRITE); 4025 if (ret) { 4026 mlog_errno(ret); 4027 goto out; 4028 } 4029 } else if (type->ins_appending == APPEND_TAIL 4030 && type->ins_contig != CONTIG_LEFT) { 4031 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, 4032 right_path, &left_path); 4033 if (ret) { 4034 mlog_errno(ret); 4035 goto out; 4036 } 4037 } 4038 4039 ret = ocfs2_insert_path(inode, handle, left_path, right_path, 4040 insert_rec, type); 4041 if (ret) { 4042 mlog_errno(ret); 4043 goto out; 4044 } 4045 4046 out_update_clusters: 4047 if (type->ins_split == SPLIT_NONE) 4048 ocfs2_et_update_clusters(inode, et, 4049 le16_to_cpu(insert_rec->e_leaf_clusters)); 4050 4051 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4052 if (ret) 4053 mlog_errno(ret); 4054 4055 out: 4056 ocfs2_free_path(left_path); 4057 ocfs2_free_path(right_path); 4058 4059 return ret; 4060 } 4061 4062 static enum ocfs2_contig_type 4063 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, 4064 struct ocfs2_extent_list *el, int index, 4065 struct ocfs2_extent_rec *split_rec) 4066 { 4067 int status; 4068 enum ocfs2_contig_type ret = CONTIG_NONE; 4069 u32 left_cpos, right_cpos; 4070 struct ocfs2_extent_rec *rec = NULL; 4071 struct ocfs2_extent_list *new_el; 4072 struct ocfs2_path *left_path = NULL, *right_path = NULL; 4073 struct buffer_head *bh; 4074 struct ocfs2_extent_block *eb; 4075 4076 if (index > 0) { 4077 rec = &el->l_recs[index - 1]; 4078 } else if (path->p_tree_depth > 0) { 4079 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 4080 path, &left_cpos); 4081 if (status) 4082 goto out; 4083 4084 if (left_cpos != 0) { 4085 left_path = ocfs2_new_path(path_root_bh(path), 4086 path_root_el(path)); 4087 if (!left_path) 4088 goto out; 4089 4090 status = ocfs2_find_path(inode, left_path, left_cpos); 4091 if (status) 4092 goto out; 4093 4094 new_el = path_leaf_el(left_path); 4095 4096 if (le16_to_cpu(new_el->l_next_free_rec) != 4097 le16_to_cpu(new_el->l_count)) { 4098 bh = path_leaf_bh(left_path); 4099 eb = (struct ocfs2_extent_block *)bh->b_data; 4100 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 4101 eb); 4102 goto out; 4103 } 4104 rec = &new_el->l_recs[ 4105 le16_to_cpu(new_el->l_next_free_rec) - 1]; 4106 } 4107 } 4108 4109 /* 4110 * We're careful to check for an empty extent record here - 4111 * the merge code will know what to do if it sees one. 4112 */ 4113 if (rec) { 4114 if (index == 1 && ocfs2_is_empty_extent(rec)) { 4115 if (split_rec->e_cpos == el->l_recs[index].e_cpos) 4116 ret = CONTIG_RIGHT; 4117 } else { 4118 ret = ocfs2_extent_contig(inode, rec, split_rec); 4119 } 4120 } 4121 4122 rec = NULL; 4123 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) 4124 rec = &el->l_recs[index + 1]; 4125 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && 4126 path->p_tree_depth > 0) { 4127 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb, 4128 path, &right_cpos); 4129 if (status) 4130 goto out; 4131 4132 if (right_cpos == 0) 4133 goto out; 4134 4135 right_path = ocfs2_new_path(path_root_bh(path), 4136 path_root_el(path)); 4137 if (!right_path) 4138 goto out; 4139 4140 status = ocfs2_find_path(inode, right_path, right_cpos); 4141 if (status) 4142 goto out; 4143 4144 new_el = path_leaf_el(right_path); 4145 rec = &new_el->l_recs[0]; 4146 if (ocfs2_is_empty_extent(rec)) { 4147 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { 4148 bh = path_leaf_bh(right_path); 4149 eb = (struct ocfs2_extent_block *)bh->b_data; 4150 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, 4151 eb); 4152 goto out; 4153 } 4154 rec = &new_el->l_recs[1]; 4155 } 4156 } 4157 4158 if (rec) { 4159 enum ocfs2_contig_type contig_type; 4160 4161 contig_type = ocfs2_extent_contig(inode, rec, split_rec); 4162 4163 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) 4164 ret = CONTIG_LEFTRIGHT; 4165 else if (ret == CONTIG_NONE) 4166 ret = contig_type; 4167 } 4168 4169 out: 4170 if (left_path) 4171 ocfs2_free_path(left_path); 4172 if (right_path) 4173 ocfs2_free_path(right_path); 4174 4175 return ret; 4176 } 4177 4178 static void ocfs2_figure_contig_type(struct inode *inode, 4179 struct ocfs2_insert_type *insert, 4180 struct ocfs2_extent_list *el, 4181 struct ocfs2_extent_rec *insert_rec, 4182 struct ocfs2_extent_tree *et) 4183 { 4184 int i; 4185 enum ocfs2_contig_type contig_type = CONTIG_NONE; 4186 4187 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 4188 4189 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 4190 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], 4191 insert_rec); 4192 if (contig_type != CONTIG_NONE) { 4193 insert->ins_contig_index = i; 4194 break; 4195 } 4196 } 4197 insert->ins_contig = contig_type; 4198 4199 if (insert->ins_contig != CONTIG_NONE) { 4200 struct ocfs2_extent_rec *rec = 4201 &el->l_recs[insert->ins_contig_index]; 4202 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) + 4203 le16_to_cpu(insert_rec->e_leaf_clusters); 4204 4205 /* 4206 * Caller might want us to limit the size of extents, don't 4207 * calculate contiguousness if we might exceed that limit. 4208 */ 4209 if (et->et_max_leaf_clusters && 4210 (len > et->et_max_leaf_clusters)) 4211 insert->ins_contig = CONTIG_NONE; 4212 } 4213 } 4214 4215 /* 4216 * This should only be called against the righmost leaf extent list. 4217 * 4218 * ocfs2_figure_appending_type() will figure out whether we'll have to 4219 * insert at the tail of the rightmost leaf. 4220 * 4221 * This should also work against the root extent list for tree's with 0 4222 * depth. If we consider the root extent list to be the rightmost leaf node 4223 * then the logic here makes sense. 4224 */ 4225 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, 4226 struct ocfs2_extent_list *el, 4227 struct ocfs2_extent_rec *insert_rec) 4228 { 4229 int i; 4230 u32 cpos = le32_to_cpu(insert_rec->e_cpos); 4231 struct ocfs2_extent_rec *rec; 4232 4233 insert->ins_appending = APPEND_NONE; 4234 4235 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 4236 4237 if (!el->l_next_free_rec) 4238 goto set_tail_append; 4239 4240 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 4241 /* Were all records empty? */ 4242 if (le16_to_cpu(el->l_next_free_rec) == 1) 4243 goto set_tail_append; 4244 } 4245 4246 i = le16_to_cpu(el->l_next_free_rec) - 1; 4247 rec = &el->l_recs[i]; 4248 4249 if (cpos >= 4250 (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) 4251 goto set_tail_append; 4252 4253 return; 4254 4255 set_tail_append: 4256 insert->ins_appending = APPEND_TAIL; 4257 } 4258 4259 /* 4260 * Helper function called at the begining of an insert. 4261 * 4262 * This computes a few things that are commonly used in the process of 4263 * inserting into the btree: 4264 * - Whether the new extent is contiguous with an existing one. 4265 * - The current tree depth. 4266 * - Whether the insert is an appending one. 4267 * - The total # of free records in the tree. 4268 * 4269 * All of the information is stored on the ocfs2_insert_type 4270 * structure. 4271 */ 4272 static int ocfs2_figure_insert_type(struct inode *inode, 4273 struct ocfs2_extent_tree *et, 4274 struct buffer_head **last_eb_bh, 4275 struct ocfs2_extent_rec *insert_rec, 4276 int *free_records, 4277 struct ocfs2_insert_type *insert) 4278 { 4279 int ret; 4280 struct ocfs2_extent_block *eb; 4281 struct ocfs2_extent_list *el; 4282 struct ocfs2_path *path = NULL; 4283 struct buffer_head *bh = NULL; 4284 4285 insert->ins_split = SPLIT_NONE; 4286 4287 el = et->et_root_el; 4288 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 4289 4290 if (el->l_tree_depth) { 4291 /* 4292 * If we have tree depth, we read in the 4293 * rightmost extent block ahead of time as 4294 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4295 * may want it later. 4296 */ 4297 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), &bh); 4298 if (ret) { 4299 mlog_exit(ret); 4300 goto out; 4301 } 4302 eb = (struct ocfs2_extent_block *) bh->b_data; 4303 el = &eb->h_list; 4304 } 4305 4306 /* 4307 * Unless we have a contiguous insert, we'll need to know if 4308 * there is room left in our allocation tree for another 4309 * extent record. 4310 * 4311 * XXX: This test is simplistic, we can search for empty 4312 * extent records too. 4313 */ 4314 *free_records = le16_to_cpu(el->l_count) - 4315 le16_to_cpu(el->l_next_free_rec); 4316 4317 if (!insert->ins_tree_depth) { 4318 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); 4319 ocfs2_figure_appending_type(insert, el, insert_rec); 4320 return 0; 4321 } 4322 4323 path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4324 if (!path) { 4325 ret = -ENOMEM; 4326 mlog_errno(ret); 4327 goto out; 4328 } 4329 4330 /* 4331 * In the case that we're inserting past what the tree 4332 * currently accounts for, ocfs2_find_path() will return for 4333 * us the rightmost tree path. This is accounted for below in 4334 * the appending code. 4335 */ 4336 ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); 4337 if (ret) { 4338 mlog_errno(ret); 4339 goto out; 4340 } 4341 4342 el = path_leaf_el(path); 4343 4344 /* 4345 * Now that we have the path, there's two things we want to determine: 4346 * 1) Contiguousness (also set contig_index if this is so) 4347 * 4348 * 2) Are we doing an append? We can trivially break this up 4349 * into two types of appends: simple record append, or a 4350 * rotate inside the tail leaf. 4351 */ 4352 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); 4353 4354 /* 4355 * The insert code isn't quite ready to deal with all cases of 4356 * left contiguousness. Specifically, if it's an insert into 4357 * the 1st record in a leaf, it will require the adjustment of 4358 * cluster count on the last record of the path directly to it's 4359 * left. For now, just catch that case and fool the layers 4360 * above us. This works just fine for tree_depth == 0, which 4361 * is why we allow that above. 4362 */ 4363 if (insert->ins_contig == CONTIG_LEFT && 4364 insert->ins_contig_index == 0) 4365 insert->ins_contig = CONTIG_NONE; 4366 4367 /* 4368 * Ok, so we can simply compare against last_eb to figure out 4369 * whether the path doesn't exist. This will only happen in 4370 * the case that we're doing a tail append, so maybe we can 4371 * take advantage of that information somehow. 4372 */ 4373 if (ocfs2_et_get_last_eb_blk(et) == 4374 path_leaf_bh(path)->b_blocknr) { 4375 /* 4376 * Ok, ocfs2_find_path() returned us the rightmost 4377 * tree path. This might be an appending insert. There are 4378 * two cases: 4379 * 1) We're doing a true append at the tail: 4380 * -This might even be off the end of the leaf 4381 * 2) We're "appending" by rotating in the tail 4382 */ 4383 ocfs2_figure_appending_type(insert, el, insert_rec); 4384 } 4385 4386 out: 4387 ocfs2_free_path(path); 4388 4389 if (ret == 0) 4390 *last_eb_bh = bh; 4391 else 4392 brelse(bh); 4393 return ret; 4394 } 4395 4396 /* 4397 * Insert an extent into an inode btree. 4398 * 4399 * The caller needs to update fe->i_clusters 4400 */ 4401 int ocfs2_insert_extent(struct ocfs2_super *osb, 4402 handle_t *handle, 4403 struct inode *inode, 4404 struct ocfs2_extent_tree *et, 4405 u32 cpos, 4406 u64 start_blk, 4407 u32 new_clusters, 4408 u8 flags, 4409 struct ocfs2_alloc_context *meta_ac) 4410 { 4411 int status; 4412 int uninitialized_var(free_records); 4413 struct buffer_head *last_eb_bh = NULL; 4414 struct ocfs2_insert_type insert = {0, }; 4415 struct ocfs2_extent_rec rec; 4416 4417 mlog(0, "add %u clusters at position %u to inode %llu\n", 4418 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4419 4420 memset(&rec, 0, sizeof(rec)); 4421 rec.e_cpos = cpu_to_le32(cpos); 4422 rec.e_blkno = cpu_to_le64(start_blk); 4423 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 4424 rec.e_flags = flags; 4425 status = ocfs2_et_insert_check(inode, et, &rec); 4426 if (status) { 4427 mlog_errno(status); 4428 goto bail; 4429 } 4430 4431 status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, 4432 &free_records, &insert); 4433 if (status < 0) { 4434 mlog_errno(status); 4435 goto bail; 4436 } 4437 4438 mlog(0, "Insert.appending: %u, Insert.Contig: %u, " 4439 "Insert.contig_index: %d, Insert.free_records: %d, " 4440 "Insert.tree_depth: %d\n", 4441 insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 4442 free_records, insert.ins_tree_depth); 4443 4444 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4445 status = ocfs2_grow_tree(inode, handle, et, 4446 &insert.ins_tree_depth, &last_eb_bh, 4447 meta_ac); 4448 if (status) { 4449 mlog_errno(status); 4450 goto bail; 4451 } 4452 } 4453 4454 /* Finally, we can add clusters. This might rotate the tree for us. */ 4455 status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); 4456 if (status < 0) 4457 mlog_errno(status); 4458 else if (et->et_ops == &ocfs2_dinode_et_ops) 4459 ocfs2_extent_map_insert_rec(inode, &rec); 4460 4461 bail: 4462 brelse(last_eb_bh); 4463 4464 mlog_exit(status); 4465 return status; 4466 } 4467 4468 /* 4469 * Allcate and add clusters into the extent b-tree. 4470 * The new clusters(clusters_to_add) will be inserted at logical_offset. 4471 * The extent b-tree's root is specified by et, and 4472 * it is not limited to the file storage. Any extent tree can use this 4473 * function if it implements the proper ocfs2_extent_tree. 4474 */ 4475 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, 4476 struct inode *inode, 4477 u32 *logical_offset, 4478 u32 clusters_to_add, 4479 int mark_unwritten, 4480 struct ocfs2_extent_tree *et, 4481 handle_t *handle, 4482 struct ocfs2_alloc_context *data_ac, 4483 struct ocfs2_alloc_context *meta_ac, 4484 enum ocfs2_alloc_restarted *reason_ret) 4485 { 4486 int status = 0; 4487 int free_extents; 4488 enum ocfs2_alloc_restarted reason = RESTART_NONE; 4489 u32 bit_off, num_bits; 4490 u64 block; 4491 u8 flags = 0; 4492 4493 BUG_ON(!clusters_to_add); 4494 4495 if (mark_unwritten) 4496 flags = OCFS2_EXT_UNWRITTEN; 4497 4498 free_extents = ocfs2_num_free_extents(osb, inode, et); 4499 if (free_extents < 0) { 4500 status = free_extents; 4501 mlog_errno(status); 4502 goto leave; 4503 } 4504 4505 /* there are two cases which could cause us to EAGAIN in the 4506 * we-need-more-metadata case: 4507 * 1) we haven't reserved *any* 4508 * 2) we are so fragmented, we've needed to add metadata too 4509 * many times. */ 4510 if (!free_extents && !meta_ac) { 4511 mlog(0, "we haven't reserved any metadata!\n"); 4512 status = -EAGAIN; 4513 reason = RESTART_META; 4514 goto leave; 4515 } else if ((!free_extents) 4516 && (ocfs2_alloc_context_bits_left(meta_ac) 4517 < ocfs2_extend_meta_needed(et->et_root_el))) { 4518 mlog(0, "filesystem is really fragmented...\n"); 4519 status = -EAGAIN; 4520 reason = RESTART_META; 4521 goto leave; 4522 } 4523 4524 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4525 clusters_to_add, &bit_off, &num_bits); 4526 if (status < 0) { 4527 if (status != -ENOSPC) 4528 mlog_errno(status); 4529 goto leave; 4530 } 4531 4532 BUG_ON(num_bits > clusters_to_add); 4533 4534 /* reserve our write early -- insert_extent may update the inode */ 4535 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 4536 OCFS2_JOURNAL_ACCESS_WRITE); 4537 if (status < 0) { 4538 mlog_errno(status); 4539 goto leave; 4540 } 4541 4542 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 4543 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 4544 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4545 status = ocfs2_insert_extent(osb, handle, inode, et, 4546 *logical_offset, block, 4547 num_bits, flags, meta_ac); 4548 if (status < 0) { 4549 mlog_errno(status); 4550 goto leave; 4551 } 4552 4553 status = ocfs2_journal_dirty(handle, et->et_root_bh); 4554 if (status < 0) { 4555 mlog_errno(status); 4556 goto leave; 4557 } 4558 4559 clusters_to_add -= num_bits; 4560 *logical_offset += num_bits; 4561 4562 if (clusters_to_add) { 4563 mlog(0, "need to alloc once more, wanted = %u\n", 4564 clusters_to_add); 4565 status = -EAGAIN; 4566 reason = RESTART_TRANS; 4567 } 4568 4569 leave: 4570 mlog_exit(status); 4571 if (reason_ret) 4572 *reason_ret = reason; 4573 return status; 4574 } 4575 4576 static void ocfs2_make_right_split_rec(struct super_block *sb, 4577 struct ocfs2_extent_rec *split_rec, 4578 u32 cpos, 4579 struct ocfs2_extent_rec *rec) 4580 { 4581 u32 rec_cpos = le32_to_cpu(rec->e_cpos); 4582 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters); 4583 4584 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec)); 4585 4586 split_rec->e_cpos = cpu_to_le32(cpos); 4587 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos); 4588 4589 split_rec->e_blkno = rec->e_blkno; 4590 le64_add_cpu(&split_rec->e_blkno, 4591 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos)); 4592 4593 split_rec->e_flags = rec->e_flags; 4594 } 4595 4596 static int ocfs2_split_and_insert(struct inode *inode, 4597 handle_t *handle, 4598 struct ocfs2_path *path, 4599 struct ocfs2_extent_tree *et, 4600 struct buffer_head **last_eb_bh, 4601 int split_index, 4602 struct ocfs2_extent_rec *orig_split_rec, 4603 struct ocfs2_alloc_context *meta_ac) 4604 { 4605 int ret = 0, depth; 4606 unsigned int insert_range, rec_range, do_leftright = 0; 4607 struct ocfs2_extent_rec tmprec; 4608 struct ocfs2_extent_list *rightmost_el; 4609 struct ocfs2_extent_rec rec; 4610 struct ocfs2_extent_rec split_rec = *orig_split_rec; 4611 struct ocfs2_insert_type insert; 4612 struct ocfs2_extent_block *eb; 4613 4614 leftright: 4615 /* 4616 * Store a copy of the record on the stack - it might move 4617 * around as the tree is manipulated below. 4618 */ 4619 rec = path_leaf_el(path)->l_recs[split_index]; 4620 4621 rightmost_el = et->et_root_el; 4622 4623 depth = le16_to_cpu(rightmost_el->l_tree_depth); 4624 if (depth) { 4625 BUG_ON(!(*last_eb_bh)); 4626 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 4627 rightmost_el = &eb->h_list; 4628 } 4629 4630 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4631 le16_to_cpu(rightmost_el->l_count)) { 4632 ret = ocfs2_grow_tree(inode, handle, et, 4633 &depth, last_eb_bh, meta_ac); 4634 if (ret) { 4635 mlog_errno(ret); 4636 goto out; 4637 } 4638 } 4639 4640 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 4641 insert.ins_appending = APPEND_NONE; 4642 insert.ins_contig = CONTIG_NONE; 4643 insert.ins_tree_depth = depth; 4644 4645 insert_range = le32_to_cpu(split_rec.e_cpos) + 4646 le16_to_cpu(split_rec.e_leaf_clusters); 4647 rec_range = le32_to_cpu(rec.e_cpos) + 4648 le16_to_cpu(rec.e_leaf_clusters); 4649 4650 if (split_rec.e_cpos == rec.e_cpos) { 4651 insert.ins_split = SPLIT_LEFT; 4652 } else if (insert_range == rec_range) { 4653 insert.ins_split = SPLIT_RIGHT; 4654 } else { 4655 /* 4656 * Left/right split. We fake this as a right split 4657 * first and then make a second pass as a left split. 4658 */ 4659 insert.ins_split = SPLIT_RIGHT; 4660 4661 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range, 4662 &rec); 4663 4664 split_rec = tmprec; 4665 4666 BUG_ON(do_leftright); 4667 do_leftright = 1; 4668 } 4669 4670 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 4671 if (ret) { 4672 mlog_errno(ret); 4673 goto out; 4674 } 4675 4676 if (do_leftright == 1) { 4677 u32 cpos; 4678 struct ocfs2_extent_list *el; 4679 4680 do_leftright++; 4681 split_rec = *orig_split_rec; 4682 4683 ocfs2_reinit_path(path, 1); 4684 4685 cpos = le32_to_cpu(split_rec.e_cpos); 4686 ret = ocfs2_find_path(inode, path, cpos); 4687 if (ret) { 4688 mlog_errno(ret); 4689 goto out; 4690 } 4691 4692 el = path_leaf_el(path); 4693 split_index = ocfs2_search_extent_list(el, cpos); 4694 goto leftright; 4695 } 4696 out: 4697 4698 return ret; 4699 } 4700 4701 /* 4702 * Mark part or all of the extent record at split_index in the leaf 4703 * pointed to by path as written. This removes the unwritten 4704 * extent flag. 4705 * 4706 * Care is taken to handle contiguousness so as to not grow the tree. 4707 * 4708 * meta_ac is not strictly necessary - we only truly need it if growth 4709 * of the tree is required. All other cases will degrade into a less 4710 * optimal tree layout. 4711 * 4712 * last_eb_bh should be the rightmost leaf block for any extent 4713 * btree. Since a split may grow the tree or a merge might shrink it, 4714 * the caller cannot trust the contents of that buffer after this call. 4715 * 4716 * This code is optimized for readability - several passes might be 4717 * made over certain portions of the tree. All of those blocks will 4718 * have been brought into cache (and pinned via the journal), so the 4719 * extra overhead is not expressed in terms of disk reads. 4720 */ 4721 static int __ocfs2_mark_extent_written(struct inode *inode, 4722 struct ocfs2_extent_tree *et, 4723 handle_t *handle, 4724 struct ocfs2_path *path, 4725 int split_index, 4726 struct ocfs2_extent_rec *split_rec, 4727 struct ocfs2_alloc_context *meta_ac, 4728 struct ocfs2_cached_dealloc_ctxt *dealloc) 4729 { 4730 int ret = 0; 4731 struct ocfs2_extent_list *el = path_leaf_el(path); 4732 struct buffer_head *last_eb_bh = NULL; 4733 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 4734 struct ocfs2_merge_ctxt ctxt; 4735 struct ocfs2_extent_list *rightmost_el; 4736 4737 if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) { 4738 ret = -EIO; 4739 mlog_errno(ret); 4740 goto out; 4741 } 4742 4743 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || 4744 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < 4745 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { 4746 ret = -EIO; 4747 mlog_errno(ret); 4748 goto out; 4749 } 4750 4751 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el, 4752 split_index, 4753 split_rec); 4754 4755 /* 4756 * The core merge / split code wants to know how much room is 4757 * left in this inodes allocation tree, so we pass the 4758 * rightmost extent list. 4759 */ 4760 if (path->p_tree_depth) { 4761 struct ocfs2_extent_block *eb; 4762 4763 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), 4764 &last_eb_bh); 4765 if (ret) { 4766 mlog_exit(ret); 4767 goto out; 4768 } 4769 4770 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4771 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 4772 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 4773 ret = -EROFS; 4774 goto out; 4775 } 4776 4777 rightmost_el = &eb->h_list; 4778 } else 4779 rightmost_el = path_root_el(path); 4780 4781 if (rec->e_cpos == split_rec->e_cpos && 4782 rec->e_leaf_clusters == split_rec->e_leaf_clusters) 4783 ctxt.c_split_covers_rec = 1; 4784 else 4785 ctxt.c_split_covers_rec = 0; 4786 4787 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); 4788 4789 mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n", 4790 split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent, 4791 ctxt.c_split_covers_rec); 4792 4793 if (ctxt.c_contig_type == CONTIG_NONE) { 4794 if (ctxt.c_split_covers_rec) 4795 el->l_recs[split_index] = *split_rec; 4796 else 4797 ret = ocfs2_split_and_insert(inode, handle, path, et, 4798 &last_eb_bh, split_index, 4799 split_rec, meta_ac); 4800 if (ret) 4801 mlog_errno(ret); 4802 } else { 4803 ret = ocfs2_try_to_merge_extent(inode, handle, path, 4804 split_index, split_rec, 4805 dealloc, &ctxt, et); 4806 if (ret) 4807 mlog_errno(ret); 4808 } 4809 4810 out: 4811 brelse(last_eb_bh); 4812 return ret; 4813 } 4814 4815 /* 4816 * Mark the already-existing extent at cpos as written for len clusters. 4817 * 4818 * If the existing extent is larger than the request, initiate a 4819 * split. An attempt will be made at merging with adjacent extents. 4820 * 4821 * The caller is responsible for passing down meta_ac if we'll need it. 4822 */ 4823 int ocfs2_mark_extent_written(struct inode *inode, 4824 struct ocfs2_extent_tree *et, 4825 handle_t *handle, u32 cpos, u32 len, u32 phys, 4826 struct ocfs2_alloc_context *meta_ac, 4827 struct ocfs2_cached_dealloc_ctxt *dealloc) 4828 { 4829 int ret, index; 4830 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); 4831 struct ocfs2_extent_rec split_rec; 4832 struct ocfs2_path *left_path = NULL; 4833 struct ocfs2_extent_list *el; 4834 4835 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n", 4836 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno); 4837 4838 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { 4839 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " 4840 "that are being written to, but the feature bit " 4841 "is not set in the super block.", 4842 (unsigned long long)OCFS2_I(inode)->ip_blkno); 4843 ret = -EROFS; 4844 goto out; 4845 } 4846 4847 /* 4848 * XXX: This should be fixed up so that we just re-insert the 4849 * next extent records. 4850 * 4851 * XXX: This is a hack on the extent tree, maybe it should be 4852 * an op? 4853 */ 4854 if (et->et_ops == &ocfs2_dinode_et_ops) 4855 ocfs2_extent_map_trunc(inode, 0); 4856 4857 left_path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 4858 if (!left_path) { 4859 ret = -ENOMEM; 4860 mlog_errno(ret); 4861 goto out; 4862 } 4863 4864 ret = ocfs2_find_path(inode, left_path, cpos); 4865 if (ret) { 4866 mlog_errno(ret); 4867 goto out; 4868 } 4869 el = path_leaf_el(left_path); 4870 4871 index = ocfs2_search_extent_list(el, cpos); 4872 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 4873 ocfs2_error(inode->i_sb, 4874 "Inode %llu has an extent at cpos %u which can no " 4875 "longer be found.\n", 4876 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 4877 ret = -EROFS; 4878 goto out; 4879 } 4880 4881 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); 4882 split_rec.e_cpos = cpu_to_le32(cpos); 4883 split_rec.e_leaf_clusters = cpu_to_le16(len); 4884 split_rec.e_blkno = cpu_to_le64(start_blkno); 4885 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; 4886 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; 4887 4888 ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, 4889 index, &split_rec, meta_ac, 4890 dealloc); 4891 if (ret) 4892 mlog_errno(ret); 4893 4894 out: 4895 ocfs2_free_path(left_path); 4896 return ret; 4897 } 4898 4899 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, 4900 handle_t *handle, struct ocfs2_path *path, 4901 int index, u32 new_range, 4902 struct ocfs2_alloc_context *meta_ac) 4903 { 4904 int ret, depth, credits = handle->h_buffer_credits; 4905 struct buffer_head *last_eb_bh = NULL; 4906 struct ocfs2_extent_block *eb; 4907 struct ocfs2_extent_list *rightmost_el, *el; 4908 struct ocfs2_extent_rec split_rec; 4909 struct ocfs2_extent_rec *rec; 4910 struct ocfs2_insert_type insert; 4911 4912 /* 4913 * Setup the record to split before we grow the tree. 4914 */ 4915 el = path_leaf_el(path); 4916 rec = &el->l_recs[index]; 4917 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec); 4918 4919 depth = path->p_tree_depth; 4920 if (depth > 0) { 4921 ret = ocfs2_read_block(inode, ocfs2_et_get_last_eb_blk(et), 4922 &last_eb_bh); 4923 if (ret < 0) { 4924 mlog_errno(ret); 4925 goto out; 4926 } 4927 4928 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4929 rightmost_el = &eb->h_list; 4930 } else 4931 rightmost_el = path_leaf_el(path); 4932 4933 credits += path->p_tree_depth + 4934 ocfs2_extend_meta_needed(et->et_root_el); 4935 ret = ocfs2_extend_trans(handle, credits); 4936 if (ret) { 4937 mlog_errno(ret); 4938 goto out; 4939 } 4940 4941 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4942 le16_to_cpu(rightmost_el->l_count)) { 4943 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, 4944 meta_ac); 4945 if (ret) { 4946 mlog_errno(ret); 4947 goto out; 4948 } 4949 } 4950 4951 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 4952 insert.ins_appending = APPEND_NONE; 4953 insert.ins_contig = CONTIG_NONE; 4954 insert.ins_split = SPLIT_RIGHT; 4955 insert.ins_tree_depth = depth; 4956 4957 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 4958 if (ret) 4959 mlog_errno(ret); 4960 4961 out: 4962 brelse(last_eb_bh); 4963 return ret; 4964 } 4965 4966 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, 4967 struct ocfs2_path *path, int index, 4968 struct ocfs2_cached_dealloc_ctxt *dealloc, 4969 u32 cpos, u32 len, 4970 struct ocfs2_extent_tree *et) 4971 { 4972 int ret; 4973 u32 left_cpos, rec_range, trunc_range; 4974 int wants_rotate = 0, is_rightmost_tree_rec = 0; 4975 struct super_block *sb = inode->i_sb; 4976 struct ocfs2_path *left_path = NULL; 4977 struct ocfs2_extent_list *el = path_leaf_el(path); 4978 struct ocfs2_extent_rec *rec; 4979 struct ocfs2_extent_block *eb; 4980 4981 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 4982 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 4983 if (ret) { 4984 mlog_errno(ret); 4985 goto out; 4986 } 4987 4988 index--; 4989 } 4990 4991 if (index == (le16_to_cpu(el->l_next_free_rec) - 1) && 4992 path->p_tree_depth) { 4993 /* 4994 * Check whether this is the rightmost tree record. If 4995 * we remove all of this record or part of its right 4996 * edge then an update of the record lengths above it 4997 * will be required. 4998 */ 4999 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 5000 if (eb->h_next_leaf_blk == 0) 5001 is_rightmost_tree_rec = 1; 5002 } 5003 5004 rec = &el->l_recs[index]; 5005 if (index == 0 && path->p_tree_depth && 5006 le32_to_cpu(rec->e_cpos) == cpos) { 5007 /* 5008 * Changing the leftmost offset (via partial or whole 5009 * record truncate) of an interior (or rightmost) path 5010 * means we have to update the subtree that is formed 5011 * by this leaf and the one to it's left. 5012 * 5013 * There are two cases we can skip: 5014 * 1) Path is the leftmost one in our inode tree. 5015 * 2) The leaf is rightmost and will be empty after 5016 * we remove the extent record - the rotate code 5017 * knows how to update the newly formed edge. 5018 */ 5019 5020 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, 5021 &left_cpos); 5022 if (ret) { 5023 mlog_errno(ret); 5024 goto out; 5025 } 5026 5027 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { 5028 left_path = ocfs2_new_path(path_root_bh(path), 5029 path_root_el(path)); 5030 if (!left_path) { 5031 ret = -ENOMEM; 5032 mlog_errno(ret); 5033 goto out; 5034 } 5035 5036 ret = ocfs2_find_path(inode, left_path, left_cpos); 5037 if (ret) { 5038 mlog_errno(ret); 5039 goto out; 5040 } 5041 } 5042 } 5043 5044 ret = ocfs2_extend_rotate_transaction(handle, 0, 5045 handle->h_buffer_credits, 5046 path); 5047 if (ret) { 5048 mlog_errno(ret); 5049 goto out; 5050 } 5051 5052 ret = ocfs2_journal_access_path(inode, handle, path); 5053 if (ret) { 5054 mlog_errno(ret); 5055 goto out; 5056 } 5057 5058 ret = ocfs2_journal_access_path(inode, handle, left_path); 5059 if (ret) { 5060 mlog_errno(ret); 5061 goto out; 5062 } 5063 5064 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 5065 trunc_range = cpos + len; 5066 5067 if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) { 5068 int next_free; 5069 5070 memset(rec, 0, sizeof(*rec)); 5071 ocfs2_cleanup_merge(el, index); 5072 wants_rotate = 1; 5073 5074 next_free = le16_to_cpu(el->l_next_free_rec); 5075 if (is_rightmost_tree_rec && next_free > 1) { 5076 /* 5077 * We skip the edge update if this path will 5078 * be deleted by the rotate code. 5079 */ 5080 rec = &el->l_recs[next_free - 1]; 5081 ocfs2_adjust_rightmost_records(inode, handle, path, 5082 rec); 5083 } 5084 } else if (le32_to_cpu(rec->e_cpos) == cpos) { 5085 /* Remove leftmost portion of the record. */ 5086 le32_add_cpu(&rec->e_cpos, len); 5087 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len)); 5088 le16_add_cpu(&rec->e_leaf_clusters, -len); 5089 } else if (rec_range == trunc_range) { 5090 /* Remove rightmost portion of the record */ 5091 le16_add_cpu(&rec->e_leaf_clusters, -len); 5092 if (is_rightmost_tree_rec) 5093 ocfs2_adjust_rightmost_records(inode, handle, path, rec); 5094 } else { 5095 /* Caller should have trapped this. */ 5096 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) " 5097 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, 5098 le32_to_cpu(rec->e_cpos), 5099 le16_to_cpu(rec->e_leaf_clusters), cpos, len); 5100 BUG(); 5101 } 5102 5103 if (left_path) { 5104 int subtree_index; 5105 5106 subtree_index = ocfs2_find_subtree_root(inode, left_path, path); 5107 ocfs2_complete_edge_insert(inode, handle, left_path, path, 5108 subtree_index); 5109 } 5110 5111 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 5112 5113 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 5114 if (ret) { 5115 mlog_errno(ret); 5116 goto out; 5117 } 5118 5119 out: 5120 ocfs2_free_path(left_path); 5121 return ret; 5122 } 5123 5124 int ocfs2_remove_extent(struct inode *inode, 5125 struct ocfs2_extent_tree *et, 5126 u32 cpos, u32 len, handle_t *handle, 5127 struct ocfs2_alloc_context *meta_ac, 5128 struct ocfs2_cached_dealloc_ctxt *dealloc) 5129 { 5130 int ret, index; 5131 u32 rec_range, trunc_range; 5132 struct ocfs2_extent_rec *rec; 5133 struct ocfs2_extent_list *el; 5134 struct ocfs2_path *path = NULL; 5135 5136 ocfs2_extent_map_trunc(inode, 0); 5137 5138 path = ocfs2_new_path(et->et_root_bh, et->et_root_el); 5139 if (!path) { 5140 ret = -ENOMEM; 5141 mlog_errno(ret); 5142 goto out; 5143 } 5144 5145 ret = ocfs2_find_path(inode, path, cpos); 5146 if (ret) { 5147 mlog_errno(ret); 5148 goto out; 5149 } 5150 5151 el = path_leaf_el(path); 5152 index = ocfs2_search_extent_list(el, cpos); 5153 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5154 ocfs2_error(inode->i_sb, 5155 "Inode %llu has an extent at cpos %u which can no " 5156 "longer be found.\n", 5157 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 5158 ret = -EROFS; 5159 goto out; 5160 } 5161 5162 /* 5163 * We have 3 cases of extent removal: 5164 * 1) Range covers the entire extent rec 5165 * 2) Range begins or ends on one edge of the extent rec 5166 * 3) Range is in the middle of the extent rec (no shared edges) 5167 * 5168 * For case 1 we remove the extent rec and left rotate to 5169 * fill the hole. 5170 * 5171 * For case 2 we just shrink the existing extent rec, with a 5172 * tree update if the shrinking edge is also the edge of an 5173 * extent block. 5174 * 5175 * For case 3 we do a right split to turn the extent rec into 5176 * something case 2 can handle. 5177 */ 5178 rec = &el->l_recs[index]; 5179 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 5180 trunc_range = cpos + len; 5181 5182 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); 5183 5184 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d " 5185 "(cpos %u, len %u)\n", 5186 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index, 5187 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); 5188 5189 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5190 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5191 cpos, len, et); 5192 if (ret) { 5193 mlog_errno(ret); 5194 goto out; 5195 } 5196 } else { 5197 ret = ocfs2_split_tree(inode, et, handle, path, index, 5198 trunc_range, meta_ac); 5199 if (ret) { 5200 mlog_errno(ret); 5201 goto out; 5202 } 5203 5204 /* 5205 * The split could have manipulated the tree enough to 5206 * move the record location, so we have to look for it again. 5207 */ 5208 ocfs2_reinit_path(path, 1); 5209 5210 ret = ocfs2_find_path(inode, path, cpos); 5211 if (ret) { 5212 mlog_errno(ret); 5213 goto out; 5214 } 5215 5216 el = path_leaf_el(path); 5217 index = ocfs2_search_extent_list(el, cpos); 5218 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5219 ocfs2_error(inode->i_sb, 5220 "Inode %llu: split at cpos %u lost record.", 5221 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5222 cpos); 5223 ret = -EROFS; 5224 goto out; 5225 } 5226 5227 /* 5228 * Double check our values here. If anything is fishy, 5229 * it's easier to catch it at the top level. 5230 */ 5231 rec = &el->l_recs[index]; 5232 rec_range = le32_to_cpu(rec->e_cpos) + 5233 ocfs2_rec_clusters(el, rec); 5234 if (rec_range != trunc_range) { 5235 ocfs2_error(inode->i_sb, 5236 "Inode %llu: error after split at cpos %u" 5237 "trunc len %u, existing record is (%u,%u)", 5238 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5239 cpos, len, le32_to_cpu(rec->e_cpos), 5240 ocfs2_rec_clusters(el, rec)); 5241 ret = -EROFS; 5242 goto out; 5243 } 5244 5245 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5246 cpos, len, et); 5247 if (ret) { 5248 mlog_errno(ret); 5249 goto out; 5250 } 5251 } 5252 5253 out: 5254 ocfs2_free_path(path); 5255 return ret; 5256 } 5257 5258 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 5259 { 5260 struct buffer_head *tl_bh = osb->osb_tl_bh; 5261 struct ocfs2_dinode *di; 5262 struct ocfs2_truncate_log *tl; 5263 5264 di = (struct ocfs2_dinode *) tl_bh->b_data; 5265 tl = &di->id2.i_dealloc; 5266 5267 mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), 5268 "slot %d, invalid truncate log parameters: used = " 5269 "%u, count = %u\n", osb->slot_num, 5270 le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); 5271 return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); 5272 } 5273 5274 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, 5275 unsigned int new_start) 5276 { 5277 unsigned int tail_index; 5278 unsigned int current_tail; 5279 5280 /* No records, nothing to coalesce */ 5281 if (!le16_to_cpu(tl->tl_used)) 5282 return 0; 5283 5284 tail_index = le16_to_cpu(tl->tl_used) - 1; 5285 current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); 5286 current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); 5287 5288 return current_tail == new_start; 5289 } 5290 5291 int ocfs2_truncate_log_append(struct ocfs2_super *osb, 5292 handle_t *handle, 5293 u64 start_blk, 5294 unsigned int num_clusters) 5295 { 5296 int status, index; 5297 unsigned int start_cluster, tl_count; 5298 struct inode *tl_inode = osb->osb_tl_inode; 5299 struct buffer_head *tl_bh = osb->osb_tl_bh; 5300 struct ocfs2_dinode *di; 5301 struct ocfs2_truncate_log *tl; 5302 5303 mlog_entry("start_blk = %llu, num_clusters = %u\n", 5304 (unsigned long long)start_blk, num_clusters); 5305 5306 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5307 5308 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 5309 5310 di = (struct ocfs2_dinode *) tl_bh->b_data; 5311 tl = &di->id2.i_dealloc; 5312 if (!OCFS2_IS_VALID_DINODE(di)) { 5313 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); 5314 status = -EIO; 5315 goto bail; 5316 } 5317 5318 tl_count = le16_to_cpu(tl->tl_count); 5319 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || 5320 tl_count == 0, 5321 "Truncate record count on #%llu invalid " 5322 "wanted %u, actual %u\n", 5323 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, 5324 ocfs2_truncate_recs_per_inode(osb->sb), 5325 le16_to_cpu(tl->tl_count)); 5326 5327 /* Caller should have known to flush before calling us. */ 5328 index = le16_to_cpu(tl->tl_used); 5329 if (index >= tl_count) { 5330 status = -ENOSPC; 5331 mlog_errno(status); 5332 goto bail; 5333 } 5334 5335 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5336 OCFS2_JOURNAL_ACCESS_WRITE); 5337 if (status < 0) { 5338 mlog_errno(status); 5339 goto bail; 5340 } 5341 5342 mlog(0, "Log truncate of %u clusters starting at cluster %u to " 5343 "%llu (index = %d)\n", num_clusters, start_cluster, 5344 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index); 5345 5346 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { 5347 /* 5348 * Move index back to the record we are coalescing with. 5349 * ocfs2_truncate_log_can_coalesce() guarantees nonzero 5350 */ 5351 index--; 5352 5353 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); 5354 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", 5355 index, le32_to_cpu(tl->tl_recs[index].t_start), 5356 num_clusters); 5357 } else { 5358 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); 5359 tl->tl_used = cpu_to_le16(index + 1); 5360 } 5361 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5362 5363 status = ocfs2_journal_dirty(handle, tl_bh); 5364 if (status < 0) { 5365 mlog_errno(status); 5366 goto bail; 5367 } 5368 5369 bail: 5370 mlog_exit(status); 5371 return status; 5372 } 5373 5374 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, 5375 handle_t *handle, 5376 struct inode *data_alloc_inode, 5377 struct buffer_head *data_alloc_bh) 5378 { 5379 int status = 0; 5380 int i; 5381 unsigned int num_clusters; 5382 u64 start_blk; 5383 struct ocfs2_truncate_rec rec; 5384 struct ocfs2_dinode *di; 5385 struct ocfs2_truncate_log *tl; 5386 struct inode *tl_inode = osb->osb_tl_inode; 5387 struct buffer_head *tl_bh = osb->osb_tl_bh; 5388 5389 mlog_entry_void(); 5390 5391 di = (struct ocfs2_dinode *) tl_bh->b_data; 5392 tl = &di->id2.i_dealloc; 5393 i = le16_to_cpu(tl->tl_used) - 1; 5394 while (i >= 0) { 5395 /* Caller has given us at least enough credits to 5396 * update the truncate log dinode */ 5397 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5398 OCFS2_JOURNAL_ACCESS_WRITE); 5399 if (status < 0) { 5400 mlog_errno(status); 5401 goto bail; 5402 } 5403 5404 tl->tl_used = cpu_to_le16(i); 5405 5406 status = ocfs2_journal_dirty(handle, tl_bh); 5407 if (status < 0) { 5408 mlog_errno(status); 5409 goto bail; 5410 } 5411 5412 /* TODO: Perhaps we can calculate the bulk of the 5413 * credits up front rather than extending like 5414 * this. */ 5415 status = ocfs2_extend_trans(handle, 5416 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); 5417 if (status < 0) { 5418 mlog_errno(status); 5419 goto bail; 5420 } 5421 5422 rec = tl->tl_recs[i]; 5423 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 5424 le32_to_cpu(rec.t_start)); 5425 num_clusters = le32_to_cpu(rec.t_clusters); 5426 5427 /* if start_blk is not set, we ignore the record as 5428 * invalid. */ 5429 if (start_blk) { 5430 mlog(0, "free record %d, start = %u, clusters = %u\n", 5431 i, le32_to_cpu(rec.t_start), num_clusters); 5432 5433 status = ocfs2_free_clusters(handle, data_alloc_inode, 5434 data_alloc_bh, start_blk, 5435 num_clusters); 5436 if (status < 0) { 5437 mlog_errno(status); 5438 goto bail; 5439 } 5440 } 5441 i--; 5442 } 5443 5444 bail: 5445 mlog_exit(status); 5446 return status; 5447 } 5448 5449 /* Expects you to already be holding tl_inode->i_mutex */ 5450 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 5451 { 5452 int status; 5453 unsigned int num_to_flush; 5454 handle_t *handle; 5455 struct inode *tl_inode = osb->osb_tl_inode; 5456 struct inode *data_alloc_inode = NULL; 5457 struct buffer_head *tl_bh = osb->osb_tl_bh; 5458 struct buffer_head *data_alloc_bh = NULL; 5459 struct ocfs2_dinode *di; 5460 struct ocfs2_truncate_log *tl; 5461 5462 mlog_entry_void(); 5463 5464 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5465 5466 di = (struct ocfs2_dinode *) tl_bh->b_data; 5467 tl = &di->id2.i_dealloc; 5468 if (!OCFS2_IS_VALID_DINODE(di)) { 5469 OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); 5470 status = -EIO; 5471 goto out; 5472 } 5473 5474 num_to_flush = le16_to_cpu(tl->tl_used); 5475 mlog(0, "Flush %u records from truncate log #%llu\n", 5476 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 5477 if (!num_to_flush) { 5478 status = 0; 5479 goto out; 5480 } 5481 5482 data_alloc_inode = ocfs2_get_system_file_inode(osb, 5483 GLOBAL_BITMAP_SYSTEM_INODE, 5484 OCFS2_INVALID_SLOT); 5485 if (!data_alloc_inode) { 5486 status = -EINVAL; 5487 mlog(ML_ERROR, "Could not get bitmap inode!\n"); 5488 goto out; 5489 } 5490 5491 mutex_lock(&data_alloc_inode->i_mutex); 5492 5493 status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); 5494 if (status < 0) { 5495 mlog_errno(status); 5496 goto out_mutex; 5497 } 5498 5499 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 5500 if (IS_ERR(handle)) { 5501 status = PTR_ERR(handle); 5502 mlog_errno(status); 5503 goto out_unlock; 5504 } 5505 5506 status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, 5507 data_alloc_bh); 5508 if (status < 0) 5509 mlog_errno(status); 5510 5511 ocfs2_commit_trans(osb, handle); 5512 5513 out_unlock: 5514 brelse(data_alloc_bh); 5515 ocfs2_inode_unlock(data_alloc_inode, 1); 5516 5517 out_mutex: 5518 mutex_unlock(&data_alloc_inode->i_mutex); 5519 iput(data_alloc_inode); 5520 5521 out: 5522 mlog_exit(status); 5523 return status; 5524 } 5525 5526 int ocfs2_flush_truncate_log(struct ocfs2_super *osb) 5527 { 5528 int status; 5529 struct inode *tl_inode = osb->osb_tl_inode; 5530 5531 mutex_lock(&tl_inode->i_mutex); 5532 status = __ocfs2_flush_truncate_log(osb); 5533 mutex_unlock(&tl_inode->i_mutex); 5534 5535 return status; 5536 } 5537 5538 static void ocfs2_truncate_log_worker(struct work_struct *work) 5539 { 5540 int status; 5541 struct ocfs2_super *osb = 5542 container_of(work, struct ocfs2_super, 5543 osb_truncate_log_wq.work); 5544 5545 mlog_entry_void(); 5546 5547 status = ocfs2_flush_truncate_log(osb); 5548 if (status < 0) 5549 mlog_errno(status); 5550 else 5551 ocfs2_init_inode_steal_slot(osb); 5552 5553 mlog_exit(status); 5554 } 5555 5556 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) 5557 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, 5558 int cancel) 5559 { 5560 if (osb->osb_tl_inode) { 5561 /* We want to push off log flushes while truncates are 5562 * still running. */ 5563 if (cancel) 5564 cancel_delayed_work(&osb->osb_truncate_log_wq); 5565 5566 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 5567 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 5568 } 5569 } 5570 5571 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, 5572 int slot_num, 5573 struct inode **tl_inode, 5574 struct buffer_head **tl_bh) 5575 { 5576 int status; 5577 struct inode *inode = NULL; 5578 struct buffer_head *bh = NULL; 5579 5580 inode = ocfs2_get_system_file_inode(osb, 5581 TRUNCATE_LOG_SYSTEM_INODE, 5582 slot_num); 5583 if (!inode) { 5584 status = -EINVAL; 5585 mlog(ML_ERROR, "Could not get load truncate log inode!\n"); 5586 goto bail; 5587 } 5588 5589 status = ocfs2_read_block(inode, OCFS2_I(inode)->ip_blkno, &bh); 5590 if (status < 0) { 5591 iput(inode); 5592 mlog_errno(status); 5593 goto bail; 5594 } 5595 5596 *tl_inode = inode; 5597 *tl_bh = bh; 5598 bail: 5599 mlog_exit(status); 5600 return status; 5601 } 5602 5603 /* called during the 1st stage of node recovery. we stamp a clean 5604 * truncate log and pass back a copy for processing later. if the 5605 * truncate log does not require processing, a *tl_copy is set to 5606 * NULL. */ 5607 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, 5608 int slot_num, 5609 struct ocfs2_dinode **tl_copy) 5610 { 5611 int status; 5612 struct inode *tl_inode = NULL; 5613 struct buffer_head *tl_bh = NULL; 5614 struct ocfs2_dinode *di; 5615 struct ocfs2_truncate_log *tl; 5616 5617 *tl_copy = NULL; 5618 5619 mlog(0, "recover truncate log from slot %d\n", slot_num); 5620 5621 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); 5622 if (status < 0) { 5623 mlog_errno(status); 5624 goto bail; 5625 } 5626 5627 di = (struct ocfs2_dinode *) tl_bh->b_data; 5628 tl = &di->id2.i_dealloc; 5629 if (!OCFS2_IS_VALID_DINODE(di)) { 5630 OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); 5631 status = -EIO; 5632 goto bail; 5633 } 5634 5635 if (le16_to_cpu(tl->tl_used)) { 5636 mlog(0, "We'll have %u logs to recover\n", 5637 le16_to_cpu(tl->tl_used)); 5638 5639 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); 5640 if (!(*tl_copy)) { 5641 status = -ENOMEM; 5642 mlog_errno(status); 5643 goto bail; 5644 } 5645 5646 /* Assuming the write-out below goes well, this copy 5647 * will be passed back to recovery for processing. */ 5648 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); 5649 5650 /* All we need to do to clear the truncate log is set 5651 * tl_used. */ 5652 tl->tl_used = 0; 5653 5654 status = ocfs2_write_block(osb, tl_bh, tl_inode); 5655 if (status < 0) { 5656 mlog_errno(status); 5657 goto bail; 5658 } 5659 } 5660 5661 bail: 5662 if (tl_inode) 5663 iput(tl_inode); 5664 brelse(tl_bh); 5665 5666 if (status < 0 && (*tl_copy)) { 5667 kfree(*tl_copy); 5668 *tl_copy = NULL; 5669 } 5670 5671 mlog_exit(status); 5672 return status; 5673 } 5674 5675 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 5676 struct ocfs2_dinode *tl_copy) 5677 { 5678 int status = 0; 5679 int i; 5680 unsigned int clusters, num_recs, start_cluster; 5681 u64 start_blk; 5682 handle_t *handle; 5683 struct inode *tl_inode = osb->osb_tl_inode; 5684 struct ocfs2_truncate_log *tl; 5685 5686 mlog_entry_void(); 5687 5688 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { 5689 mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); 5690 return -EINVAL; 5691 } 5692 5693 tl = &tl_copy->id2.i_dealloc; 5694 num_recs = le16_to_cpu(tl->tl_used); 5695 mlog(0, "cleanup %u records from %llu\n", num_recs, 5696 (unsigned long long)le64_to_cpu(tl_copy->i_blkno)); 5697 5698 mutex_lock(&tl_inode->i_mutex); 5699 for(i = 0; i < num_recs; i++) { 5700 if (ocfs2_truncate_log_needs_flush(osb)) { 5701 status = __ocfs2_flush_truncate_log(osb); 5702 if (status < 0) { 5703 mlog_errno(status); 5704 goto bail_up; 5705 } 5706 } 5707 5708 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 5709 if (IS_ERR(handle)) { 5710 status = PTR_ERR(handle); 5711 mlog_errno(status); 5712 goto bail_up; 5713 } 5714 5715 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); 5716 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); 5717 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); 5718 5719 status = ocfs2_truncate_log_append(osb, handle, 5720 start_blk, clusters); 5721 ocfs2_commit_trans(osb, handle); 5722 if (status < 0) { 5723 mlog_errno(status); 5724 goto bail_up; 5725 } 5726 } 5727 5728 bail_up: 5729 mutex_unlock(&tl_inode->i_mutex); 5730 5731 mlog_exit(status); 5732 return status; 5733 } 5734 5735 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) 5736 { 5737 int status; 5738 struct inode *tl_inode = osb->osb_tl_inode; 5739 5740 mlog_entry_void(); 5741 5742 if (tl_inode) { 5743 cancel_delayed_work(&osb->osb_truncate_log_wq); 5744 flush_workqueue(ocfs2_wq); 5745 5746 status = ocfs2_flush_truncate_log(osb); 5747 if (status < 0) 5748 mlog_errno(status); 5749 5750 brelse(osb->osb_tl_bh); 5751 iput(osb->osb_tl_inode); 5752 } 5753 5754 mlog_exit_void(); 5755 } 5756 5757 int ocfs2_truncate_log_init(struct ocfs2_super *osb) 5758 { 5759 int status; 5760 struct inode *tl_inode = NULL; 5761 struct buffer_head *tl_bh = NULL; 5762 5763 mlog_entry_void(); 5764 5765 status = ocfs2_get_truncate_log_info(osb, 5766 osb->slot_num, 5767 &tl_inode, 5768 &tl_bh); 5769 if (status < 0) 5770 mlog_errno(status); 5771 5772 /* ocfs2_truncate_log_shutdown keys on the existence of 5773 * osb->osb_tl_inode so we don't set any of the osb variables 5774 * until we're sure all is well. */ 5775 INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, 5776 ocfs2_truncate_log_worker); 5777 osb->osb_tl_bh = tl_bh; 5778 osb->osb_tl_inode = tl_inode; 5779 5780 mlog_exit(status); 5781 return status; 5782 } 5783 5784 /* 5785 * Delayed de-allocation of suballocator blocks. 5786 * 5787 * Some sets of block de-allocations might involve multiple suballocator inodes. 5788 * 5789 * The locking for this can get extremely complicated, especially when 5790 * the suballocator inodes to delete from aren't known until deep 5791 * within an unrelated codepath. 5792 * 5793 * ocfs2_extent_block structures are a good example of this - an inode 5794 * btree could have been grown by any number of nodes each allocating 5795 * out of their own suballoc inode. 5796 * 5797 * These structures allow the delay of block de-allocation until a 5798 * later time, when locking of multiple cluster inodes won't cause 5799 * deadlock. 5800 */ 5801 5802 /* 5803 * Describes a single block free from a suballocator 5804 */ 5805 struct ocfs2_cached_block_free { 5806 struct ocfs2_cached_block_free *free_next; 5807 u64 free_blk; 5808 unsigned int free_bit; 5809 }; 5810 5811 struct ocfs2_per_slot_free_list { 5812 struct ocfs2_per_slot_free_list *f_next_suballocator; 5813 int f_inode_type; 5814 int f_slot; 5815 struct ocfs2_cached_block_free *f_first; 5816 }; 5817 5818 static int ocfs2_free_cached_items(struct ocfs2_super *osb, 5819 int sysfile_type, 5820 int slot, 5821 struct ocfs2_cached_block_free *head) 5822 { 5823 int ret; 5824 u64 bg_blkno; 5825 handle_t *handle; 5826 struct inode *inode; 5827 struct buffer_head *di_bh = NULL; 5828 struct ocfs2_cached_block_free *tmp; 5829 5830 inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot); 5831 if (!inode) { 5832 ret = -EINVAL; 5833 mlog_errno(ret); 5834 goto out; 5835 } 5836 5837 mutex_lock(&inode->i_mutex); 5838 5839 ret = ocfs2_inode_lock(inode, &di_bh, 1); 5840 if (ret) { 5841 mlog_errno(ret); 5842 goto out_mutex; 5843 } 5844 5845 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); 5846 if (IS_ERR(handle)) { 5847 ret = PTR_ERR(handle); 5848 mlog_errno(ret); 5849 goto out_unlock; 5850 } 5851 5852 while (head) { 5853 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 5854 head->free_bit); 5855 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 5856 head->free_bit, (unsigned long long)head->free_blk); 5857 5858 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, 5859 head->free_bit, bg_blkno, 1); 5860 if (ret) { 5861 mlog_errno(ret); 5862 goto out_journal; 5863 } 5864 5865 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); 5866 if (ret) { 5867 mlog_errno(ret); 5868 goto out_journal; 5869 } 5870 5871 tmp = head; 5872 head = head->free_next; 5873 kfree(tmp); 5874 } 5875 5876 out_journal: 5877 ocfs2_commit_trans(osb, handle); 5878 5879 out_unlock: 5880 ocfs2_inode_unlock(inode, 1); 5881 brelse(di_bh); 5882 out_mutex: 5883 mutex_unlock(&inode->i_mutex); 5884 iput(inode); 5885 out: 5886 while(head) { 5887 /* Premature exit may have left some dangling items. */ 5888 tmp = head; 5889 head = head->free_next; 5890 kfree(tmp); 5891 } 5892 5893 return ret; 5894 } 5895 5896 int ocfs2_run_deallocs(struct ocfs2_super *osb, 5897 struct ocfs2_cached_dealloc_ctxt *ctxt) 5898 { 5899 int ret = 0, ret2; 5900 struct ocfs2_per_slot_free_list *fl; 5901 5902 if (!ctxt) 5903 return 0; 5904 5905 while (ctxt->c_first_suballocator) { 5906 fl = ctxt->c_first_suballocator; 5907 5908 if (fl->f_first) { 5909 mlog(0, "Free items: (type %u, slot %d)\n", 5910 fl->f_inode_type, fl->f_slot); 5911 ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, 5912 fl->f_slot, fl->f_first); 5913 if (ret2) 5914 mlog_errno(ret2); 5915 if (!ret) 5916 ret = ret2; 5917 } 5918 5919 ctxt->c_first_suballocator = fl->f_next_suballocator; 5920 kfree(fl); 5921 } 5922 5923 return ret; 5924 } 5925 5926 static struct ocfs2_per_slot_free_list * 5927 ocfs2_find_per_slot_free_list(int type, 5928 int slot, 5929 struct ocfs2_cached_dealloc_ctxt *ctxt) 5930 { 5931 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; 5932 5933 while (fl) { 5934 if (fl->f_inode_type == type && fl->f_slot == slot) 5935 return fl; 5936 5937 fl = fl->f_next_suballocator; 5938 } 5939 5940 fl = kmalloc(sizeof(*fl), GFP_NOFS); 5941 if (fl) { 5942 fl->f_inode_type = type; 5943 fl->f_slot = slot; 5944 fl->f_first = NULL; 5945 fl->f_next_suballocator = ctxt->c_first_suballocator; 5946 5947 ctxt->c_first_suballocator = fl; 5948 } 5949 return fl; 5950 } 5951 5952 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 5953 int type, int slot, u64 blkno, 5954 unsigned int bit) 5955 { 5956 int ret; 5957 struct ocfs2_per_slot_free_list *fl; 5958 struct ocfs2_cached_block_free *item; 5959 5960 fl = ocfs2_find_per_slot_free_list(type, slot, ctxt); 5961 if (fl == NULL) { 5962 ret = -ENOMEM; 5963 mlog_errno(ret); 5964 goto out; 5965 } 5966 5967 item = kmalloc(sizeof(*item), GFP_NOFS); 5968 if (item == NULL) { 5969 ret = -ENOMEM; 5970 mlog_errno(ret); 5971 goto out; 5972 } 5973 5974 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 5975 type, slot, bit, (unsigned long long)blkno); 5976 5977 item->free_blk = blkno; 5978 item->free_bit = bit; 5979 item->free_next = fl->f_first; 5980 5981 fl->f_first = item; 5982 5983 ret = 0; 5984 out: 5985 return ret; 5986 } 5987 5988 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 5989 struct ocfs2_extent_block *eb) 5990 { 5991 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 5992 le16_to_cpu(eb->h_suballoc_slot), 5993 le64_to_cpu(eb->h_blkno), 5994 le16_to_cpu(eb->h_suballoc_bit)); 5995 } 5996 5997 /* This function will figure out whether the currently last extent 5998 * block will be deleted, and if it will, what the new last extent 5999 * block will be so we can update his h_next_leaf_blk field, as well 6000 * as the dinodes i_last_eb_blk */ 6001 static int ocfs2_find_new_last_ext_blk(struct inode *inode, 6002 unsigned int clusters_to_del, 6003 struct ocfs2_path *path, 6004 struct buffer_head **new_last_eb) 6005 { 6006 int next_free, ret = 0; 6007 u32 cpos; 6008 struct ocfs2_extent_rec *rec; 6009 struct ocfs2_extent_block *eb; 6010 struct ocfs2_extent_list *el; 6011 struct buffer_head *bh = NULL; 6012 6013 *new_last_eb = NULL; 6014 6015 /* we have no tree, so of course, no last_eb. */ 6016 if (!path->p_tree_depth) 6017 goto out; 6018 6019 /* trunc to zero special case - this makes tree_depth = 0 6020 * regardless of what it is. */ 6021 if (OCFS2_I(inode)->ip_clusters == clusters_to_del) 6022 goto out; 6023 6024 el = path_leaf_el(path); 6025 BUG_ON(!el->l_next_free_rec); 6026 6027 /* 6028 * Make sure that this extent list will actually be empty 6029 * after we clear away the data. We can shortcut out if 6030 * there's more than one non-empty extent in the 6031 * list. Otherwise, a check of the remaining extent is 6032 * necessary. 6033 */ 6034 next_free = le16_to_cpu(el->l_next_free_rec); 6035 rec = NULL; 6036 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 6037 if (next_free > 2) 6038 goto out; 6039 6040 /* We may have a valid extent in index 1, check it. */ 6041 if (next_free == 2) 6042 rec = &el->l_recs[1]; 6043 6044 /* 6045 * Fall through - no more nonempty extents, so we want 6046 * to delete this leaf. 6047 */ 6048 } else { 6049 if (next_free > 1) 6050 goto out; 6051 6052 rec = &el->l_recs[0]; 6053 } 6054 6055 if (rec) { 6056 /* 6057 * Check it we'll only be trimming off the end of this 6058 * cluster. 6059 */ 6060 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) 6061 goto out; 6062 } 6063 6064 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); 6065 if (ret) { 6066 mlog_errno(ret); 6067 goto out; 6068 } 6069 6070 ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); 6071 if (ret) { 6072 mlog_errno(ret); 6073 goto out; 6074 } 6075 6076 eb = (struct ocfs2_extent_block *) bh->b_data; 6077 el = &eb->h_list; 6078 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 6079 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 6080 ret = -EROFS; 6081 goto out; 6082 } 6083 6084 *new_last_eb = bh; 6085 get_bh(*new_last_eb); 6086 mlog(0, "returning block %llu, (cpos: %u)\n", 6087 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); 6088 out: 6089 brelse(bh); 6090 6091 return ret; 6092 } 6093 6094 /* 6095 * Trim some clusters off the rightmost edge of a tree. Only called 6096 * during truncate. 6097 * 6098 * The caller needs to: 6099 * - start journaling of each path component. 6100 * - compute and fully set up any new last ext block 6101 */ 6102 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, 6103 handle_t *handle, struct ocfs2_truncate_context *tc, 6104 u32 clusters_to_del, u64 *delete_start) 6105 { 6106 int ret, i, index = path->p_tree_depth; 6107 u32 new_edge = 0; 6108 u64 deleted_eb = 0; 6109 struct buffer_head *bh; 6110 struct ocfs2_extent_list *el; 6111 struct ocfs2_extent_rec *rec; 6112 6113 *delete_start = 0; 6114 6115 while (index >= 0) { 6116 bh = path->p_node[index].bh; 6117 el = path->p_node[index].el; 6118 6119 mlog(0, "traveling tree (index = %d, block = %llu)\n", 6120 index, (unsigned long long)bh->b_blocknr); 6121 6122 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 6123 6124 if (index != 6125 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { 6126 ocfs2_error(inode->i_sb, 6127 "Inode %lu has invalid ext. block %llu", 6128 inode->i_ino, 6129 (unsigned long long)bh->b_blocknr); 6130 ret = -EROFS; 6131 goto out; 6132 } 6133 6134 find_tail_record: 6135 i = le16_to_cpu(el->l_next_free_rec) - 1; 6136 rec = &el->l_recs[i]; 6137 6138 mlog(0, "Extent list before: record %d: (%u, %u, %llu), " 6139 "next = %u\n", i, le32_to_cpu(rec->e_cpos), 6140 ocfs2_rec_clusters(el, rec), 6141 (unsigned long long)le64_to_cpu(rec->e_blkno), 6142 le16_to_cpu(el->l_next_free_rec)); 6143 6144 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); 6145 6146 if (le16_to_cpu(el->l_tree_depth) == 0) { 6147 /* 6148 * If the leaf block contains a single empty 6149 * extent and no records, we can just remove 6150 * the block. 6151 */ 6152 if (i == 0 && ocfs2_is_empty_extent(rec)) { 6153 memset(rec, 0, 6154 sizeof(struct ocfs2_extent_rec)); 6155 el->l_next_free_rec = cpu_to_le16(0); 6156 6157 goto delete; 6158 } 6159 6160 /* 6161 * Remove any empty extents by shifting things 6162 * left. That should make life much easier on 6163 * the code below. This condition is rare 6164 * enough that we shouldn't see a performance 6165 * hit. 6166 */ 6167 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 6168 le16_add_cpu(&el->l_next_free_rec, -1); 6169 6170 for(i = 0; 6171 i < le16_to_cpu(el->l_next_free_rec); i++) 6172 el->l_recs[i] = el->l_recs[i + 1]; 6173 6174 memset(&el->l_recs[i], 0, 6175 sizeof(struct ocfs2_extent_rec)); 6176 6177 /* 6178 * We've modified our extent list. The 6179 * simplest way to handle this change 6180 * is to being the search from the 6181 * start again. 6182 */ 6183 goto find_tail_record; 6184 } 6185 6186 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); 6187 6188 /* 6189 * We'll use "new_edge" on our way back up the 6190 * tree to know what our rightmost cpos is. 6191 */ 6192 new_edge = le16_to_cpu(rec->e_leaf_clusters); 6193 new_edge += le32_to_cpu(rec->e_cpos); 6194 6195 /* 6196 * The caller will use this to delete data blocks. 6197 */ 6198 *delete_start = le64_to_cpu(rec->e_blkno) 6199 + ocfs2_clusters_to_blocks(inode->i_sb, 6200 le16_to_cpu(rec->e_leaf_clusters)); 6201 6202 /* 6203 * If it's now empty, remove this record. 6204 */ 6205 if (le16_to_cpu(rec->e_leaf_clusters) == 0) { 6206 memset(rec, 0, 6207 sizeof(struct ocfs2_extent_rec)); 6208 le16_add_cpu(&el->l_next_free_rec, -1); 6209 } 6210 } else { 6211 if (le64_to_cpu(rec->e_blkno) == deleted_eb) { 6212 memset(rec, 0, 6213 sizeof(struct ocfs2_extent_rec)); 6214 le16_add_cpu(&el->l_next_free_rec, -1); 6215 6216 goto delete; 6217 } 6218 6219 /* Can this actually happen? */ 6220 if (le16_to_cpu(el->l_next_free_rec) == 0) 6221 goto delete; 6222 6223 /* 6224 * We never actually deleted any clusters 6225 * because our leaf was empty. There's no 6226 * reason to adjust the rightmost edge then. 6227 */ 6228 if (new_edge == 0) 6229 goto delete; 6230 6231 rec->e_int_clusters = cpu_to_le32(new_edge); 6232 le32_add_cpu(&rec->e_int_clusters, 6233 -le32_to_cpu(rec->e_cpos)); 6234 6235 /* 6236 * A deleted child record should have been 6237 * caught above. 6238 */ 6239 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); 6240 } 6241 6242 delete: 6243 ret = ocfs2_journal_dirty(handle, bh); 6244 if (ret) { 6245 mlog_errno(ret); 6246 goto out; 6247 } 6248 6249 mlog(0, "extent list container %llu, after: record %d: " 6250 "(%u, %u, %llu), next = %u.\n", 6251 (unsigned long long)bh->b_blocknr, i, 6252 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), 6253 (unsigned long long)le64_to_cpu(rec->e_blkno), 6254 le16_to_cpu(el->l_next_free_rec)); 6255 6256 /* 6257 * We must be careful to only attempt delete of an 6258 * extent block (and not the root inode block). 6259 */ 6260 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { 6261 struct ocfs2_extent_block *eb = 6262 (struct ocfs2_extent_block *)bh->b_data; 6263 6264 /* 6265 * Save this for use when processing the 6266 * parent block. 6267 */ 6268 deleted_eb = le64_to_cpu(eb->h_blkno); 6269 6270 mlog(0, "deleting this extent block.\n"); 6271 6272 ocfs2_remove_from_cache(inode, bh); 6273 6274 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); 6275 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 6276 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); 6277 6278 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb); 6279 /* An error here is not fatal. */ 6280 if (ret < 0) 6281 mlog_errno(ret); 6282 } else { 6283 deleted_eb = 0; 6284 } 6285 6286 index--; 6287 } 6288 6289 ret = 0; 6290 out: 6291 return ret; 6292 } 6293 6294 static int ocfs2_do_truncate(struct ocfs2_super *osb, 6295 unsigned int clusters_to_del, 6296 struct inode *inode, 6297 struct buffer_head *fe_bh, 6298 handle_t *handle, 6299 struct ocfs2_truncate_context *tc, 6300 struct ocfs2_path *path) 6301 { 6302 int status; 6303 struct ocfs2_dinode *fe; 6304 struct ocfs2_extent_block *last_eb = NULL; 6305 struct ocfs2_extent_list *el; 6306 struct buffer_head *last_eb_bh = NULL; 6307 u64 delete_blk = 0; 6308 6309 fe = (struct ocfs2_dinode *) fe_bh->b_data; 6310 6311 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, 6312 path, &last_eb_bh); 6313 if (status < 0) { 6314 mlog_errno(status); 6315 goto bail; 6316 } 6317 6318 /* 6319 * Each component will be touched, so we might as well journal 6320 * here to avoid having to handle errors later. 6321 */ 6322 status = ocfs2_journal_access_path(inode, handle, path); 6323 if (status < 0) { 6324 mlog_errno(status); 6325 goto bail; 6326 } 6327 6328 if (last_eb_bh) { 6329 status = ocfs2_journal_access(handle, inode, last_eb_bh, 6330 OCFS2_JOURNAL_ACCESS_WRITE); 6331 if (status < 0) { 6332 mlog_errno(status); 6333 goto bail; 6334 } 6335 6336 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 6337 } 6338 6339 el = &(fe->id2.i_list); 6340 6341 /* 6342 * Lower levels depend on this never happening, but it's best 6343 * to check it up here before changing the tree. 6344 */ 6345 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { 6346 ocfs2_error(inode->i_sb, 6347 "Inode %lu has an empty extent record, depth %u\n", 6348 inode->i_ino, le16_to_cpu(el->l_tree_depth)); 6349 status = -EROFS; 6350 goto bail; 6351 } 6352 6353 spin_lock(&OCFS2_I(inode)->ip_lock); 6354 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6355 clusters_to_del; 6356 spin_unlock(&OCFS2_I(inode)->ip_lock); 6357 le32_add_cpu(&fe->i_clusters, -clusters_to_del); 6358 inode->i_blocks = ocfs2_inode_sector_count(inode); 6359 6360 status = ocfs2_trim_tree(inode, path, handle, tc, 6361 clusters_to_del, &delete_blk); 6362 if (status) { 6363 mlog_errno(status); 6364 goto bail; 6365 } 6366 6367 if (le32_to_cpu(fe->i_clusters) == 0) { 6368 /* trunc to zero is a special case. */ 6369 el->l_tree_depth = 0; 6370 fe->i_last_eb_blk = 0; 6371 } else if (last_eb) 6372 fe->i_last_eb_blk = last_eb->h_blkno; 6373 6374 status = ocfs2_journal_dirty(handle, fe_bh); 6375 if (status < 0) { 6376 mlog_errno(status); 6377 goto bail; 6378 } 6379 6380 if (last_eb) { 6381 /* If there will be a new last extent block, then by 6382 * definition, there cannot be any leaves to the right of 6383 * him. */ 6384 last_eb->h_next_leaf_blk = 0; 6385 status = ocfs2_journal_dirty(handle, last_eb_bh); 6386 if (status < 0) { 6387 mlog_errno(status); 6388 goto bail; 6389 } 6390 } 6391 6392 if (delete_blk) { 6393 status = ocfs2_truncate_log_append(osb, handle, delete_blk, 6394 clusters_to_del); 6395 if (status < 0) { 6396 mlog_errno(status); 6397 goto bail; 6398 } 6399 } 6400 status = 0; 6401 bail: 6402 6403 mlog_exit(status); 6404 return status; 6405 } 6406 6407 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) 6408 { 6409 set_buffer_uptodate(bh); 6410 mark_buffer_dirty(bh); 6411 return 0; 6412 } 6413 6414 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, 6415 unsigned int from, unsigned int to, 6416 struct page *page, int zero, u64 *phys) 6417 { 6418 int ret, partial = 0; 6419 6420 ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); 6421 if (ret) 6422 mlog_errno(ret); 6423 6424 if (zero) 6425 zero_user_segment(page, from, to); 6426 6427 /* 6428 * Need to set the buffers we zero'd into uptodate 6429 * here if they aren't - ocfs2_map_page_blocks() 6430 * might've skipped some 6431 */ 6432 ret = walk_page_buffers(handle, page_buffers(page), 6433 from, to, &partial, 6434 ocfs2_zero_func); 6435 if (ret < 0) 6436 mlog_errno(ret); 6437 else if (ocfs2_should_order_data(inode)) { 6438 ret = ocfs2_jbd2_file_inode(handle, inode); 6439 #ifdef CONFIG_OCFS2_COMPAT_JBD 6440 ret = walk_page_buffers(handle, page_buffers(page), 6441 from, to, &partial, 6442 ocfs2_journal_dirty_data); 6443 #endif 6444 if (ret < 0) 6445 mlog_errno(ret); 6446 } 6447 6448 if (!partial) 6449 SetPageUptodate(page); 6450 6451 flush_dcache_page(page); 6452 } 6453 6454 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, 6455 loff_t end, struct page **pages, 6456 int numpages, u64 phys, handle_t *handle) 6457 { 6458 int i; 6459 struct page *page; 6460 unsigned int from, to = PAGE_CACHE_SIZE; 6461 struct super_block *sb = inode->i_sb; 6462 6463 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 6464 6465 if (numpages == 0) 6466 goto out; 6467 6468 to = PAGE_CACHE_SIZE; 6469 for(i = 0; i < numpages; i++) { 6470 page = pages[i]; 6471 6472 from = start & (PAGE_CACHE_SIZE - 1); 6473 if ((end >> PAGE_CACHE_SHIFT) == page->index) 6474 to = end & (PAGE_CACHE_SIZE - 1); 6475 6476 BUG_ON(from > PAGE_CACHE_SIZE); 6477 BUG_ON(to > PAGE_CACHE_SIZE); 6478 6479 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, 6480 &phys); 6481 6482 start = (page->index + 1) << PAGE_CACHE_SHIFT; 6483 } 6484 out: 6485 if (pages) 6486 ocfs2_unlock_and_free_pages(pages, numpages); 6487 } 6488 6489 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, 6490 struct page **pages, int *num) 6491 { 6492 int numpages, ret = 0; 6493 struct super_block *sb = inode->i_sb; 6494 struct address_space *mapping = inode->i_mapping; 6495 unsigned long index; 6496 loff_t last_page_bytes; 6497 6498 BUG_ON(start > end); 6499 6500 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != 6501 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); 6502 6503 numpages = 0; 6504 last_page_bytes = PAGE_ALIGN(end); 6505 index = start >> PAGE_CACHE_SHIFT; 6506 do { 6507 pages[numpages] = grab_cache_page(mapping, index); 6508 if (!pages[numpages]) { 6509 ret = -ENOMEM; 6510 mlog_errno(ret); 6511 goto out; 6512 } 6513 6514 numpages++; 6515 index++; 6516 } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); 6517 6518 out: 6519 if (ret != 0) { 6520 if (pages) 6521 ocfs2_unlock_and_free_pages(pages, numpages); 6522 numpages = 0; 6523 } 6524 6525 *num = numpages; 6526 6527 return ret; 6528 } 6529 6530 /* 6531 * Zero the area past i_size but still within an allocated 6532 * cluster. This avoids exposing nonzero data on subsequent file 6533 * extends. 6534 * 6535 * We need to call this before i_size is updated on the inode because 6536 * otherwise block_write_full_page() will skip writeout of pages past 6537 * i_size. The new_i_size parameter is passed for this reason. 6538 */ 6539 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, 6540 u64 range_start, u64 range_end) 6541 { 6542 int ret = 0, numpages; 6543 struct page **pages = NULL; 6544 u64 phys; 6545 unsigned int ext_flags; 6546 struct super_block *sb = inode->i_sb; 6547 6548 /* 6549 * File systems which don't support sparse files zero on every 6550 * extend. 6551 */ 6552 if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) 6553 return 0; 6554 6555 pages = kcalloc(ocfs2_pages_per_cluster(sb), 6556 sizeof(struct page *), GFP_NOFS); 6557 if (pages == NULL) { 6558 ret = -ENOMEM; 6559 mlog_errno(ret); 6560 goto out; 6561 } 6562 6563 if (range_start == range_end) 6564 goto out; 6565 6566 ret = ocfs2_extent_map_get_blocks(inode, 6567 range_start >> sb->s_blocksize_bits, 6568 &phys, NULL, &ext_flags); 6569 if (ret) { 6570 mlog_errno(ret); 6571 goto out; 6572 } 6573 6574 /* 6575 * Tail is a hole, or is marked unwritten. In either case, we 6576 * can count on read and write to return/push zero's. 6577 */ 6578 if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN) 6579 goto out; 6580 6581 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, 6582 &numpages); 6583 if (ret) { 6584 mlog_errno(ret); 6585 goto out; 6586 } 6587 6588 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, 6589 numpages, phys, handle); 6590 6591 /* 6592 * Initiate writeout of the pages we zero'd here. We don't 6593 * wait on them - the truncate_inode_pages() call later will 6594 * do that for us. 6595 */ 6596 ret = do_sync_mapping_range(inode->i_mapping, range_start, 6597 range_end - 1, SYNC_FILE_RANGE_WRITE); 6598 if (ret) 6599 mlog_errno(ret); 6600 6601 out: 6602 if (pages) 6603 kfree(pages); 6604 6605 return ret; 6606 } 6607 6608 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode, 6609 struct ocfs2_dinode *di) 6610 { 6611 unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; 6612 unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); 6613 6614 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) 6615 memset(&di->id2, 0, blocksize - 6616 offsetof(struct ocfs2_dinode, id2) - 6617 xattrsize); 6618 else 6619 memset(&di->id2, 0, blocksize - 6620 offsetof(struct ocfs2_dinode, id2)); 6621 } 6622 6623 void ocfs2_dinode_new_extent_list(struct inode *inode, 6624 struct ocfs2_dinode *di) 6625 { 6626 ocfs2_zero_dinode_id2_with_xattr(inode, di); 6627 di->id2.i_list.l_tree_depth = 0; 6628 di->id2.i_list.l_next_free_rec = 0; 6629 di->id2.i_list.l_count = cpu_to_le16( 6630 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di)); 6631 } 6632 6633 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) 6634 { 6635 struct ocfs2_inode_info *oi = OCFS2_I(inode); 6636 struct ocfs2_inline_data *idata = &di->id2.i_data; 6637 6638 spin_lock(&oi->ip_lock); 6639 oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL; 6640 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 6641 spin_unlock(&oi->ip_lock); 6642 6643 /* 6644 * We clear the entire i_data structure here so that all 6645 * fields can be properly initialized. 6646 */ 6647 ocfs2_zero_dinode_id2_with_xattr(inode, di); 6648 6649 idata->id_count = cpu_to_le16( 6650 ocfs2_max_inline_data_with_xattr(inode->i_sb, di)); 6651 } 6652 6653 int ocfs2_convert_inline_data_to_extents(struct inode *inode, 6654 struct buffer_head *di_bh) 6655 { 6656 int ret, i, has_data, num_pages = 0; 6657 handle_t *handle; 6658 u64 uninitialized_var(block); 6659 struct ocfs2_inode_info *oi = OCFS2_I(inode); 6660 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6661 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 6662 struct ocfs2_alloc_context *data_ac = NULL; 6663 struct page **pages = NULL; 6664 loff_t end = osb->s_clustersize; 6665 struct ocfs2_extent_tree et; 6666 6667 has_data = i_size_read(inode) ? 1 : 0; 6668 6669 if (has_data) { 6670 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb), 6671 sizeof(struct page *), GFP_NOFS); 6672 if (pages == NULL) { 6673 ret = -ENOMEM; 6674 mlog_errno(ret); 6675 goto out; 6676 } 6677 6678 ret = ocfs2_reserve_clusters(osb, 1, &data_ac); 6679 if (ret) { 6680 mlog_errno(ret); 6681 goto out; 6682 } 6683 } 6684 6685 handle = ocfs2_start_trans(osb, OCFS2_INLINE_TO_EXTENTS_CREDITS); 6686 if (IS_ERR(handle)) { 6687 ret = PTR_ERR(handle); 6688 mlog_errno(ret); 6689 goto out_unlock; 6690 } 6691 6692 ret = ocfs2_journal_access(handle, inode, di_bh, 6693 OCFS2_JOURNAL_ACCESS_WRITE); 6694 if (ret) { 6695 mlog_errno(ret); 6696 goto out_commit; 6697 } 6698 6699 if (has_data) { 6700 u32 bit_off, num; 6701 unsigned int page_end; 6702 u64 phys; 6703 6704 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6705 &num); 6706 if (ret) { 6707 mlog_errno(ret); 6708 goto out_commit; 6709 } 6710 6711 /* 6712 * Save two copies, one for insert, and one that can 6713 * be changed by ocfs2_map_and_dirty_page() below. 6714 */ 6715 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); 6716 6717 /* 6718 * Non sparse file systems zero on extend, so no need 6719 * to do that now. 6720 */ 6721 if (!ocfs2_sparse_alloc(osb) && 6722 PAGE_CACHE_SIZE < osb->s_clustersize) 6723 end = PAGE_CACHE_SIZE; 6724 6725 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); 6726 if (ret) { 6727 mlog_errno(ret); 6728 goto out_commit; 6729 } 6730 6731 /* 6732 * This should populate the 1st page for us and mark 6733 * it up to date. 6734 */ 6735 ret = ocfs2_read_inline_data(inode, pages[0], di_bh); 6736 if (ret) { 6737 mlog_errno(ret); 6738 goto out_commit; 6739 } 6740 6741 page_end = PAGE_CACHE_SIZE; 6742 if (PAGE_CACHE_SIZE > osb->s_clustersize) 6743 page_end = osb->s_clustersize; 6744 6745 for (i = 0; i < num_pages; i++) 6746 ocfs2_map_and_dirty_page(inode, handle, 0, page_end, 6747 pages[i], i > 0, &phys); 6748 } 6749 6750 spin_lock(&oi->ip_lock); 6751 oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL; 6752 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 6753 spin_unlock(&oi->ip_lock); 6754 6755 ocfs2_dinode_new_extent_list(inode, di); 6756 6757 ocfs2_journal_dirty(handle, di_bh); 6758 6759 if (has_data) { 6760 /* 6761 * An error at this point should be extremely rare. If 6762 * this proves to be false, we could always re-build 6763 * the in-inode data from our pages. 6764 */ 6765 ocfs2_init_dinode_extent_tree(&et, inode, di_bh); 6766 ret = ocfs2_insert_extent(osb, handle, inode, &et, 6767 0, block, 1, 0, NULL); 6768 if (ret) { 6769 mlog_errno(ret); 6770 goto out_commit; 6771 } 6772 6773 inode->i_blocks = ocfs2_inode_sector_count(inode); 6774 } 6775 6776 out_commit: 6777 ocfs2_commit_trans(osb, handle); 6778 6779 out_unlock: 6780 if (data_ac) 6781 ocfs2_free_alloc_context(data_ac); 6782 6783 out: 6784 if (pages) { 6785 ocfs2_unlock_and_free_pages(pages, num_pages); 6786 kfree(pages); 6787 } 6788 6789 return ret; 6790 } 6791 6792 /* 6793 * It is expected, that by the time you call this function, 6794 * inode->i_size and fe->i_size have been adjusted. 6795 * 6796 * WARNING: This will kfree the truncate context 6797 */ 6798 int ocfs2_commit_truncate(struct ocfs2_super *osb, 6799 struct inode *inode, 6800 struct buffer_head *fe_bh, 6801 struct ocfs2_truncate_context *tc) 6802 { 6803 int status, i, credits, tl_sem = 0; 6804 u32 clusters_to_del, new_highest_cpos, range; 6805 struct ocfs2_extent_list *el; 6806 handle_t *handle = NULL; 6807 struct inode *tl_inode = osb->osb_tl_inode; 6808 struct ocfs2_path *path = NULL; 6809 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 6810 6811 mlog_entry_void(); 6812 6813 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 6814 i_size_read(inode)); 6815 6816 path = ocfs2_new_path(fe_bh, &di->id2.i_list); 6817 if (!path) { 6818 status = -ENOMEM; 6819 mlog_errno(status); 6820 goto bail; 6821 } 6822 6823 ocfs2_extent_map_trunc(inode, new_highest_cpos); 6824 6825 start: 6826 /* 6827 * Check that we still have allocation to delete. 6828 */ 6829 if (OCFS2_I(inode)->ip_clusters == 0) { 6830 status = 0; 6831 goto bail; 6832 } 6833 6834 /* 6835 * Truncate always works against the rightmost tree branch. 6836 */ 6837 status = ocfs2_find_path(inode, path, UINT_MAX); 6838 if (status) { 6839 mlog_errno(status); 6840 goto bail; 6841 } 6842 6843 mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", 6844 OCFS2_I(inode)->ip_clusters, path->p_tree_depth); 6845 6846 /* 6847 * By now, el will point to the extent list on the bottom most 6848 * portion of this tree. Only the tail record is considered in 6849 * each pass. 6850 * 6851 * We handle the following cases, in order: 6852 * - empty extent: delete the remaining branch 6853 * - remove the entire record 6854 * - remove a partial record 6855 * - no record needs to be removed (truncate has completed) 6856 */ 6857 el = path_leaf_el(path); 6858 if (le16_to_cpu(el->l_next_free_rec) == 0) { 6859 ocfs2_error(inode->i_sb, 6860 "Inode %llu has empty extent block at %llu\n", 6861 (unsigned long long)OCFS2_I(inode)->ip_blkno, 6862 (unsigned long long)path_leaf_bh(path)->b_blocknr); 6863 status = -EROFS; 6864 goto bail; 6865 } 6866 6867 i = le16_to_cpu(el->l_next_free_rec) - 1; 6868 range = le32_to_cpu(el->l_recs[i].e_cpos) + 6869 ocfs2_rec_clusters(el, &el->l_recs[i]); 6870 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 6871 clusters_to_del = 0; 6872 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 6873 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 6874 } else if (range > new_highest_cpos) { 6875 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 6876 le32_to_cpu(el->l_recs[i].e_cpos)) - 6877 new_highest_cpos; 6878 } else { 6879 status = 0; 6880 goto bail; 6881 } 6882 6883 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 6884 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); 6885 6886 mutex_lock(&tl_inode->i_mutex); 6887 tl_sem = 1; 6888 /* ocfs2_truncate_log_needs_flush guarantees us at least one 6889 * record is free for use. If there isn't any, we flush to get 6890 * an empty truncate log. */ 6891 if (ocfs2_truncate_log_needs_flush(osb)) { 6892 status = __ocfs2_flush_truncate_log(osb); 6893 if (status < 0) { 6894 mlog_errno(status); 6895 goto bail; 6896 } 6897 } 6898 6899 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 6900 (struct ocfs2_dinode *)fe_bh->b_data, 6901 el); 6902 handle = ocfs2_start_trans(osb, credits); 6903 if (IS_ERR(handle)) { 6904 status = PTR_ERR(handle); 6905 handle = NULL; 6906 mlog_errno(status); 6907 goto bail; 6908 } 6909 6910 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, 6911 tc, path); 6912 if (status < 0) { 6913 mlog_errno(status); 6914 goto bail; 6915 } 6916 6917 mutex_unlock(&tl_inode->i_mutex); 6918 tl_sem = 0; 6919 6920 ocfs2_commit_trans(osb, handle); 6921 handle = NULL; 6922 6923 ocfs2_reinit_path(path, 1); 6924 6925 /* 6926 * The check above will catch the case where we've truncated 6927 * away all allocation. 6928 */ 6929 goto start; 6930 6931 bail: 6932 6933 ocfs2_schedule_truncate_log_flush(osb, 1); 6934 6935 if (tl_sem) 6936 mutex_unlock(&tl_inode->i_mutex); 6937 6938 if (handle) 6939 ocfs2_commit_trans(osb, handle); 6940 6941 ocfs2_run_deallocs(osb, &tc->tc_dealloc); 6942 6943 ocfs2_free_path(path); 6944 6945 /* This will drop the ext_alloc cluster lock for us */ 6946 ocfs2_free_truncate_context(tc); 6947 6948 mlog_exit(status); 6949 return status; 6950 } 6951 6952 /* 6953 * Expects the inode to already be locked. 6954 */ 6955 int ocfs2_prepare_truncate(struct ocfs2_super *osb, 6956 struct inode *inode, 6957 struct buffer_head *fe_bh, 6958 struct ocfs2_truncate_context **tc) 6959 { 6960 int status; 6961 unsigned int new_i_clusters; 6962 struct ocfs2_dinode *fe; 6963 struct ocfs2_extent_block *eb; 6964 struct buffer_head *last_eb_bh = NULL; 6965 6966 mlog_entry_void(); 6967 6968 *tc = NULL; 6969 6970 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, 6971 i_size_read(inode)); 6972 fe = (struct ocfs2_dinode *) fe_bh->b_data; 6973 6974 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" 6975 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters, 6976 (unsigned long long)le64_to_cpu(fe->i_size)); 6977 6978 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); 6979 if (!(*tc)) { 6980 status = -ENOMEM; 6981 mlog_errno(status); 6982 goto bail; 6983 } 6984 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 6985 6986 if (fe->id2.i_list.l_tree_depth) { 6987 status = ocfs2_read_block(inode, le64_to_cpu(fe->i_last_eb_blk), 6988 &last_eb_bh); 6989 if (status < 0) { 6990 mlog_errno(status); 6991 goto bail; 6992 } 6993 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 6994 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 6995 OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 6996 6997 brelse(last_eb_bh); 6998 status = -EIO; 6999 goto bail; 7000 } 7001 } 7002 7003 (*tc)->tc_last_eb_bh = last_eb_bh; 7004 7005 status = 0; 7006 bail: 7007 if (status < 0) { 7008 if (*tc) 7009 ocfs2_free_truncate_context(*tc); 7010 *tc = NULL; 7011 } 7012 mlog_exit_void(); 7013 return status; 7014 } 7015 7016 /* 7017 * 'start' is inclusive, 'end' is not. 7018 */ 7019 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 7020 unsigned int start, unsigned int end, int trunc) 7021 { 7022 int ret; 7023 unsigned int numbytes; 7024 handle_t *handle; 7025 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 7026 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 7027 struct ocfs2_inline_data *idata = &di->id2.i_data; 7028 7029 if (end > i_size_read(inode)) 7030 end = i_size_read(inode); 7031 7032 BUG_ON(start >= end); 7033 7034 if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || 7035 !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || 7036 !ocfs2_supports_inline_data(osb)) { 7037 ocfs2_error(inode->i_sb, 7038 "Inline data flags for inode %llu don't agree! " 7039 "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", 7040 (unsigned long long)OCFS2_I(inode)->ip_blkno, 7041 le16_to_cpu(di->i_dyn_features), 7042 OCFS2_I(inode)->ip_dyn_features, 7043 osb->s_feature_incompat); 7044 ret = -EROFS; 7045 goto out; 7046 } 7047 7048 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 7049 if (IS_ERR(handle)) { 7050 ret = PTR_ERR(handle); 7051 mlog_errno(ret); 7052 goto out; 7053 } 7054 7055 ret = ocfs2_journal_access(handle, inode, di_bh, 7056 OCFS2_JOURNAL_ACCESS_WRITE); 7057 if (ret) { 7058 mlog_errno(ret); 7059 goto out_commit; 7060 } 7061 7062 numbytes = end - start; 7063 memset(idata->id_data + start, 0, numbytes); 7064 7065 /* 7066 * No need to worry about the data page here - it's been 7067 * truncated already and inline data doesn't need it for 7068 * pushing zero's to disk, so we'll let readpage pick it up 7069 * later. 7070 */ 7071 if (trunc) { 7072 i_size_write(inode, start); 7073 di->i_size = cpu_to_le64(start); 7074 } 7075 7076 inode->i_blocks = ocfs2_inode_sector_count(inode); 7077 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 7078 7079 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 7080 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 7081 7082 ocfs2_journal_dirty(handle, di_bh); 7083 7084 out_commit: 7085 ocfs2_commit_trans(osb, handle); 7086 7087 out: 7088 return ret; 7089 } 7090 7091 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 7092 { 7093 /* 7094 * The caller is responsible for completing deallocation 7095 * before freeing the context. 7096 */ 7097 if (tc->tc_dealloc.c_first_suballocator != NULL) 7098 mlog(ML_NOTICE, 7099 "Truncate completion has non-empty dealloc context\n"); 7100 7101 brelse(tc->tc_last_eb_bh); 7102 7103 kfree(tc); 7104 } 7105