1 /* -*- mode: c; c-basic-offset: 8; -*- 2 * vim: noexpandtab sw=8 ts=8 sts=0: 3 * 4 * alloc.c 5 * 6 * Extent allocs and frees 7 * 8 * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9 * 10 * This program is free software; you can redistribute it and/or 11 * modify it under the terms of the GNU General Public 12 * License as published by the Free Software Foundation; either 13 * version 2 of the License, or (at your option) any later version. 14 * 15 * This program is distributed in the hope that it will be useful, 16 * but WITHOUT ANY WARRANTY; without even the implied warranty of 17 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18 * General Public License for more details. 19 * 20 * You should have received a copy of the GNU General Public 21 * License along with this program; if not, write to the 22 * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23 * Boston, MA 021110-1307, USA. 24 */ 25 26 #include <linux/fs.h> 27 #include <linux/types.h> 28 #include <linux/slab.h> 29 #include <linux/highmem.h> 30 #include <linux/swap.h> 31 #include <linux/quotaops.h> 32 33 #define MLOG_MASK_PREFIX ML_DISK_ALLOC 34 #include <cluster/masklog.h> 35 36 #include "ocfs2.h" 37 38 #include "alloc.h" 39 #include "aops.h" 40 #include "blockcheck.h" 41 #include "dlmglue.h" 42 #include "extent_map.h" 43 #include "inode.h" 44 #include "journal.h" 45 #include "localalloc.h" 46 #include "suballoc.h" 47 #include "sysfile.h" 48 #include "file.h" 49 #include "super.h" 50 #include "uptodate.h" 51 52 #include "buffer_head_io.h" 53 54 55 /* 56 * Operations for a specific extent tree type. 57 * 58 * To implement an on-disk btree (extent tree) type in ocfs2, add 59 * an ocfs2_extent_tree_operations structure and the matching 60 * ocfs2_init_<thingy>_extent_tree() function. That's pretty much it 61 * for the allocation portion of the extent tree. 62 */ 63 struct ocfs2_extent_tree_operations { 64 /* 65 * last_eb_blk is the block number of the right most leaf extent 66 * block. Most on-disk structures containing an extent tree store 67 * this value for fast access. The ->eo_set_last_eb_blk() and 68 * ->eo_get_last_eb_blk() operations access this value. They are 69 * both required. 70 */ 71 void (*eo_set_last_eb_blk)(struct ocfs2_extent_tree *et, 72 u64 blkno); 73 u64 (*eo_get_last_eb_blk)(struct ocfs2_extent_tree *et); 74 75 /* 76 * The on-disk structure usually keeps track of how many total 77 * clusters are stored in this extent tree. This function updates 78 * that value. new_clusters is the delta, and must be 79 * added to the total. Required. 80 */ 81 void (*eo_update_clusters)(struct inode *inode, 82 struct ocfs2_extent_tree *et, 83 u32 new_clusters); 84 85 /* 86 * If ->eo_insert_check() exists, it is called before rec is 87 * inserted into the extent tree. It is optional. 88 */ 89 int (*eo_insert_check)(struct inode *inode, 90 struct ocfs2_extent_tree *et, 91 struct ocfs2_extent_rec *rec); 92 int (*eo_sanity_check)(struct inode *inode, struct ocfs2_extent_tree *et); 93 94 /* 95 * -------------------------------------------------------------- 96 * The remaining are internal to ocfs2_extent_tree and don't have 97 * accessor functions 98 */ 99 100 /* 101 * ->eo_fill_root_el() takes et->et_object and sets et->et_root_el. 102 * It is required. 103 */ 104 void (*eo_fill_root_el)(struct ocfs2_extent_tree *et); 105 106 /* 107 * ->eo_fill_max_leaf_clusters sets et->et_max_leaf_clusters if 108 * it exists. If it does not, et->et_max_leaf_clusters is set 109 * to 0 (unlimited). Optional. 110 */ 111 void (*eo_fill_max_leaf_clusters)(struct inode *inode, 112 struct ocfs2_extent_tree *et); 113 }; 114 115 116 /* 117 * Pre-declare ocfs2_dinode_et_ops so we can use it as a sanity check 118 * in the methods. 119 */ 120 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et); 121 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, 122 u64 blkno); 123 static void ocfs2_dinode_update_clusters(struct inode *inode, 124 struct ocfs2_extent_tree *et, 125 u32 clusters); 126 static int ocfs2_dinode_insert_check(struct inode *inode, 127 struct ocfs2_extent_tree *et, 128 struct ocfs2_extent_rec *rec); 129 static int ocfs2_dinode_sanity_check(struct inode *inode, 130 struct ocfs2_extent_tree *et); 131 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et); 132 static struct ocfs2_extent_tree_operations ocfs2_dinode_et_ops = { 133 .eo_set_last_eb_blk = ocfs2_dinode_set_last_eb_blk, 134 .eo_get_last_eb_blk = ocfs2_dinode_get_last_eb_blk, 135 .eo_update_clusters = ocfs2_dinode_update_clusters, 136 .eo_insert_check = ocfs2_dinode_insert_check, 137 .eo_sanity_check = ocfs2_dinode_sanity_check, 138 .eo_fill_root_el = ocfs2_dinode_fill_root_el, 139 }; 140 141 static void ocfs2_dinode_set_last_eb_blk(struct ocfs2_extent_tree *et, 142 u64 blkno) 143 { 144 struct ocfs2_dinode *di = et->et_object; 145 146 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); 147 di->i_last_eb_blk = cpu_to_le64(blkno); 148 } 149 150 static u64 ocfs2_dinode_get_last_eb_blk(struct ocfs2_extent_tree *et) 151 { 152 struct ocfs2_dinode *di = et->et_object; 153 154 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); 155 return le64_to_cpu(di->i_last_eb_blk); 156 } 157 158 static void ocfs2_dinode_update_clusters(struct inode *inode, 159 struct ocfs2_extent_tree *et, 160 u32 clusters) 161 { 162 struct ocfs2_dinode *di = et->et_object; 163 164 le32_add_cpu(&di->i_clusters, clusters); 165 spin_lock(&OCFS2_I(inode)->ip_lock); 166 OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); 167 spin_unlock(&OCFS2_I(inode)->ip_lock); 168 } 169 170 static int ocfs2_dinode_insert_check(struct inode *inode, 171 struct ocfs2_extent_tree *et, 172 struct ocfs2_extent_rec *rec) 173 { 174 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 175 176 BUG_ON(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL); 177 mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 178 (OCFS2_I(inode)->ip_clusters != rec->e_cpos), 179 "Device %s, asking for sparse allocation: inode %llu, " 180 "cpos %u, clusters %u\n", 181 osb->dev_str, 182 (unsigned long long)OCFS2_I(inode)->ip_blkno, 183 rec->e_cpos, 184 OCFS2_I(inode)->ip_clusters); 185 186 return 0; 187 } 188 189 static int ocfs2_dinode_sanity_check(struct inode *inode, 190 struct ocfs2_extent_tree *et) 191 { 192 struct ocfs2_dinode *di = et->et_object; 193 194 BUG_ON(et->et_ops != &ocfs2_dinode_et_ops); 195 BUG_ON(!OCFS2_IS_VALID_DINODE(di)); 196 197 return 0; 198 } 199 200 static void ocfs2_dinode_fill_root_el(struct ocfs2_extent_tree *et) 201 { 202 struct ocfs2_dinode *di = et->et_object; 203 204 et->et_root_el = &di->id2.i_list; 205 } 206 207 208 static void ocfs2_xattr_value_fill_root_el(struct ocfs2_extent_tree *et) 209 { 210 struct ocfs2_xattr_value_root *xv = et->et_object; 211 212 et->et_root_el = &xv->xr_list; 213 } 214 215 static void ocfs2_xattr_value_set_last_eb_blk(struct ocfs2_extent_tree *et, 216 u64 blkno) 217 { 218 struct ocfs2_xattr_value_root *xv = 219 (struct ocfs2_xattr_value_root *)et->et_object; 220 221 xv->xr_last_eb_blk = cpu_to_le64(blkno); 222 } 223 224 static u64 ocfs2_xattr_value_get_last_eb_blk(struct ocfs2_extent_tree *et) 225 { 226 struct ocfs2_xattr_value_root *xv = 227 (struct ocfs2_xattr_value_root *) et->et_object; 228 229 return le64_to_cpu(xv->xr_last_eb_blk); 230 } 231 232 static void ocfs2_xattr_value_update_clusters(struct inode *inode, 233 struct ocfs2_extent_tree *et, 234 u32 clusters) 235 { 236 struct ocfs2_xattr_value_root *xv = 237 (struct ocfs2_xattr_value_root *)et->et_object; 238 239 le32_add_cpu(&xv->xr_clusters, clusters); 240 } 241 242 static struct ocfs2_extent_tree_operations ocfs2_xattr_value_et_ops = { 243 .eo_set_last_eb_blk = ocfs2_xattr_value_set_last_eb_blk, 244 .eo_get_last_eb_blk = ocfs2_xattr_value_get_last_eb_blk, 245 .eo_update_clusters = ocfs2_xattr_value_update_clusters, 246 .eo_fill_root_el = ocfs2_xattr_value_fill_root_el, 247 }; 248 249 static void ocfs2_xattr_tree_fill_root_el(struct ocfs2_extent_tree *et) 250 { 251 struct ocfs2_xattr_block *xb = et->et_object; 252 253 et->et_root_el = &xb->xb_attrs.xb_root.xt_list; 254 } 255 256 static void ocfs2_xattr_tree_fill_max_leaf_clusters(struct inode *inode, 257 struct ocfs2_extent_tree *et) 258 { 259 et->et_max_leaf_clusters = 260 ocfs2_clusters_for_bytes(inode->i_sb, 261 OCFS2_MAX_XATTR_TREE_LEAF_SIZE); 262 } 263 264 static void ocfs2_xattr_tree_set_last_eb_blk(struct ocfs2_extent_tree *et, 265 u64 blkno) 266 { 267 struct ocfs2_xattr_block *xb = et->et_object; 268 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; 269 270 xt->xt_last_eb_blk = cpu_to_le64(blkno); 271 } 272 273 static u64 ocfs2_xattr_tree_get_last_eb_blk(struct ocfs2_extent_tree *et) 274 { 275 struct ocfs2_xattr_block *xb = et->et_object; 276 struct ocfs2_xattr_tree_root *xt = &xb->xb_attrs.xb_root; 277 278 return le64_to_cpu(xt->xt_last_eb_blk); 279 } 280 281 static void ocfs2_xattr_tree_update_clusters(struct inode *inode, 282 struct ocfs2_extent_tree *et, 283 u32 clusters) 284 { 285 struct ocfs2_xattr_block *xb = et->et_object; 286 287 le32_add_cpu(&xb->xb_attrs.xb_root.xt_clusters, clusters); 288 } 289 290 static struct ocfs2_extent_tree_operations ocfs2_xattr_tree_et_ops = { 291 .eo_set_last_eb_blk = ocfs2_xattr_tree_set_last_eb_blk, 292 .eo_get_last_eb_blk = ocfs2_xattr_tree_get_last_eb_blk, 293 .eo_update_clusters = ocfs2_xattr_tree_update_clusters, 294 .eo_fill_root_el = ocfs2_xattr_tree_fill_root_el, 295 .eo_fill_max_leaf_clusters = ocfs2_xattr_tree_fill_max_leaf_clusters, 296 }; 297 298 static void __ocfs2_init_extent_tree(struct ocfs2_extent_tree *et, 299 struct inode *inode, 300 struct buffer_head *bh, 301 void *obj, 302 struct ocfs2_extent_tree_operations *ops) 303 { 304 et->et_ops = ops; 305 et->et_root_bh = bh; 306 if (!obj) 307 obj = (void *)bh->b_data; 308 et->et_object = obj; 309 310 et->et_ops->eo_fill_root_el(et); 311 if (!et->et_ops->eo_fill_max_leaf_clusters) 312 et->et_max_leaf_clusters = 0; 313 else 314 et->et_ops->eo_fill_max_leaf_clusters(inode, et); 315 } 316 317 void ocfs2_init_dinode_extent_tree(struct ocfs2_extent_tree *et, 318 struct inode *inode, 319 struct buffer_head *bh) 320 { 321 __ocfs2_init_extent_tree(et, inode, bh, NULL, &ocfs2_dinode_et_ops); 322 } 323 324 void ocfs2_init_xattr_tree_extent_tree(struct ocfs2_extent_tree *et, 325 struct inode *inode, 326 struct buffer_head *bh) 327 { 328 __ocfs2_init_extent_tree(et, inode, bh, NULL, 329 &ocfs2_xattr_tree_et_ops); 330 } 331 332 void ocfs2_init_xattr_value_extent_tree(struct ocfs2_extent_tree *et, 333 struct inode *inode, 334 struct buffer_head *bh, 335 struct ocfs2_xattr_value_root *xv) 336 { 337 __ocfs2_init_extent_tree(et, inode, bh, xv, 338 &ocfs2_xattr_value_et_ops); 339 } 340 341 static inline void ocfs2_et_set_last_eb_blk(struct ocfs2_extent_tree *et, 342 u64 new_last_eb_blk) 343 { 344 et->et_ops->eo_set_last_eb_blk(et, new_last_eb_blk); 345 } 346 347 static inline u64 ocfs2_et_get_last_eb_blk(struct ocfs2_extent_tree *et) 348 { 349 return et->et_ops->eo_get_last_eb_blk(et); 350 } 351 352 static inline void ocfs2_et_update_clusters(struct inode *inode, 353 struct ocfs2_extent_tree *et, 354 u32 clusters) 355 { 356 et->et_ops->eo_update_clusters(inode, et, clusters); 357 } 358 359 static inline int ocfs2_et_insert_check(struct inode *inode, 360 struct ocfs2_extent_tree *et, 361 struct ocfs2_extent_rec *rec) 362 { 363 int ret = 0; 364 365 if (et->et_ops->eo_insert_check) 366 ret = et->et_ops->eo_insert_check(inode, et, rec); 367 return ret; 368 } 369 370 static inline int ocfs2_et_sanity_check(struct inode *inode, 371 struct ocfs2_extent_tree *et) 372 { 373 int ret = 0; 374 375 if (et->et_ops->eo_sanity_check) 376 ret = et->et_ops->eo_sanity_check(inode, et); 377 return ret; 378 } 379 380 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 381 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 382 struct ocfs2_extent_block *eb); 383 384 /* 385 * Structures which describe a path through a btree, and functions to 386 * manipulate them. 387 * 388 * The idea here is to be as generic as possible with the tree 389 * manipulation code. 390 */ 391 struct ocfs2_path_item { 392 struct buffer_head *bh; 393 struct ocfs2_extent_list *el; 394 }; 395 396 #define OCFS2_MAX_PATH_DEPTH 5 397 398 struct ocfs2_path { 399 int p_tree_depth; 400 struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; 401 }; 402 403 #define path_root_bh(_path) ((_path)->p_node[0].bh) 404 #define path_root_el(_path) ((_path)->p_node[0].el) 405 #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) 406 #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) 407 #define path_num_items(_path) ((_path)->p_tree_depth + 1) 408 409 /* 410 * Reset the actual path elements so that we can re-use the structure 411 * to build another path. Generally, this involves freeing the buffer 412 * heads. 413 */ 414 static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) 415 { 416 int i, start = 0, depth = 0; 417 struct ocfs2_path_item *node; 418 419 if (keep_root) 420 start = 1; 421 422 for(i = start; i < path_num_items(path); i++) { 423 node = &path->p_node[i]; 424 425 brelse(node->bh); 426 node->bh = NULL; 427 node->el = NULL; 428 } 429 430 /* 431 * Tree depth may change during truncate, or insert. If we're 432 * keeping the root extent list, then make sure that our path 433 * structure reflects the proper depth. 434 */ 435 if (keep_root) 436 depth = le16_to_cpu(path_root_el(path)->l_tree_depth); 437 438 path->p_tree_depth = depth; 439 } 440 441 static void ocfs2_free_path(struct ocfs2_path *path) 442 { 443 if (path) { 444 ocfs2_reinit_path(path, 0); 445 kfree(path); 446 } 447 } 448 449 /* 450 * All the elements of src into dest. After this call, src could be freed 451 * without affecting dest. 452 * 453 * Both paths should have the same root. Any non-root elements of dest 454 * will be freed. 455 */ 456 static void ocfs2_cp_path(struct ocfs2_path *dest, struct ocfs2_path *src) 457 { 458 int i; 459 460 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 461 BUG_ON(path_root_el(dest) != path_root_el(src)); 462 463 ocfs2_reinit_path(dest, 1); 464 465 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 466 dest->p_node[i].bh = src->p_node[i].bh; 467 dest->p_node[i].el = src->p_node[i].el; 468 469 if (dest->p_node[i].bh) 470 get_bh(dest->p_node[i].bh); 471 } 472 } 473 474 /* 475 * Make the *dest path the same as src and re-initialize src path to 476 * have a root only. 477 */ 478 static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) 479 { 480 int i; 481 482 BUG_ON(path_root_bh(dest) != path_root_bh(src)); 483 484 for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 485 brelse(dest->p_node[i].bh); 486 487 dest->p_node[i].bh = src->p_node[i].bh; 488 dest->p_node[i].el = src->p_node[i].el; 489 490 src->p_node[i].bh = NULL; 491 src->p_node[i].el = NULL; 492 } 493 } 494 495 /* 496 * Insert an extent block at given index. 497 * 498 * This will not take an additional reference on eb_bh. 499 */ 500 static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, 501 struct buffer_head *eb_bh) 502 { 503 struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; 504 505 /* 506 * Right now, no root bh is an extent block, so this helps 507 * catch code errors with dinode trees. The assertion can be 508 * safely removed if we ever need to insert extent block 509 * structures at the root. 510 */ 511 BUG_ON(index == 0); 512 513 path->p_node[index].bh = eb_bh; 514 path->p_node[index].el = &eb->h_list; 515 } 516 517 static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, 518 struct ocfs2_extent_list *root_el) 519 { 520 struct ocfs2_path *path; 521 522 BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); 523 524 path = kzalloc(sizeof(*path), GFP_NOFS); 525 if (path) { 526 path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); 527 get_bh(root_bh); 528 path_root_bh(path) = root_bh; 529 path_root_el(path) = root_el; 530 } 531 532 return path; 533 } 534 535 static struct ocfs2_path *ocfs2_new_path_from_path(struct ocfs2_path *path) 536 { 537 return ocfs2_new_path(path_root_bh(path), path_root_el(path)); 538 } 539 540 static struct ocfs2_path *ocfs2_new_path_from_et(struct ocfs2_extent_tree *et) 541 { 542 return ocfs2_new_path(et->et_root_bh, et->et_root_el); 543 } 544 545 /* 546 * Convenience function to journal all components in a path. 547 */ 548 static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 549 struct ocfs2_path *path) 550 { 551 int i, ret = 0; 552 553 if (!path) 554 goto out; 555 556 for(i = 0; i < path_num_items(path); i++) { 557 ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, 558 OCFS2_JOURNAL_ACCESS_WRITE); 559 if (ret < 0) { 560 mlog_errno(ret); 561 goto out; 562 } 563 } 564 565 out: 566 return ret; 567 } 568 569 /* 570 * Return the index of the extent record which contains cluster #v_cluster. 571 * -1 is returned if it was not found. 572 * 573 * Should work fine on interior and exterior nodes. 574 */ 575 int ocfs2_search_extent_list(struct ocfs2_extent_list *el, u32 v_cluster) 576 { 577 int ret = -1; 578 int i; 579 struct ocfs2_extent_rec *rec; 580 u32 rec_end, rec_start, clusters; 581 582 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 583 rec = &el->l_recs[i]; 584 585 rec_start = le32_to_cpu(rec->e_cpos); 586 clusters = ocfs2_rec_clusters(el, rec); 587 588 rec_end = rec_start + clusters; 589 590 if (v_cluster >= rec_start && v_cluster < rec_end) { 591 ret = i; 592 break; 593 } 594 } 595 596 return ret; 597 } 598 599 enum ocfs2_contig_type { 600 CONTIG_NONE = 0, 601 CONTIG_LEFT, 602 CONTIG_RIGHT, 603 CONTIG_LEFTRIGHT, 604 }; 605 606 607 /* 608 * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and 609 * ocfs2_extent_contig only work properly against leaf nodes! 610 */ 611 static int ocfs2_block_extent_contig(struct super_block *sb, 612 struct ocfs2_extent_rec *ext, 613 u64 blkno) 614 { 615 u64 blk_end = le64_to_cpu(ext->e_blkno); 616 617 blk_end += ocfs2_clusters_to_blocks(sb, 618 le16_to_cpu(ext->e_leaf_clusters)); 619 620 return blkno == blk_end; 621 } 622 623 static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, 624 struct ocfs2_extent_rec *right) 625 { 626 u32 left_range; 627 628 left_range = le32_to_cpu(left->e_cpos) + 629 le16_to_cpu(left->e_leaf_clusters); 630 631 return (left_range == le32_to_cpu(right->e_cpos)); 632 } 633 634 static enum ocfs2_contig_type 635 ocfs2_extent_contig(struct inode *inode, 636 struct ocfs2_extent_rec *ext, 637 struct ocfs2_extent_rec *insert_rec) 638 { 639 u64 blkno = le64_to_cpu(insert_rec->e_blkno); 640 641 /* 642 * Refuse to coalesce extent records with different flag 643 * fields - we don't want to mix unwritten extents with user 644 * data. 645 */ 646 if (ext->e_flags != insert_rec->e_flags) 647 return CONTIG_NONE; 648 649 if (ocfs2_extents_adjacent(ext, insert_rec) && 650 ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 651 return CONTIG_RIGHT; 652 653 blkno = le64_to_cpu(ext->e_blkno); 654 if (ocfs2_extents_adjacent(insert_rec, ext) && 655 ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) 656 return CONTIG_LEFT; 657 658 return CONTIG_NONE; 659 } 660 661 /* 662 * NOTE: We can have pretty much any combination of contiguousness and 663 * appending. 664 * 665 * The usefulness of APPEND_TAIL is more in that it lets us know that 666 * we'll have to update the path to that leaf. 667 */ 668 enum ocfs2_append_type { 669 APPEND_NONE = 0, 670 APPEND_TAIL, 671 }; 672 673 enum ocfs2_split_type { 674 SPLIT_NONE = 0, 675 SPLIT_LEFT, 676 SPLIT_RIGHT, 677 }; 678 679 struct ocfs2_insert_type { 680 enum ocfs2_split_type ins_split; 681 enum ocfs2_append_type ins_appending; 682 enum ocfs2_contig_type ins_contig; 683 int ins_contig_index; 684 int ins_tree_depth; 685 }; 686 687 struct ocfs2_merge_ctxt { 688 enum ocfs2_contig_type c_contig_type; 689 int c_has_empty_extent; 690 int c_split_covers_rec; 691 }; 692 693 static int ocfs2_validate_extent_block(struct super_block *sb, 694 struct buffer_head *bh) 695 { 696 int rc; 697 struct ocfs2_extent_block *eb = 698 (struct ocfs2_extent_block *)bh->b_data; 699 700 mlog(0, "Validating extent block %llu\n", 701 (unsigned long long)bh->b_blocknr); 702 703 BUG_ON(!buffer_uptodate(bh)); 704 705 /* 706 * If the ecc fails, we return the error but otherwise 707 * leave the filesystem running. We know any error is 708 * local to this block. 709 */ 710 rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &eb->h_check); 711 if (rc) 712 return rc; 713 714 /* 715 * Errors after here are fatal. 716 */ 717 718 if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 719 ocfs2_error(sb, 720 "Extent block #%llu has bad signature %.*s", 721 (unsigned long long)bh->b_blocknr, 7, 722 eb->h_signature); 723 return -EINVAL; 724 } 725 726 if (le64_to_cpu(eb->h_blkno) != bh->b_blocknr) { 727 ocfs2_error(sb, 728 "Extent block #%llu has an invalid h_blkno " 729 "of %llu", 730 (unsigned long long)bh->b_blocknr, 731 (unsigned long long)le64_to_cpu(eb->h_blkno)); 732 return -EINVAL; 733 } 734 735 if (le32_to_cpu(eb->h_fs_generation) != OCFS2_SB(sb)->fs_generation) { 736 ocfs2_error(sb, 737 "Extent block #%llu has an invalid " 738 "h_fs_generation of #%u", 739 (unsigned long long)bh->b_blocknr, 740 le32_to_cpu(eb->h_fs_generation)); 741 return -EINVAL; 742 } 743 744 return 0; 745 } 746 747 int ocfs2_read_extent_block(struct inode *inode, u64 eb_blkno, 748 struct buffer_head **bh) 749 { 750 int rc; 751 struct buffer_head *tmp = *bh; 752 753 rc = ocfs2_read_block(inode, eb_blkno, &tmp, 754 ocfs2_validate_extent_block); 755 756 /* If ocfs2_read_block() got us a new bh, pass it up. */ 757 if (!rc && !*bh) 758 *bh = tmp; 759 760 return rc; 761 } 762 763 764 /* 765 * How many free extents have we got before we need more meta data? 766 */ 767 int ocfs2_num_free_extents(struct ocfs2_super *osb, 768 struct inode *inode, 769 struct ocfs2_extent_tree *et) 770 { 771 int retval; 772 struct ocfs2_extent_list *el = NULL; 773 struct ocfs2_extent_block *eb; 774 struct buffer_head *eb_bh = NULL; 775 u64 last_eb_blk = 0; 776 777 mlog_entry_void(); 778 779 el = et->et_root_el; 780 last_eb_blk = ocfs2_et_get_last_eb_blk(et); 781 782 if (last_eb_blk) { 783 retval = ocfs2_read_extent_block(inode, last_eb_blk, &eb_bh); 784 if (retval < 0) { 785 mlog_errno(retval); 786 goto bail; 787 } 788 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 789 el = &eb->h_list; 790 } 791 792 BUG_ON(el->l_tree_depth != 0); 793 794 retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); 795 bail: 796 brelse(eb_bh); 797 798 mlog_exit(retval); 799 return retval; 800 } 801 802 /* expects array to already be allocated 803 * 804 * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and 805 * l_count for you 806 */ 807 static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 808 handle_t *handle, 809 struct inode *inode, 810 int wanted, 811 struct ocfs2_alloc_context *meta_ac, 812 struct buffer_head *bhs[]) 813 { 814 int count, status, i; 815 u16 suballoc_bit_start; 816 u32 num_got; 817 u64 first_blkno; 818 struct ocfs2_extent_block *eb; 819 820 mlog_entry_void(); 821 822 count = 0; 823 while (count < wanted) { 824 status = ocfs2_claim_metadata(osb, 825 handle, 826 meta_ac, 827 wanted - count, 828 &suballoc_bit_start, 829 &num_got, 830 &first_blkno); 831 if (status < 0) { 832 mlog_errno(status); 833 goto bail; 834 } 835 836 for(i = count; i < (num_got + count); i++) { 837 bhs[i] = sb_getblk(osb->sb, first_blkno); 838 if (bhs[i] == NULL) { 839 status = -EIO; 840 mlog_errno(status); 841 goto bail; 842 } 843 ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 844 845 status = ocfs2_journal_access(handle, inode, bhs[i], 846 OCFS2_JOURNAL_ACCESS_CREATE); 847 if (status < 0) { 848 mlog_errno(status); 849 goto bail; 850 } 851 852 memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); 853 eb = (struct ocfs2_extent_block *) bhs[i]->b_data; 854 /* Ok, setup the minimal stuff here. */ 855 strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 856 eb->h_blkno = cpu_to_le64(first_blkno); 857 eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 858 eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 859 eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 860 eb->h_list.l_count = 861 cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 862 863 suballoc_bit_start++; 864 first_blkno++; 865 866 /* We'll also be dirtied by the caller, so 867 * this isn't absolutely necessary. */ 868 status = ocfs2_journal_dirty(handle, bhs[i]); 869 if (status < 0) { 870 mlog_errno(status); 871 goto bail; 872 } 873 } 874 875 count += num_got; 876 } 877 878 status = 0; 879 bail: 880 if (status < 0) { 881 for(i = 0; i < wanted; i++) { 882 brelse(bhs[i]); 883 bhs[i] = NULL; 884 } 885 } 886 mlog_exit(status); 887 return status; 888 } 889 890 /* 891 * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). 892 * 893 * Returns the sum of the rightmost extent rec logical offset and 894 * cluster count. 895 * 896 * ocfs2_add_branch() uses this to determine what logical cluster 897 * value should be populated into the leftmost new branch records. 898 * 899 * ocfs2_shift_tree_depth() uses this to determine the # clusters 900 * value for the new topmost tree record. 901 */ 902 static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) 903 { 904 int i; 905 906 i = le16_to_cpu(el->l_next_free_rec) - 1; 907 908 return le32_to_cpu(el->l_recs[i].e_cpos) + 909 ocfs2_rec_clusters(el, &el->l_recs[i]); 910 } 911 912 /* 913 * Add an entire tree branch to our inode. eb_bh is the extent block 914 * to start at, if we don't want to start the branch at the dinode 915 * structure. 916 * 917 * last_eb_bh is required as we have to update it's next_leaf pointer 918 * for the new last extent block. 919 * 920 * the new branch will be 'empty' in the sense that every block will 921 * contain a single record with cluster count == 0. 922 */ 923 static int ocfs2_add_branch(struct ocfs2_super *osb, 924 handle_t *handle, 925 struct inode *inode, 926 struct ocfs2_extent_tree *et, 927 struct buffer_head *eb_bh, 928 struct buffer_head **last_eb_bh, 929 struct ocfs2_alloc_context *meta_ac) 930 { 931 int status, new_blocks, i; 932 u64 next_blkno, new_last_eb_blk; 933 struct buffer_head *bh; 934 struct buffer_head **new_eb_bhs = NULL; 935 struct ocfs2_extent_block *eb; 936 struct ocfs2_extent_list *eb_el; 937 struct ocfs2_extent_list *el; 938 u32 new_cpos; 939 940 mlog_entry_void(); 941 942 BUG_ON(!last_eb_bh || !*last_eb_bh); 943 944 if (eb_bh) { 945 eb = (struct ocfs2_extent_block *) eb_bh->b_data; 946 el = &eb->h_list; 947 } else 948 el = et->et_root_el; 949 950 /* we never add a branch to a leaf. */ 951 BUG_ON(!el->l_tree_depth); 952 953 new_blocks = le16_to_cpu(el->l_tree_depth); 954 955 /* allocate the number of new eb blocks we need */ 956 new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), 957 GFP_KERNEL); 958 if (!new_eb_bhs) { 959 status = -ENOMEM; 960 mlog_errno(status); 961 goto bail; 962 } 963 964 status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, 965 meta_ac, new_eb_bhs); 966 if (status < 0) { 967 mlog_errno(status); 968 goto bail; 969 } 970 971 eb = (struct ocfs2_extent_block *)(*last_eb_bh)->b_data; 972 new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 973 974 /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 975 * linked with the rest of the tree. 976 * conversly, new_eb_bhs[0] is the new bottommost leaf. 977 * 978 * when we leave the loop, new_last_eb_blk will point to the 979 * newest leaf, and next_blkno will point to the topmost extent 980 * block. */ 981 next_blkno = new_last_eb_blk = 0; 982 for(i = 0; i < new_blocks; i++) { 983 bh = new_eb_bhs[i]; 984 eb = (struct ocfs2_extent_block *) bh->b_data; 985 /* ocfs2_create_new_meta_bhs() should create it right! */ 986 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); 987 eb_el = &eb->h_list; 988 989 status = ocfs2_journal_access(handle, inode, bh, 990 OCFS2_JOURNAL_ACCESS_CREATE); 991 if (status < 0) { 992 mlog_errno(status); 993 goto bail; 994 } 995 996 eb->h_next_leaf_blk = 0; 997 eb_el->l_tree_depth = cpu_to_le16(i); 998 eb_el->l_next_free_rec = cpu_to_le16(1); 999 /* 1000 * This actually counts as an empty extent as 1001 * c_clusters == 0 1002 */ 1003 eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); 1004 eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); 1005 /* 1006 * eb_el isn't always an interior node, but even leaf 1007 * nodes want a zero'd flags and reserved field so 1008 * this gets the whole 32 bits regardless of use. 1009 */ 1010 eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); 1011 if (!eb_el->l_tree_depth) 1012 new_last_eb_blk = le64_to_cpu(eb->h_blkno); 1013 1014 status = ocfs2_journal_dirty(handle, bh); 1015 if (status < 0) { 1016 mlog_errno(status); 1017 goto bail; 1018 } 1019 1020 next_blkno = le64_to_cpu(eb->h_blkno); 1021 } 1022 1023 /* This is a bit hairy. We want to update up to three blocks 1024 * here without leaving any of them in an inconsistent state 1025 * in case of error. We don't have to worry about 1026 * journal_dirty erroring as it won't unless we've aborted the 1027 * handle (in which case we would never be here) so reserving 1028 * the write with journal_access is all we need to do. */ 1029 status = ocfs2_journal_access(handle, inode, *last_eb_bh, 1030 OCFS2_JOURNAL_ACCESS_WRITE); 1031 if (status < 0) { 1032 mlog_errno(status); 1033 goto bail; 1034 } 1035 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 1036 OCFS2_JOURNAL_ACCESS_WRITE); 1037 if (status < 0) { 1038 mlog_errno(status); 1039 goto bail; 1040 } 1041 if (eb_bh) { 1042 status = ocfs2_journal_access(handle, inode, eb_bh, 1043 OCFS2_JOURNAL_ACCESS_WRITE); 1044 if (status < 0) { 1045 mlog_errno(status); 1046 goto bail; 1047 } 1048 } 1049 1050 /* Link the new branch into the rest of the tree (el will 1051 * either be on the root_bh, or the extent block passed in. */ 1052 i = le16_to_cpu(el->l_next_free_rec); 1053 el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 1054 el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 1055 el->l_recs[i].e_int_clusters = 0; 1056 le16_add_cpu(&el->l_next_free_rec, 1); 1057 1058 /* fe needs a new last extent block pointer, as does the 1059 * next_leaf on the previously last-extent-block. */ 1060 ocfs2_et_set_last_eb_blk(et, new_last_eb_blk); 1061 1062 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 1063 eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 1064 1065 status = ocfs2_journal_dirty(handle, *last_eb_bh); 1066 if (status < 0) 1067 mlog_errno(status); 1068 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1069 if (status < 0) 1070 mlog_errno(status); 1071 if (eb_bh) { 1072 status = ocfs2_journal_dirty(handle, eb_bh); 1073 if (status < 0) 1074 mlog_errno(status); 1075 } 1076 1077 /* 1078 * Some callers want to track the rightmost leaf so pass it 1079 * back here. 1080 */ 1081 brelse(*last_eb_bh); 1082 get_bh(new_eb_bhs[0]); 1083 *last_eb_bh = new_eb_bhs[0]; 1084 1085 status = 0; 1086 bail: 1087 if (new_eb_bhs) { 1088 for (i = 0; i < new_blocks; i++) 1089 brelse(new_eb_bhs[i]); 1090 kfree(new_eb_bhs); 1091 } 1092 1093 mlog_exit(status); 1094 return status; 1095 } 1096 1097 /* 1098 * adds another level to the allocation tree. 1099 * returns back the new extent block so you can add a branch to it 1100 * after this call. 1101 */ 1102 static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 1103 handle_t *handle, 1104 struct inode *inode, 1105 struct ocfs2_extent_tree *et, 1106 struct ocfs2_alloc_context *meta_ac, 1107 struct buffer_head **ret_new_eb_bh) 1108 { 1109 int status, i; 1110 u32 new_clusters; 1111 struct buffer_head *new_eb_bh = NULL; 1112 struct ocfs2_extent_block *eb; 1113 struct ocfs2_extent_list *root_el; 1114 struct ocfs2_extent_list *eb_el; 1115 1116 mlog_entry_void(); 1117 1118 status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, 1119 &new_eb_bh); 1120 if (status < 0) { 1121 mlog_errno(status); 1122 goto bail; 1123 } 1124 1125 eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; 1126 /* ocfs2_create_new_meta_bhs() should create it right! */ 1127 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); 1128 1129 eb_el = &eb->h_list; 1130 root_el = et->et_root_el; 1131 1132 status = ocfs2_journal_access(handle, inode, new_eb_bh, 1133 OCFS2_JOURNAL_ACCESS_CREATE); 1134 if (status < 0) { 1135 mlog_errno(status); 1136 goto bail; 1137 } 1138 1139 /* copy the root extent list data into the new extent block */ 1140 eb_el->l_tree_depth = root_el->l_tree_depth; 1141 eb_el->l_next_free_rec = root_el->l_next_free_rec; 1142 for (i = 0; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1143 eb_el->l_recs[i] = root_el->l_recs[i]; 1144 1145 status = ocfs2_journal_dirty(handle, new_eb_bh); 1146 if (status < 0) { 1147 mlog_errno(status); 1148 goto bail; 1149 } 1150 1151 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 1152 OCFS2_JOURNAL_ACCESS_WRITE); 1153 if (status < 0) { 1154 mlog_errno(status); 1155 goto bail; 1156 } 1157 1158 new_clusters = ocfs2_sum_rightmost_rec(eb_el); 1159 1160 /* update root_bh now */ 1161 le16_add_cpu(&root_el->l_tree_depth, 1); 1162 root_el->l_recs[0].e_cpos = 0; 1163 root_el->l_recs[0].e_blkno = eb->h_blkno; 1164 root_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); 1165 for (i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) 1166 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 1167 root_el->l_next_free_rec = cpu_to_le16(1); 1168 1169 /* If this is our 1st tree depth shift, then last_eb_blk 1170 * becomes the allocated extent block */ 1171 if (root_el->l_tree_depth == cpu_to_le16(1)) 1172 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 1173 1174 status = ocfs2_journal_dirty(handle, et->et_root_bh); 1175 if (status < 0) { 1176 mlog_errno(status); 1177 goto bail; 1178 } 1179 1180 *ret_new_eb_bh = new_eb_bh; 1181 new_eb_bh = NULL; 1182 status = 0; 1183 bail: 1184 brelse(new_eb_bh); 1185 1186 mlog_exit(status); 1187 return status; 1188 } 1189 1190 /* 1191 * Should only be called when there is no space left in any of the 1192 * leaf nodes. What we want to do is find the lowest tree depth 1193 * non-leaf extent block with room for new records. There are three 1194 * valid results of this search: 1195 * 1196 * 1) a lowest extent block is found, then we pass it back in 1197 * *lowest_eb_bh and return '0' 1198 * 1199 * 2) the search fails to find anything, but the root_el has room. We 1200 * pass NULL back in *lowest_eb_bh, but still return '0' 1201 * 1202 * 3) the search fails to find anything AND the root_el is full, in 1203 * which case we return > 0 1204 * 1205 * return status < 0 indicates an error. 1206 */ 1207 static int ocfs2_find_branch_target(struct ocfs2_super *osb, 1208 struct inode *inode, 1209 struct ocfs2_extent_tree *et, 1210 struct buffer_head **target_bh) 1211 { 1212 int status = 0, i; 1213 u64 blkno; 1214 struct ocfs2_extent_block *eb; 1215 struct ocfs2_extent_list *el; 1216 struct buffer_head *bh = NULL; 1217 struct buffer_head *lowest_bh = NULL; 1218 1219 mlog_entry_void(); 1220 1221 *target_bh = NULL; 1222 1223 el = et->et_root_el; 1224 1225 while(le16_to_cpu(el->l_tree_depth) > 1) { 1226 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1227 ocfs2_error(inode->i_sb, "Dinode %llu has empty " 1228 "extent list (next_free_rec == 0)", 1229 (unsigned long long)OCFS2_I(inode)->ip_blkno); 1230 status = -EIO; 1231 goto bail; 1232 } 1233 i = le16_to_cpu(el->l_next_free_rec) - 1; 1234 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1235 if (!blkno) { 1236 ocfs2_error(inode->i_sb, "Dinode %llu has extent " 1237 "list where extent # %d has no physical " 1238 "block start", 1239 (unsigned long long)OCFS2_I(inode)->ip_blkno, i); 1240 status = -EIO; 1241 goto bail; 1242 } 1243 1244 brelse(bh); 1245 bh = NULL; 1246 1247 status = ocfs2_read_extent_block(inode, blkno, &bh); 1248 if (status < 0) { 1249 mlog_errno(status); 1250 goto bail; 1251 } 1252 1253 eb = (struct ocfs2_extent_block *) bh->b_data; 1254 el = &eb->h_list; 1255 1256 if (le16_to_cpu(el->l_next_free_rec) < 1257 le16_to_cpu(el->l_count)) { 1258 brelse(lowest_bh); 1259 lowest_bh = bh; 1260 get_bh(lowest_bh); 1261 } 1262 } 1263 1264 /* If we didn't find one and the fe doesn't have any room, 1265 * then return '1' */ 1266 el = et->et_root_el; 1267 if (!lowest_bh && (el->l_next_free_rec == el->l_count)) 1268 status = 1; 1269 1270 *target_bh = lowest_bh; 1271 bail: 1272 brelse(bh); 1273 1274 mlog_exit(status); 1275 return status; 1276 } 1277 1278 /* 1279 * Grow a b-tree so that it has more records. 1280 * 1281 * We might shift the tree depth in which case existing paths should 1282 * be considered invalid. 1283 * 1284 * Tree depth after the grow is returned via *final_depth. 1285 * 1286 * *last_eb_bh will be updated by ocfs2_add_branch(). 1287 */ 1288 static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, 1289 struct ocfs2_extent_tree *et, int *final_depth, 1290 struct buffer_head **last_eb_bh, 1291 struct ocfs2_alloc_context *meta_ac) 1292 { 1293 int ret, shift; 1294 struct ocfs2_extent_list *el = et->et_root_el; 1295 int depth = le16_to_cpu(el->l_tree_depth); 1296 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 1297 struct buffer_head *bh = NULL; 1298 1299 BUG_ON(meta_ac == NULL); 1300 1301 shift = ocfs2_find_branch_target(osb, inode, et, &bh); 1302 if (shift < 0) { 1303 ret = shift; 1304 mlog_errno(ret); 1305 goto out; 1306 } 1307 1308 /* We traveled all the way to the bottom of the allocation tree 1309 * and didn't find room for any more extents - we need to add 1310 * another tree level */ 1311 if (shift) { 1312 BUG_ON(bh); 1313 mlog(0, "need to shift tree depth (current = %d)\n", depth); 1314 1315 /* ocfs2_shift_tree_depth will return us a buffer with 1316 * the new extent block (so we can pass that to 1317 * ocfs2_add_branch). */ 1318 ret = ocfs2_shift_tree_depth(osb, handle, inode, et, 1319 meta_ac, &bh); 1320 if (ret < 0) { 1321 mlog_errno(ret); 1322 goto out; 1323 } 1324 depth++; 1325 if (depth == 1) { 1326 /* 1327 * Special case: we have room now if we shifted from 1328 * tree_depth 0, so no more work needs to be done. 1329 * 1330 * We won't be calling add_branch, so pass 1331 * back *last_eb_bh as the new leaf. At depth 1332 * zero, it should always be null so there's 1333 * no reason to brelse. 1334 */ 1335 BUG_ON(*last_eb_bh); 1336 get_bh(bh); 1337 *last_eb_bh = bh; 1338 goto out; 1339 } 1340 } 1341 1342 /* call ocfs2_add_branch to add the final part of the tree with 1343 * the new data. */ 1344 mlog(0, "add branch. bh = %p\n", bh); 1345 ret = ocfs2_add_branch(osb, handle, inode, et, bh, last_eb_bh, 1346 meta_ac); 1347 if (ret < 0) { 1348 mlog_errno(ret); 1349 goto out; 1350 } 1351 1352 out: 1353 if (final_depth) 1354 *final_depth = depth; 1355 brelse(bh); 1356 return ret; 1357 } 1358 1359 /* 1360 * This function will discard the rightmost extent record. 1361 */ 1362 static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) 1363 { 1364 int next_free = le16_to_cpu(el->l_next_free_rec); 1365 int count = le16_to_cpu(el->l_count); 1366 unsigned int num_bytes; 1367 1368 BUG_ON(!next_free); 1369 /* This will cause us to go off the end of our extent list. */ 1370 BUG_ON(next_free >= count); 1371 1372 num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; 1373 1374 memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); 1375 } 1376 1377 static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, 1378 struct ocfs2_extent_rec *insert_rec) 1379 { 1380 int i, insert_index, next_free, has_empty, num_bytes; 1381 u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); 1382 struct ocfs2_extent_rec *rec; 1383 1384 next_free = le16_to_cpu(el->l_next_free_rec); 1385 has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); 1386 1387 BUG_ON(!next_free); 1388 1389 /* The tree code before us didn't allow enough room in the leaf. */ 1390 BUG_ON(el->l_next_free_rec == el->l_count && !has_empty); 1391 1392 /* 1393 * The easiest way to approach this is to just remove the 1394 * empty extent and temporarily decrement next_free. 1395 */ 1396 if (has_empty) { 1397 /* 1398 * If next_free was 1 (only an empty extent), this 1399 * loop won't execute, which is fine. We still want 1400 * the decrement above to happen. 1401 */ 1402 for(i = 0; i < (next_free - 1); i++) 1403 el->l_recs[i] = el->l_recs[i+1]; 1404 1405 next_free--; 1406 } 1407 1408 /* 1409 * Figure out what the new record index should be. 1410 */ 1411 for(i = 0; i < next_free; i++) { 1412 rec = &el->l_recs[i]; 1413 1414 if (insert_cpos < le32_to_cpu(rec->e_cpos)) 1415 break; 1416 } 1417 insert_index = i; 1418 1419 mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", 1420 insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); 1421 1422 BUG_ON(insert_index < 0); 1423 BUG_ON(insert_index >= le16_to_cpu(el->l_count)); 1424 BUG_ON(insert_index > next_free); 1425 1426 /* 1427 * No need to memmove if we're just adding to the tail. 1428 */ 1429 if (insert_index != next_free) { 1430 BUG_ON(next_free >= le16_to_cpu(el->l_count)); 1431 1432 num_bytes = next_free - insert_index; 1433 num_bytes *= sizeof(struct ocfs2_extent_rec); 1434 memmove(&el->l_recs[insert_index + 1], 1435 &el->l_recs[insert_index], 1436 num_bytes); 1437 } 1438 1439 /* 1440 * Either we had an empty extent, and need to re-increment or 1441 * there was no empty extent on a non full rightmost leaf node, 1442 * in which case we still need to increment. 1443 */ 1444 next_free++; 1445 el->l_next_free_rec = cpu_to_le16(next_free); 1446 /* 1447 * Make sure none of the math above just messed up our tree. 1448 */ 1449 BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); 1450 1451 el->l_recs[insert_index] = *insert_rec; 1452 1453 } 1454 1455 static void ocfs2_remove_empty_extent(struct ocfs2_extent_list *el) 1456 { 1457 int size, num_recs = le16_to_cpu(el->l_next_free_rec); 1458 1459 BUG_ON(num_recs == 0); 1460 1461 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 1462 num_recs--; 1463 size = num_recs * sizeof(struct ocfs2_extent_rec); 1464 memmove(&el->l_recs[0], &el->l_recs[1], size); 1465 memset(&el->l_recs[num_recs], 0, 1466 sizeof(struct ocfs2_extent_rec)); 1467 el->l_next_free_rec = cpu_to_le16(num_recs); 1468 } 1469 } 1470 1471 /* 1472 * Create an empty extent record . 1473 * 1474 * l_next_free_rec may be updated. 1475 * 1476 * If an empty extent already exists do nothing. 1477 */ 1478 static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) 1479 { 1480 int next_free = le16_to_cpu(el->l_next_free_rec); 1481 1482 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 1483 1484 if (next_free == 0) 1485 goto set_and_inc; 1486 1487 if (ocfs2_is_empty_extent(&el->l_recs[0])) 1488 return; 1489 1490 mlog_bug_on_msg(el->l_count == el->l_next_free_rec, 1491 "Asked to create an empty extent in a full list:\n" 1492 "count = %u, tree depth = %u", 1493 le16_to_cpu(el->l_count), 1494 le16_to_cpu(el->l_tree_depth)); 1495 1496 ocfs2_shift_records_right(el); 1497 1498 set_and_inc: 1499 le16_add_cpu(&el->l_next_free_rec, 1); 1500 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 1501 } 1502 1503 /* 1504 * For a rotation which involves two leaf nodes, the "root node" is 1505 * the lowest level tree node which contains a path to both leafs. This 1506 * resulting set of information can be used to form a complete "subtree" 1507 * 1508 * This function is passed two full paths from the dinode down to a 1509 * pair of adjacent leaves. It's task is to figure out which path 1510 * index contains the subtree root - this can be the root index itself 1511 * in a worst-case rotation. 1512 * 1513 * The array index of the subtree root is passed back. 1514 */ 1515 static int ocfs2_find_subtree_root(struct inode *inode, 1516 struct ocfs2_path *left, 1517 struct ocfs2_path *right) 1518 { 1519 int i = 0; 1520 1521 /* 1522 * Check that the caller passed in two paths from the same tree. 1523 */ 1524 BUG_ON(path_root_bh(left) != path_root_bh(right)); 1525 1526 do { 1527 i++; 1528 1529 /* 1530 * The caller didn't pass two adjacent paths. 1531 */ 1532 mlog_bug_on_msg(i > left->p_tree_depth, 1533 "Inode %lu, left depth %u, right depth %u\n" 1534 "left leaf blk %llu, right leaf blk %llu\n", 1535 inode->i_ino, left->p_tree_depth, 1536 right->p_tree_depth, 1537 (unsigned long long)path_leaf_bh(left)->b_blocknr, 1538 (unsigned long long)path_leaf_bh(right)->b_blocknr); 1539 } while (left->p_node[i].bh->b_blocknr == 1540 right->p_node[i].bh->b_blocknr); 1541 1542 return i - 1; 1543 } 1544 1545 typedef void (path_insert_t)(void *, struct buffer_head *); 1546 1547 /* 1548 * Traverse a btree path in search of cpos, starting at root_el. 1549 * 1550 * This code can be called with a cpos larger than the tree, in which 1551 * case it will return the rightmost path. 1552 */ 1553 static int __ocfs2_find_path(struct inode *inode, 1554 struct ocfs2_extent_list *root_el, u32 cpos, 1555 path_insert_t *func, void *data) 1556 { 1557 int i, ret = 0; 1558 u32 range; 1559 u64 blkno; 1560 struct buffer_head *bh = NULL; 1561 struct ocfs2_extent_block *eb; 1562 struct ocfs2_extent_list *el; 1563 struct ocfs2_extent_rec *rec; 1564 struct ocfs2_inode_info *oi = OCFS2_I(inode); 1565 1566 el = root_el; 1567 while (el->l_tree_depth) { 1568 if (le16_to_cpu(el->l_next_free_rec) == 0) { 1569 ocfs2_error(inode->i_sb, 1570 "Inode %llu has empty extent list at " 1571 "depth %u\n", 1572 (unsigned long long)oi->ip_blkno, 1573 le16_to_cpu(el->l_tree_depth)); 1574 ret = -EROFS; 1575 goto out; 1576 1577 } 1578 1579 for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { 1580 rec = &el->l_recs[i]; 1581 1582 /* 1583 * In the case that cpos is off the allocation 1584 * tree, this should just wind up returning the 1585 * rightmost record. 1586 */ 1587 range = le32_to_cpu(rec->e_cpos) + 1588 ocfs2_rec_clusters(el, rec); 1589 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 1590 break; 1591 } 1592 1593 blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1594 if (blkno == 0) { 1595 ocfs2_error(inode->i_sb, 1596 "Inode %llu has bad blkno in extent list " 1597 "at depth %u (index %d)\n", 1598 (unsigned long long)oi->ip_blkno, 1599 le16_to_cpu(el->l_tree_depth), i); 1600 ret = -EROFS; 1601 goto out; 1602 } 1603 1604 brelse(bh); 1605 bh = NULL; 1606 ret = ocfs2_read_extent_block(inode, blkno, &bh); 1607 if (ret) { 1608 mlog_errno(ret); 1609 goto out; 1610 } 1611 1612 eb = (struct ocfs2_extent_block *) bh->b_data; 1613 el = &eb->h_list; 1614 1615 if (le16_to_cpu(el->l_next_free_rec) > 1616 le16_to_cpu(el->l_count)) { 1617 ocfs2_error(inode->i_sb, 1618 "Inode %llu has bad count in extent list " 1619 "at block %llu (next free=%u, count=%u)\n", 1620 (unsigned long long)oi->ip_blkno, 1621 (unsigned long long)bh->b_blocknr, 1622 le16_to_cpu(el->l_next_free_rec), 1623 le16_to_cpu(el->l_count)); 1624 ret = -EROFS; 1625 goto out; 1626 } 1627 1628 if (func) 1629 func(data, bh); 1630 } 1631 1632 out: 1633 /* 1634 * Catch any trailing bh that the loop didn't handle. 1635 */ 1636 brelse(bh); 1637 1638 return ret; 1639 } 1640 1641 /* 1642 * Given an initialized path (that is, it has a valid root extent 1643 * list), this function will traverse the btree in search of the path 1644 * which would contain cpos. 1645 * 1646 * The path traveled is recorded in the path structure. 1647 * 1648 * Note that this will not do any comparisons on leaf node extent 1649 * records, so it will work fine in the case that we just added a tree 1650 * branch. 1651 */ 1652 struct find_path_data { 1653 int index; 1654 struct ocfs2_path *path; 1655 }; 1656 static void find_path_ins(void *data, struct buffer_head *bh) 1657 { 1658 struct find_path_data *fp = data; 1659 1660 get_bh(bh); 1661 ocfs2_path_insert_eb(fp->path, fp->index, bh); 1662 fp->index++; 1663 } 1664 static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, 1665 u32 cpos) 1666 { 1667 struct find_path_data data; 1668 1669 data.index = 1; 1670 data.path = path; 1671 return __ocfs2_find_path(inode, path_root_el(path), cpos, 1672 find_path_ins, &data); 1673 } 1674 1675 static void find_leaf_ins(void *data, struct buffer_head *bh) 1676 { 1677 struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; 1678 struct ocfs2_extent_list *el = &eb->h_list; 1679 struct buffer_head **ret = data; 1680 1681 /* We want to retain only the leaf block. */ 1682 if (le16_to_cpu(el->l_tree_depth) == 0) { 1683 get_bh(bh); 1684 *ret = bh; 1685 } 1686 } 1687 /* 1688 * Find the leaf block in the tree which would contain cpos. No 1689 * checking of the actual leaf is done. 1690 * 1691 * Some paths want to call this instead of allocating a path structure 1692 * and calling ocfs2_find_path(). 1693 * 1694 * This function doesn't handle non btree extent lists. 1695 */ 1696 int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 1697 u32 cpos, struct buffer_head **leaf_bh) 1698 { 1699 int ret; 1700 struct buffer_head *bh = NULL; 1701 1702 ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); 1703 if (ret) { 1704 mlog_errno(ret); 1705 goto out; 1706 } 1707 1708 *leaf_bh = bh; 1709 out: 1710 return ret; 1711 } 1712 1713 /* 1714 * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. 1715 * 1716 * Basically, we've moved stuff around at the bottom of the tree and 1717 * we need to fix up the extent records above the changes to reflect 1718 * the new changes. 1719 * 1720 * left_rec: the record on the left. 1721 * left_child_el: is the child list pointed to by left_rec 1722 * right_rec: the record to the right of left_rec 1723 * right_child_el: is the child list pointed to by right_rec 1724 * 1725 * By definition, this only works on interior nodes. 1726 */ 1727 static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, 1728 struct ocfs2_extent_list *left_child_el, 1729 struct ocfs2_extent_rec *right_rec, 1730 struct ocfs2_extent_list *right_child_el) 1731 { 1732 u32 left_clusters, right_end; 1733 1734 /* 1735 * Interior nodes never have holes. Their cpos is the cpos of 1736 * the leftmost record in their child list. Their cluster 1737 * count covers the full theoretical range of their child list 1738 * - the range between their cpos and the cpos of the record 1739 * immediately to their right. 1740 */ 1741 left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1742 if (ocfs2_is_empty_extent(&right_child_el->l_recs[0])) { 1743 BUG_ON(le16_to_cpu(right_child_el->l_next_free_rec) <= 1); 1744 left_clusters = le32_to_cpu(right_child_el->l_recs[1].e_cpos); 1745 } 1746 left_clusters -= le32_to_cpu(left_rec->e_cpos); 1747 left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1748 1749 /* 1750 * Calculate the rightmost cluster count boundary before 1751 * moving cpos - we will need to adjust clusters after 1752 * updating e_cpos to keep the same highest cluster count. 1753 */ 1754 right_end = le32_to_cpu(right_rec->e_cpos); 1755 right_end += le32_to_cpu(right_rec->e_int_clusters); 1756 1757 right_rec->e_cpos = left_rec->e_cpos; 1758 le32_add_cpu(&right_rec->e_cpos, left_clusters); 1759 1760 right_end -= le32_to_cpu(right_rec->e_cpos); 1761 right_rec->e_int_clusters = cpu_to_le32(right_end); 1762 } 1763 1764 /* 1765 * Adjust the adjacent root node records involved in a 1766 * rotation. left_el_blkno is passed in as a key so that we can easily 1767 * find it's index in the root list. 1768 */ 1769 static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, 1770 struct ocfs2_extent_list *left_el, 1771 struct ocfs2_extent_list *right_el, 1772 u64 left_el_blkno) 1773 { 1774 int i; 1775 1776 BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= 1777 le16_to_cpu(left_el->l_tree_depth)); 1778 1779 for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { 1780 if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) 1781 break; 1782 } 1783 1784 /* 1785 * The path walking code should have never returned a root and 1786 * two paths which are not adjacent. 1787 */ 1788 BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); 1789 1790 ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, 1791 &root_el->l_recs[i + 1], right_el); 1792 } 1793 1794 /* 1795 * We've changed a leaf block (in right_path) and need to reflect that 1796 * change back up the subtree. 1797 * 1798 * This happens in multiple places: 1799 * - When we've moved an extent record from the left path leaf to the right 1800 * path leaf to make room for an empty extent in the left path leaf. 1801 * - When our insert into the right path leaf is at the leftmost edge 1802 * and requires an update of the path immediately to it's left. This 1803 * can occur at the end of some types of rotation and appending inserts. 1804 * - When we've adjusted the last extent record in the left path leaf and the 1805 * 1st extent record in the right path leaf during cross extent block merge. 1806 */ 1807 static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 1808 struct ocfs2_path *left_path, 1809 struct ocfs2_path *right_path, 1810 int subtree_index) 1811 { 1812 int ret, i, idx; 1813 struct ocfs2_extent_list *el, *left_el, *right_el; 1814 struct ocfs2_extent_rec *left_rec, *right_rec; 1815 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 1816 1817 /* 1818 * Update the counts and position values within all the 1819 * interior nodes to reflect the leaf rotation we just did. 1820 * 1821 * The root node is handled below the loop. 1822 * 1823 * We begin the loop with right_el and left_el pointing to the 1824 * leaf lists and work our way up. 1825 * 1826 * NOTE: within this loop, left_el and right_el always refer 1827 * to the *child* lists. 1828 */ 1829 left_el = path_leaf_el(left_path); 1830 right_el = path_leaf_el(right_path); 1831 for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { 1832 mlog(0, "Adjust records at index %u\n", i); 1833 1834 /* 1835 * One nice property of knowing that all of these 1836 * nodes are below the root is that we only deal with 1837 * the leftmost right node record and the rightmost 1838 * left node record. 1839 */ 1840 el = left_path->p_node[i].el; 1841 idx = le16_to_cpu(left_el->l_next_free_rec) - 1; 1842 left_rec = &el->l_recs[idx]; 1843 1844 el = right_path->p_node[i].el; 1845 right_rec = &el->l_recs[0]; 1846 1847 ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 1848 right_el); 1849 1850 ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 1851 if (ret) 1852 mlog_errno(ret); 1853 1854 ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); 1855 if (ret) 1856 mlog_errno(ret); 1857 1858 /* 1859 * Setup our list pointers now so that the current 1860 * parents become children in the next iteration. 1861 */ 1862 left_el = left_path->p_node[i].el; 1863 right_el = right_path->p_node[i].el; 1864 } 1865 1866 /* 1867 * At the root node, adjust the two adjacent records which 1868 * begin our path to the leaves. 1869 */ 1870 1871 el = left_path->p_node[subtree_index].el; 1872 left_el = left_path->p_node[subtree_index + 1].el; 1873 right_el = right_path->p_node[subtree_index + 1].el; 1874 1875 ocfs2_adjust_root_records(el, left_el, right_el, 1876 left_path->p_node[subtree_index + 1].bh->b_blocknr); 1877 1878 root_bh = left_path->p_node[subtree_index].bh; 1879 1880 ret = ocfs2_journal_dirty(handle, root_bh); 1881 if (ret) 1882 mlog_errno(ret); 1883 } 1884 1885 static int ocfs2_rotate_subtree_right(struct inode *inode, 1886 handle_t *handle, 1887 struct ocfs2_path *left_path, 1888 struct ocfs2_path *right_path, 1889 int subtree_index) 1890 { 1891 int ret, i; 1892 struct buffer_head *right_leaf_bh; 1893 struct buffer_head *left_leaf_bh = NULL; 1894 struct buffer_head *root_bh; 1895 struct ocfs2_extent_list *right_el, *left_el; 1896 struct ocfs2_extent_rec move_rec; 1897 1898 left_leaf_bh = path_leaf_bh(left_path); 1899 left_el = path_leaf_el(left_path); 1900 1901 if (left_el->l_next_free_rec != left_el->l_count) { 1902 ocfs2_error(inode->i_sb, 1903 "Inode %llu has non-full interior leaf node %llu" 1904 "(next free = %u)", 1905 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1906 (unsigned long long)left_leaf_bh->b_blocknr, 1907 le16_to_cpu(left_el->l_next_free_rec)); 1908 return -EROFS; 1909 } 1910 1911 /* 1912 * This extent block may already have an empty record, so we 1913 * return early if so. 1914 */ 1915 if (ocfs2_is_empty_extent(&left_el->l_recs[0])) 1916 return 0; 1917 1918 root_bh = left_path->p_node[subtree_index].bh; 1919 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 1920 1921 ret = ocfs2_journal_access(handle, inode, root_bh, 1922 OCFS2_JOURNAL_ACCESS_WRITE); 1923 if (ret) { 1924 mlog_errno(ret); 1925 goto out; 1926 } 1927 1928 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 1929 ret = ocfs2_journal_access(handle, inode, 1930 right_path->p_node[i].bh, 1931 OCFS2_JOURNAL_ACCESS_WRITE); 1932 if (ret) { 1933 mlog_errno(ret); 1934 goto out; 1935 } 1936 1937 ret = ocfs2_journal_access(handle, inode, 1938 left_path->p_node[i].bh, 1939 OCFS2_JOURNAL_ACCESS_WRITE); 1940 if (ret) { 1941 mlog_errno(ret); 1942 goto out; 1943 } 1944 } 1945 1946 right_leaf_bh = path_leaf_bh(right_path); 1947 right_el = path_leaf_el(right_path); 1948 1949 /* This is a code error, not a disk corruption. */ 1950 mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " 1951 "because rightmost leaf block %llu is empty\n", 1952 (unsigned long long)OCFS2_I(inode)->ip_blkno, 1953 (unsigned long long)right_leaf_bh->b_blocknr); 1954 1955 ocfs2_create_empty_extent(right_el); 1956 1957 ret = ocfs2_journal_dirty(handle, right_leaf_bh); 1958 if (ret) { 1959 mlog_errno(ret); 1960 goto out; 1961 } 1962 1963 /* Do the copy now. */ 1964 i = le16_to_cpu(left_el->l_next_free_rec) - 1; 1965 move_rec = left_el->l_recs[i]; 1966 right_el->l_recs[0] = move_rec; 1967 1968 /* 1969 * Clear out the record we just copied and shift everything 1970 * over, leaving an empty extent in the left leaf. 1971 * 1972 * We temporarily subtract from next_free_rec so that the 1973 * shift will lose the tail record (which is now defunct). 1974 */ 1975 le16_add_cpu(&left_el->l_next_free_rec, -1); 1976 ocfs2_shift_records_right(left_el); 1977 memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 1978 le16_add_cpu(&left_el->l_next_free_rec, 1); 1979 1980 ret = ocfs2_journal_dirty(handle, left_leaf_bh); 1981 if (ret) { 1982 mlog_errno(ret); 1983 goto out; 1984 } 1985 1986 ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 1987 subtree_index); 1988 1989 out: 1990 return ret; 1991 } 1992 1993 /* 1994 * Given a full path, determine what cpos value would return us a path 1995 * containing the leaf immediately to the left of the current one. 1996 * 1997 * Will return zero if the path passed in is already the leftmost path. 1998 */ 1999 static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 2000 struct ocfs2_path *path, u32 *cpos) 2001 { 2002 int i, j, ret = 0; 2003 u64 blkno; 2004 struct ocfs2_extent_list *el; 2005 2006 BUG_ON(path->p_tree_depth == 0); 2007 2008 *cpos = 0; 2009 2010 blkno = path_leaf_bh(path)->b_blocknr; 2011 2012 /* Start at the tree node just above the leaf and work our way up. */ 2013 i = path->p_tree_depth - 1; 2014 while (i >= 0) { 2015 el = path->p_node[i].el; 2016 2017 /* 2018 * Find the extent record just before the one in our 2019 * path. 2020 */ 2021 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { 2022 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { 2023 if (j == 0) { 2024 if (i == 0) { 2025 /* 2026 * We've determined that the 2027 * path specified is already 2028 * the leftmost one - return a 2029 * cpos of zero. 2030 */ 2031 goto out; 2032 } 2033 /* 2034 * The leftmost record points to our 2035 * leaf - we need to travel up the 2036 * tree one level. 2037 */ 2038 goto next_node; 2039 } 2040 2041 *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); 2042 *cpos = *cpos + ocfs2_rec_clusters(el, 2043 &el->l_recs[j - 1]); 2044 *cpos = *cpos - 1; 2045 goto out; 2046 } 2047 } 2048 2049 /* 2050 * If we got here, we never found a valid node where 2051 * the tree indicated one should be. 2052 */ 2053 ocfs2_error(sb, 2054 "Invalid extent tree at extent block %llu\n", 2055 (unsigned long long)blkno); 2056 ret = -EROFS; 2057 goto out; 2058 2059 next_node: 2060 blkno = path->p_node[i].bh->b_blocknr; 2061 i--; 2062 } 2063 2064 out: 2065 return ret; 2066 } 2067 2068 /* 2069 * Extend the transaction by enough credits to complete the rotation, 2070 * and still leave at least the original number of credits allocated 2071 * to this transaction. 2072 */ 2073 static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 2074 int op_credits, 2075 struct ocfs2_path *path) 2076 { 2077 int credits = (path->p_tree_depth - subtree_depth) * 2 + 1 + op_credits; 2078 2079 if (handle->h_buffer_credits < credits) 2080 return ocfs2_extend_trans(handle, credits); 2081 2082 return 0; 2083 } 2084 2085 /* 2086 * Trap the case where we're inserting into the theoretical range past 2087 * the _actual_ left leaf range. Otherwise, we'll rotate a record 2088 * whose cpos is less than ours into the right leaf. 2089 * 2090 * It's only necessary to look at the rightmost record of the left 2091 * leaf because the logic that calls us should ensure that the 2092 * theoretical ranges in the path components above the leaves are 2093 * correct. 2094 */ 2095 static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, 2096 u32 insert_cpos) 2097 { 2098 struct ocfs2_extent_list *left_el; 2099 struct ocfs2_extent_rec *rec; 2100 int next_free; 2101 2102 left_el = path_leaf_el(left_path); 2103 next_free = le16_to_cpu(left_el->l_next_free_rec); 2104 rec = &left_el->l_recs[next_free - 1]; 2105 2106 if (insert_cpos > le32_to_cpu(rec->e_cpos)) 2107 return 1; 2108 return 0; 2109 } 2110 2111 static int ocfs2_leftmost_rec_contains(struct ocfs2_extent_list *el, u32 cpos) 2112 { 2113 int next_free = le16_to_cpu(el->l_next_free_rec); 2114 unsigned int range; 2115 struct ocfs2_extent_rec *rec; 2116 2117 if (next_free == 0) 2118 return 0; 2119 2120 rec = &el->l_recs[0]; 2121 if (ocfs2_is_empty_extent(rec)) { 2122 /* Empty list. */ 2123 if (next_free == 1) 2124 return 0; 2125 rec = &el->l_recs[1]; 2126 } 2127 2128 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 2129 if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 2130 return 1; 2131 return 0; 2132 } 2133 2134 /* 2135 * Rotate all the records in a btree right one record, starting at insert_cpos. 2136 * 2137 * The path to the rightmost leaf should be passed in. 2138 * 2139 * The array is assumed to be large enough to hold an entire path (tree depth). 2140 * 2141 * Upon succesful return from this function: 2142 * 2143 * - The 'right_path' array will contain a path to the leaf block 2144 * whose range contains e_cpos. 2145 * - That leaf block will have a single empty extent in list index 0. 2146 * - In the case that the rotation requires a post-insert update, 2147 * *ret_left_path will contain a valid path which can be passed to 2148 * ocfs2_insert_path(). 2149 */ 2150 static int ocfs2_rotate_tree_right(struct inode *inode, 2151 handle_t *handle, 2152 enum ocfs2_split_type split, 2153 u32 insert_cpos, 2154 struct ocfs2_path *right_path, 2155 struct ocfs2_path **ret_left_path) 2156 { 2157 int ret, start, orig_credits = handle->h_buffer_credits; 2158 u32 cpos; 2159 struct ocfs2_path *left_path = NULL; 2160 2161 *ret_left_path = NULL; 2162 2163 left_path = ocfs2_new_path_from_path(right_path); 2164 if (!left_path) { 2165 ret = -ENOMEM; 2166 mlog_errno(ret); 2167 goto out; 2168 } 2169 2170 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); 2171 if (ret) { 2172 mlog_errno(ret); 2173 goto out; 2174 } 2175 2176 mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); 2177 2178 /* 2179 * What we want to do here is: 2180 * 2181 * 1) Start with the rightmost path. 2182 * 2183 * 2) Determine a path to the leaf block directly to the left 2184 * of that leaf. 2185 * 2186 * 3) Determine the 'subtree root' - the lowest level tree node 2187 * which contains a path to both leaves. 2188 * 2189 * 4) Rotate the subtree. 2190 * 2191 * 5) Find the next subtree by considering the left path to be 2192 * the new right path. 2193 * 2194 * The check at the top of this while loop also accepts 2195 * insert_cpos == cpos because cpos is only a _theoretical_ 2196 * value to get us the left path - insert_cpos might very well 2197 * be filling that hole. 2198 * 2199 * Stop at a cpos of '0' because we either started at the 2200 * leftmost branch (i.e., a tree with one branch and a 2201 * rotation inside of it), or we've gone as far as we can in 2202 * rotating subtrees. 2203 */ 2204 while (cpos && insert_cpos <= cpos) { 2205 mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", 2206 insert_cpos, cpos); 2207 2208 ret = ocfs2_find_path(inode, left_path, cpos); 2209 if (ret) { 2210 mlog_errno(ret); 2211 goto out; 2212 } 2213 2214 mlog_bug_on_msg(path_leaf_bh(left_path) == 2215 path_leaf_bh(right_path), 2216 "Inode %lu: error during insert of %u " 2217 "(left path cpos %u) results in two identical " 2218 "paths ending at %llu\n", 2219 inode->i_ino, insert_cpos, cpos, 2220 (unsigned long long) 2221 path_leaf_bh(left_path)->b_blocknr); 2222 2223 if (split == SPLIT_NONE && 2224 ocfs2_rotate_requires_path_adjustment(left_path, 2225 insert_cpos)) { 2226 2227 /* 2228 * We've rotated the tree as much as we 2229 * should. The rest is up to 2230 * ocfs2_insert_path() to complete, after the 2231 * record insertion. We indicate this 2232 * situation by returning the left path. 2233 * 2234 * The reason we don't adjust the records here 2235 * before the record insert is that an error 2236 * later might break the rule where a parent 2237 * record e_cpos will reflect the actual 2238 * e_cpos of the 1st nonempty record of the 2239 * child list. 2240 */ 2241 *ret_left_path = left_path; 2242 goto out_ret_path; 2243 } 2244 2245 start = ocfs2_find_subtree_root(inode, left_path, right_path); 2246 2247 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2248 start, 2249 (unsigned long long) right_path->p_node[start].bh->b_blocknr, 2250 right_path->p_tree_depth); 2251 2252 ret = ocfs2_extend_rotate_transaction(handle, start, 2253 orig_credits, right_path); 2254 if (ret) { 2255 mlog_errno(ret); 2256 goto out; 2257 } 2258 2259 ret = ocfs2_rotate_subtree_right(inode, handle, left_path, 2260 right_path, start); 2261 if (ret) { 2262 mlog_errno(ret); 2263 goto out; 2264 } 2265 2266 if (split != SPLIT_NONE && 2267 ocfs2_leftmost_rec_contains(path_leaf_el(right_path), 2268 insert_cpos)) { 2269 /* 2270 * A rotate moves the rightmost left leaf 2271 * record over to the leftmost right leaf 2272 * slot. If we're doing an extent split 2273 * instead of a real insert, then we have to 2274 * check that the extent to be split wasn't 2275 * just moved over. If it was, then we can 2276 * exit here, passing left_path back - 2277 * ocfs2_split_extent() is smart enough to 2278 * search both leaves. 2279 */ 2280 *ret_left_path = left_path; 2281 goto out_ret_path; 2282 } 2283 2284 /* 2285 * There is no need to re-read the next right path 2286 * as we know that it'll be our current left 2287 * path. Optimize by copying values instead. 2288 */ 2289 ocfs2_mv_path(right_path, left_path); 2290 2291 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 2292 &cpos); 2293 if (ret) { 2294 mlog_errno(ret); 2295 goto out; 2296 } 2297 } 2298 2299 out: 2300 ocfs2_free_path(left_path); 2301 2302 out_ret_path: 2303 return ret; 2304 } 2305 2306 static void ocfs2_update_edge_lengths(struct inode *inode, handle_t *handle, 2307 struct ocfs2_path *path) 2308 { 2309 int i, idx; 2310 struct ocfs2_extent_rec *rec; 2311 struct ocfs2_extent_list *el; 2312 struct ocfs2_extent_block *eb; 2313 u32 range; 2314 2315 /* Path should always be rightmost. */ 2316 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 2317 BUG_ON(eb->h_next_leaf_blk != 0ULL); 2318 2319 el = &eb->h_list; 2320 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 2321 idx = le16_to_cpu(el->l_next_free_rec) - 1; 2322 rec = &el->l_recs[idx]; 2323 range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 2324 2325 for (i = 0; i < path->p_tree_depth; i++) { 2326 el = path->p_node[i].el; 2327 idx = le16_to_cpu(el->l_next_free_rec) - 1; 2328 rec = &el->l_recs[idx]; 2329 2330 rec->e_int_clusters = cpu_to_le32(range); 2331 le32_add_cpu(&rec->e_int_clusters, -le32_to_cpu(rec->e_cpos)); 2332 2333 ocfs2_journal_dirty(handle, path->p_node[i].bh); 2334 } 2335 } 2336 2337 static void ocfs2_unlink_path(struct inode *inode, handle_t *handle, 2338 struct ocfs2_cached_dealloc_ctxt *dealloc, 2339 struct ocfs2_path *path, int unlink_start) 2340 { 2341 int ret, i; 2342 struct ocfs2_extent_block *eb; 2343 struct ocfs2_extent_list *el; 2344 struct buffer_head *bh; 2345 2346 for(i = unlink_start; i < path_num_items(path); i++) { 2347 bh = path->p_node[i].bh; 2348 2349 eb = (struct ocfs2_extent_block *)bh->b_data; 2350 /* 2351 * Not all nodes might have had their final count 2352 * decremented by the caller - handle this here. 2353 */ 2354 el = &eb->h_list; 2355 if (le16_to_cpu(el->l_next_free_rec) > 1) { 2356 mlog(ML_ERROR, 2357 "Inode %llu, attempted to remove extent block " 2358 "%llu with %u records\n", 2359 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2360 (unsigned long long)le64_to_cpu(eb->h_blkno), 2361 le16_to_cpu(el->l_next_free_rec)); 2362 2363 ocfs2_journal_dirty(handle, bh); 2364 ocfs2_remove_from_cache(inode, bh); 2365 continue; 2366 } 2367 2368 el->l_next_free_rec = 0; 2369 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2370 2371 ocfs2_journal_dirty(handle, bh); 2372 2373 ret = ocfs2_cache_extent_block_free(dealloc, eb); 2374 if (ret) 2375 mlog_errno(ret); 2376 2377 ocfs2_remove_from_cache(inode, bh); 2378 } 2379 } 2380 2381 static void ocfs2_unlink_subtree(struct inode *inode, handle_t *handle, 2382 struct ocfs2_path *left_path, 2383 struct ocfs2_path *right_path, 2384 int subtree_index, 2385 struct ocfs2_cached_dealloc_ctxt *dealloc) 2386 { 2387 int i; 2388 struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 2389 struct ocfs2_extent_list *root_el = left_path->p_node[subtree_index].el; 2390 struct ocfs2_extent_list *el; 2391 struct ocfs2_extent_block *eb; 2392 2393 el = path_leaf_el(left_path); 2394 2395 eb = (struct ocfs2_extent_block *)right_path->p_node[subtree_index + 1].bh->b_data; 2396 2397 for(i = 1; i < le16_to_cpu(root_el->l_next_free_rec); i++) 2398 if (root_el->l_recs[i].e_blkno == eb->h_blkno) 2399 break; 2400 2401 BUG_ON(i >= le16_to_cpu(root_el->l_next_free_rec)); 2402 2403 memset(&root_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 2404 le16_add_cpu(&root_el->l_next_free_rec, -1); 2405 2406 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2407 eb->h_next_leaf_blk = 0; 2408 2409 ocfs2_journal_dirty(handle, root_bh); 2410 ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2411 2412 ocfs2_unlink_path(inode, handle, dealloc, right_path, 2413 subtree_index + 1); 2414 } 2415 2416 static int ocfs2_rotate_subtree_left(struct inode *inode, handle_t *handle, 2417 struct ocfs2_path *left_path, 2418 struct ocfs2_path *right_path, 2419 int subtree_index, 2420 struct ocfs2_cached_dealloc_ctxt *dealloc, 2421 int *deleted, 2422 struct ocfs2_extent_tree *et) 2423 { 2424 int ret, i, del_right_subtree = 0, right_has_empty = 0; 2425 struct buffer_head *root_bh, *et_root_bh = path_root_bh(right_path); 2426 struct ocfs2_extent_list *right_leaf_el, *left_leaf_el; 2427 struct ocfs2_extent_block *eb; 2428 2429 *deleted = 0; 2430 2431 right_leaf_el = path_leaf_el(right_path); 2432 left_leaf_el = path_leaf_el(left_path); 2433 root_bh = left_path->p_node[subtree_index].bh; 2434 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 2435 2436 if (!ocfs2_is_empty_extent(&left_leaf_el->l_recs[0])) 2437 return 0; 2438 2439 eb = (struct ocfs2_extent_block *)path_leaf_bh(right_path)->b_data; 2440 if (ocfs2_is_empty_extent(&right_leaf_el->l_recs[0])) { 2441 /* 2442 * It's legal for us to proceed if the right leaf is 2443 * the rightmost one and it has an empty extent. There 2444 * are two cases to handle - whether the leaf will be 2445 * empty after removal or not. If the leaf isn't empty 2446 * then just remove the empty extent up front. The 2447 * next block will handle empty leaves by flagging 2448 * them for unlink. 2449 * 2450 * Non rightmost leaves will throw -EAGAIN and the 2451 * caller can manually move the subtree and retry. 2452 */ 2453 2454 if (eb->h_next_leaf_blk != 0ULL) 2455 return -EAGAIN; 2456 2457 if (le16_to_cpu(right_leaf_el->l_next_free_rec) > 1) { 2458 ret = ocfs2_journal_access(handle, inode, 2459 path_leaf_bh(right_path), 2460 OCFS2_JOURNAL_ACCESS_WRITE); 2461 if (ret) { 2462 mlog_errno(ret); 2463 goto out; 2464 } 2465 2466 ocfs2_remove_empty_extent(right_leaf_el); 2467 } else 2468 right_has_empty = 1; 2469 } 2470 2471 if (eb->h_next_leaf_blk == 0ULL && 2472 le16_to_cpu(right_leaf_el->l_next_free_rec) == 1) { 2473 /* 2474 * We have to update i_last_eb_blk during the meta 2475 * data delete. 2476 */ 2477 ret = ocfs2_journal_access(handle, inode, et_root_bh, 2478 OCFS2_JOURNAL_ACCESS_WRITE); 2479 if (ret) { 2480 mlog_errno(ret); 2481 goto out; 2482 } 2483 2484 del_right_subtree = 1; 2485 } 2486 2487 /* 2488 * Getting here with an empty extent in the right path implies 2489 * that it's the rightmost path and will be deleted. 2490 */ 2491 BUG_ON(right_has_empty && !del_right_subtree); 2492 2493 ret = ocfs2_journal_access(handle, inode, root_bh, 2494 OCFS2_JOURNAL_ACCESS_WRITE); 2495 if (ret) { 2496 mlog_errno(ret); 2497 goto out; 2498 } 2499 2500 for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 2501 ret = ocfs2_journal_access(handle, inode, 2502 right_path->p_node[i].bh, 2503 OCFS2_JOURNAL_ACCESS_WRITE); 2504 if (ret) { 2505 mlog_errno(ret); 2506 goto out; 2507 } 2508 2509 ret = ocfs2_journal_access(handle, inode, 2510 left_path->p_node[i].bh, 2511 OCFS2_JOURNAL_ACCESS_WRITE); 2512 if (ret) { 2513 mlog_errno(ret); 2514 goto out; 2515 } 2516 } 2517 2518 if (!right_has_empty) { 2519 /* 2520 * Only do this if we're moving a real 2521 * record. Otherwise, the action is delayed until 2522 * after removal of the right path in which case we 2523 * can do a simple shift to remove the empty extent. 2524 */ 2525 ocfs2_rotate_leaf(left_leaf_el, &right_leaf_el->l_recs[0]); 2526 memset(&right_leaf_el->l_recs[0], 0, 2527 sizeof(struct ocfs2_extent_rec)); 2528 } 2529 if (eb->h_next_leaf_blk == 0ULL) { 2530 /* 2531 * Move recs over to get rid of empty extent, decrease 2532 * next_free. This is allowed to remove the last 2533 * extent in our leaf (setting l_next_free_rec to 2534 * zero) - the delete code below won't care. 2535 */ 2536 ocfs2_remove_empty_extent(right_leaf_el); 2537 } 2538 2539 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 2540 if (ret) 2541 mlog_errno(ret); 2542 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 2543 if (ret) 2544 mlog_errno(ret); 2545 2546 if (del_right_subtree) { 2547 ocfs2_unlink_subtree(inode, handle, left_path, right_path, 2548 subtree_index, dealloc); 2549 ocfs2_update_edge_lengths(inode, handle, left_path); 2550 2551 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2552 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 2553 2554 /* 2555 * Removal of the extent in the left leaf was skipped 2556 * above so we could delete the right path 2557 * 1st. 2558 */ 2559 if (right_has_empty) 2560 ocfs2_remove_empty_extent(left_leaf_el); 2561 2562 ret = ocfs2_journal_dirty(handle, et_root_bh); 2563 if (ret) 2564 mlog_errno(ret); 2565 2566 *deleted = 1; 2567 } else 2568 ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 2569 subtree_index); 2570 2571 out: 2572 return ret; 2573 } 2574 2575 /* 2576 * Given a full path, determine what cpos value would return us a path 2577 * containing the leaf immediately to the right of the current one. 2578 * 2579 * Will return zero if the path passed in is already the rightmost path. 2580 * 2581 * This looks similar, but is subtly different to 2582 * ocfs2_find_cpos_for_left_leaf(). 2583 */ 2584 static int ocfs2_find_cpos_for_right_leaf(struct super_block *sb, 2585 struct ocfs2_path *path, u32 *cpos) 2586 { 2587 int i, j, ret = 0; 2588 u64 blkno; 2589 struct ocfs2_extent_list *el; 2590 2591 *cpos = 0; 2592 2593 if (path->p_tree_depth == 0) 2594 return 0; 2595 2596 blkno = path_leaf_bh(path)->b_blocknr; 2597 2598 /* Start at the tree node just above the leaf and work our way up. */ 2599 i = path->p_tree_depth - 1; 2600 while (i >= 0) { 2601 int next_free; 2602 2603 el = path->p_node[i].el; 2604 2605 /* 2606 * Find the extent record just after the one in our 2607 * path. 2608 */ 2609 next_free = le16_to_cpu(el->l_next_free_rec); 2610 for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { 2611 if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { 2612 if (j == (next_free - 1)) { 2613 if (i == 0) { 2614 /* 2615 * We've determined that the 2616 * path specified is already 2617 * the rightmost one - return a 2618 * cpos of zero. 2619 */ 2620 goto out; 2621 } 2622 /* 2623 * The rightmost record points to our 2624 * leaf - we need to travel up the 2625 * tree one level. 2626 */ 2627 goto next_node; 2628 } 2629 2630 *cpos = le32_to_cpu(el->l_recs[j + 1].e_cpos); 2631 goto out; 2632 } 2633 } 2634 2635 /* 2636 * If we got here, we never found a valid node where 2637 * the tree indicated one should be. 2638 */ 2639 ocfs2_error(sb, 2640 "Invalid extent tree at extent block %llu\n", 2641 (unsigned long long)blkno); 2642 ret = -EROFS; 2643 goto out; 2644 2645 next_node: 2646 blkno = path->p_node[i].bh->b_blocknr; 2647 i--; 2648 } 2649 2650 out: 2651 return ret; 2652 } 2653 2654 static int ocfs2_rotate_rightmost_leaf_left(struct inode *inode, 2655 handle_t *handle, 2656 struct buffer_head *bh, 2657 struct ocfs2_extent_list *el) 2658 { 2659 int ret; 2660 2661 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2662 return 0; 2663 2664 ret = ocfs2_journal_access(handle, inode, bh, 2665 OCFS2_JOURNAL_ACCESS_WRITE); 2666 if (ret) { 2667 mlog_errno(ret); 2668 goto out; 2669 } 2670 2671 ocfs2_remove_empty_extent(el); 2672 2673 ret = ocfs2_journal_dirty(handle, bh); 2674 if (ret) 2675 mlog_errno(ret); 2676 2677 out: 2678 return ret; 2679 } 2680 2681 static int __ocfs2_rotate_tree_left(struct inode *inode, 2682 handle_t *handle, int orig_credits, 2683 struct ocfs2_path *path, 2684 struct ocfs2_cached_dealloc_ctxt *dealloc, 2685 struct ocfs2_path **empty_extent_path, 2686 struct ocfs2_extent_tree *et) 2687 { 2688 int ret, subtree_root, deleted; 2689 u32 right_cpos; 2690 struct ocfs2_path *left_path = NULL; 2691 struct ocfs2_path *right_path = NULL; 2692 2693 BUG_ON(!ocfs2_is_empty_extent(&(path_leaf_el(path)->l_recs[0]))); 2694 2695 *empty_extent_path = NULL; 2696 2697 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, path, 2698 &right_cpos); 2699 if (ret) { 2700 mlog_errno(ret); 2701 goto out; 2702 } 2703 2704 left_path = ocfs2_new_path_from_path(path); 2705 if (!left_path) { 2706 ret = -ENOMEM; 2707 mlog_errno(ret); 2708 goto out; 2709 } 2710 2711 ocfs2_cp_path(left_path, path); 2712 2713 right_path = ocfs2_new_path_from_path(path); 2714 if (!right_path) { 2715 ret = -ENOMEM; 2716 mlog_errno(ret); 2717 goto out; 2718 } 2719 2720 while (right_cpos) { 2721 ret = ocfs2_find_path(inode, right_path, right_cpos); 2722 if (ret) { 2723 mlog_errno(ret); 2724 goto out; 2725 } 2726 2727 subtree_root = ocfs2_find_subtree_root(inode, left_path, 2728 right_path); 2729 2730 mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 2731 subtree_root, 2732 (unsigned long long) 2733 right_path->p_node[subtree_root].bh->b_blocknr, 2734 right_path->p_tree_depth); 2735 2736 ret = ocfs2_extend_rotate_transaction(handle, subtree_root, 2737 orig_credits, left_path); 2738 if (ret) { 2739 mlog_errno(ret); 2740 goto out; 2741 } 2742 2743 /* 2744 * Caller might still want to make changes to the 2745 * tree root, so re-add it to the journal here. 2746 */ 2747 ret = ocfs2_journal_access(handle, inode, 2748 path_root_bh(left_path), 2749 OCFS2_JOURNAL_ACCESS_WRITE); 2750 if (ret) { 2751 mlog_errno(ret); 2752 goto out; 2753 } 2754 2755 ret = ocfs2_rotate_subtree_left(inode, handle, left_path, 2756 right_path, subtree_root, 2757 dealloc, &deleted, et); 2758 if (ret == -EAGAIN) { 2759 /* 2760 * The rotation has to temporarily stop due to 2761 * the right subtree having an empty 2762 * extent. Pass it back to the caller for a 2763 * fixup. 2764 */ 2765 *empty_extent_path = right_path; 2766 right_path = NULL; 2767 goto out; 2768 } 2769 if (ret) { 2770 mlog_errno(ret); 2771 goto out; 2772 } 2773 2774 /* 2775 * The subtree rotate might have removed records on 2776 * the rightmost edge. If so, then rotation is 2777 * complete. 2778 */ 2779 if (deleted) 2780 break; 2781 2782 ocfs2_mv_path(left_path, right_path); 2783 2784 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, 2785 &right_cpos); 2786 if (ret) { 2787 mlog_errno(ret); 2788 goto out; 2789 } 2790 } 2791 2792 out: 2793 ocfs2_free_path(right_path); 2794 ocfs2_free_path(left_path); 2795 2796 return ret; 2797 } 2798 2799 static int ocfs2_remove_rightmost_path(struct inode *inode, handle_t *handle, 2800 struct ocfs2_path *path, 2801 struct ocfs2_cached_dealloc_ctxt *dealloc, 2802 struct ocfs2_extent_tree *et) 2803 { 2804 int ret, subtree_index; 2805 u32 cpos; 2806 struct ocfs2_path *left_path = NULL; 2807 struct ocfs2_extent_block *eb; 2808 struct ocfs2_extent_list *el; 2809 2810 2811 ret = ocfs2_et_sanity_check(inode, et); 2812 if (ret) 2813 goto out; 2814 /* 2815 * There's two ways we handle this depending on 2816 * whether path is the only existing one. 2817 */ 2818 ret = ocfs2_extend_rotate_transaction(handle, 0, 2819 handle->h_buffer_credits, 2820 path); 2821 if (ret) { 2822 mlog_errno(ret); 2823 goto out; 2824 } 2825 2826 ret = ocfs2_journal_access_path(inode, handle, path); 2827 if (ret) { 2828 mlog_errno(ret); 2829 goto out; 2830 } 2831 2832 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); 2833 if (ret) { 2834 mlog_errno(ret); 2835 goto out; 2836 } 2837 2838 if (cpos) { 2839 /* 2840 * We have a path to the left of this one - it needs 2841 * an update too. 2842 */ 2843 left_path = ocfs2_new_path_from_path(path); 2844 if (!left_path) { 2845 ret = -ENOMEM; 2846 mlog_errno(ret); 2847 goto out; 2848 } 2849 2850 ret = ocfs2_find_path(inode, left_path, cpos); 2851 if (ret) { 2852 mlog_errno(ret); 2853 goto out; 2854 } 2855 2856 ret = ocfs2_journal_access_path(inode, handle, left_path); 2857 if (ret) { 2858 mlog_errno(ret); 2859 goto out; 2860 } 2861 2862 subtree_index = ocfs2_find_subtree_root(inode, left_path, path); 2863 2864 ocfs2_unlink_subtree(inode, handle, left_path, path, 2865 subtree_index, dealloc); 2866 ocfs2_update_edge_lengths(inode, handle, left_path); 2867 2868 eb = (struct ocfs2_extent_block *)path_leaf_bh(left_path)->b_data; 2869 ocfs2_et_set_last_eb_blk(et, le64_to_cpu(eb->h_blkno)); 2870 } else { 2871 /* 2872 * 'path' is also the leftmost path which 2873 * means it must be the only one. This gets 2874 * handled differently because we want to 2875 * revert the inode back to having extents 2876 * in-line. 2877 */ 2878 ocfs2_unlink_path(inode, handle, dealloc, path, 1); 2879 2880 el = et->et_root_el; 2881 el->l_tree_depth = 0; 2882 el->l_next_free_rec = 0; 2883 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 2884 2885 ocfs2_et_set_last_eb_blk(et, 0); 2886 } 2887 2888 ocfs2_journal_dirty(handle, path_root_bh(path)); 2889 2890 out: 2891 ocfs2_free_path(left_path); 2892 return ret; 2893 } 2894 2895 /* 2896 * Left rotation of btree records. 2897 * 2898 * In many ways, this is (unsurprisingly) the opposite of right 2899 * rotation. We start at some non-rightmost path containing an empty 2900 * extent in the leaf block. The code works its way to the rightmost 2901 * path by rotating records to the left in every subtree. 2902 * 2903 * This is used by any code which reduces the number of extent records 2904 * in a leaf. After removal, an empty record should be placed in the 2905 * leftmost list position. 2906 * 2907 * This won't handle a length update of the rightmost path records if 2908 * the rightmost tree leaf record is removed so the caller is 2909 * responsible for detecting and correcting that. 2910 */ 2911 static int ocfs2_rotate_tree_left(struct inode *inode, handle_t *handle, 2912 struct ocfs2_path *path, 2913 struct ocfs2_cached_dealloc_ctxt *dealloc, 2914 struct ocfs2_extent_tree *et) 2915 { 2916 int ret, orig_credits = handle->h_buffer_credits; 2917 struct ocfs2_path *tmp_path = NULL, *restart_path = NULL; 2918 struct ocfs2_extent_block *eb; 2919 struct ocfs2_extent_list *el; 2920 2921 el = path_leaf_el(path); 2922 if (!ocfs2_is_empty_extent(&el->l_recs[0])) 2923 return 0; 2924 2925 if (path->p_tree_depth == 0) { 2926 rightmost_no_delete: 2927 /* 2928 * Inline extents. This is trivially handled, so do 2929 * it up front. 2930 */ 2931 ret = ocfs2_rotate_rightmost_leaf_left(inode, handle, 2932 path_leaf_bh(path), 2933 path_leaf_el(path)); 2934 if (ret) 2935 mlog_errno(ret); 2936 goto out; 2937 } 2938 2939 /* 2940 * Handle rightmost branch now. There's several cases: 2941 * 1) simple rotation leaving records in there. That's trivial. 2942 * 2) rotation requiring a branch delete - there's no more 2943 * records left. Two cases of this: 2944 * a) There are branches to the left. 2945 * b) This is also the leftmost (the only) branch. 2946 * 2947 * 1) is handled via ocfs2_rotate_rightmost_leaf_left() 2948 * 2a) we need the left branch so that we can update it with the unlink 2949 * 2b) we need to bring the inode back to inline extents. 2950 */ 2951 2952 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 2953 el = &eb->h_list; 2954 if (eb->h_next_leaf_blk == 0) { 2955 /* 2956 * This gets a bit tricky if we're going to delete the 2957 * rightmost path. Get the other cases out of the way 2958 * 1st. 2959 */ 2960 if (le16_to_cpu(el->l_next_free_rec) > 1) 2961 goto rightmost_no_delete; 2962 2963 if (le16_to_cpu(el->l_next_free_rec) == 0) { 2964 ret = -EIO; 2965 ocfs2_error(inode->i_sb, 2966 "Inode %llu has empty extent block at %llu", 2967 (unsigned long long)OCFS2_I(inode)->ip_blkno, 2968 (unsigned long long)le64_to_cpu(eb->h_blkno)); 2969 goto out; 2970 } 2971 2972 /* 2973 * XXX: The caller can not trust "path" any more after 2974 * this as it will have been deleted. What do we do? 2975 * 2976 * In theory the rotate-for-merge code will never get 2977 * here because it'll always ask for a rotate in a 2978 * nonempty list. 2979 */ 2980 2981 ret = ocfs2_remove_rightmost_path(inode, handle, path, 2982 dealloc, et); 2983 if (ret) 2984 mlog_errno(ret); 2985 goto out; 2986 } 2987 2988 /* 2989 * Now we can loop, remembering the path we get from -EAGAIN 2990 * and restarting from there. 2991 */ 2992 try_rotate: 2993 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, path, 2994 dealloc, &restart_path, et); 2995 if (ret && ret != -EAGAIN) { 2996 mlog_errno(ret); 2997 goto out; 2998 } 2999 3000 while (ret == -EAGAIN) { 3001 tmp_path = restart_path; 3002 restart_path = NULL; 3003 3004 ret = __ocfs2_rotate_tree_left(inode, handle, orig_credits, 3005 tmp_path, dealloc, 3006 &restart_path, et); 3007 if (ret && ret != -EAGAIN) { 3008 mlog_errno(ret); 3009 goto out; 3010 } 3011 3012 ocfs2_free_path(tmp_path); 3013 tmp_path = NULL; 3014 3015 if (ret == 0) 3016 goto try_rotate; 3017 } 3018 3019 out: 3020 ocfs2_free_path(tmp_path); 3021 ocfs2_free_path(restart_path); 3022 return ret; 3023 } 3024 3025 static void ocfs2_cleanup_merge(struct ocfs2_extent_list *el, 3026 int index) 3027 { 3028 struct ocfs2_extent_rec *rec = &el->l_recs[index]; 3029 unsigned int size; 3030 3031 if (rec->e_leaf_clusters == 0) { 3032 /* 3033 * We consumed all of the merged-from record. An empty 3034 * extent cannot exist anywhere but the 1st array 3035 * position, so move things over if the merged-from 3036 * record doesn't occupy that position. 3037 * 3038 * This creates a new empty extent so the caller 3039 * should be smart enough to have removed any existing 3040 * ones. 3041 */ 3042 if (index > 0) { 3043 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); 3044 size = index * sizeof(struct ocfs2_extent_rec); 3045 memmove(&el->l_recs[1], &el->l_recs[0], size); 3046 } 3047 3048 /* 3049 * Always memset - the caller doesn't check whether it 3050 * created an empty extent, so there could be junk in 3051 * the other fields. 3052 */ 3053 memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 3054 } 3055 } 3056 3057 static int ocfs2_get_right_path(struct inode *inode, 3058 struct ocfs2_path *left_path, 3059 struct ocfs2_path **ret_right_path) 3060 { 3061 int ret; 3062 u32 right_cpos; 3063 struct ocfs2_path *right_path = NULL; 3064 struct ocfs2_extent_list *left_el; 3065 3066 *ret_right_path = NULL; 3067 3068 /* This function shouldn't be called for non-trees. */ 3069 BUG_ON(left_path->p_tree_depth == 0); 3070 3071 left_el = path_leaf_el(left_path); 3072 BUG_ON(left_el->l_next_free_rec != left_el->l_count); 3073 3074 ret = ocfs2_find_cpos_for_right_leaf(inode->i_sb, left_path, 3075 &right_cpos); 3076 if (ret) { 3077 mlog_errno(ret); 3078 goto out; 3079 } 3080 3081 /* This function shouldn't be called for the rightmost leaf. */ 3082 BUG_ON(right_cpos == 0); 3083 3084 right_path = ocfs2_new_path_from_path(left_path); 3085 if (!right_path) { 3086 ret = -ENOMEM; 3087 mlog_errno(ret); 3088 goto out; 3089 } 3090 3091 ret = ocfs2_find_path(inode, right_path, right_cpos); 3092 if (ret) { 3093 mlog_errno(ret); 3094 goto out; 3095 } 3096 3097 *ret_right_path = right_path; 3098 out: 3099 if (ret) 3100 ocfs2_free_path(right_path); 3101 return ret; 3102 } 3103 3104 /* 3105 * Remove split_rec clusters from the record at index and merge them 3106 * onto the beginning of the record "next" to it. 3107 * For index < l_count - 1, the next means the extent rec at index + 1. 3108 * For index == l_count - 1, the "next" means the 1st extent rec of the 3109 * next extent block. 3110 */ 3111 static int ocfs2_merge_rec_right(struct inode *inode, 3112 struct ocfs2_path *left_path, 3113 handle_t *handle, 3114 struct ocfs2_extent_rec *split_rec, 3115 int index) 3116 { 3117 int ret, next_free, i; 3118 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 3119 struct ocfs2_extent_rec *left_rec; 3120 struct ocfs2_extent_rec *right_rec; 3121 struct ocfs2_extent_list *right_el; 3122 struct ocfs2_path *right_path = NULL; 3123 int subtree_index = 0; 3124 struct ocfs2_extent_list *el = path_leaf_el(left_path); 3125 struct buffer_head *bh = path_leaf_bh(left_path); 3126 struct buffer_head *root_bh = NULL; 3127 3128 BUG_ON(index >= le16_to_cpu(el->l_next_free_rec)); 3129 left_rec = &el->l_recs[index]; 3130 3131 if (index == le16_to_cpu(el->l_next_free_rec) - 1 && 3132 le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count)) { 3133 /* we meet with a cross extent block merge. */ 3134 ret = ocfs2_get_right_path(inode, left_path, &right_path); 3135 if (ret) { 3136 mlog_errno(ret); 3137 goto out; 3138 } 3139 3140 right_el = path_leaf_el(right_path); 3141 next_free = le16_to_cpu(right_el->l_next_free_rec); 3142 BUG_ON(next_free <= 0); 3143 right_rec = &right_el->l_recs[0]; 3144 if (ocfs2_is_empty_extent(right_rec)) { 3145 BUG_ON(next_free <= 1); 3146 right_rec = &right_el->l_recs[1]; 3147 } 3148 3149 BUG_ON(le32_to_cpu(left_rec->e_cpos) + 3150 le16_to_cpu(left_rec->e_leaf_clusters) != 3151 le32_to_cpu(right_rec->e_cpos)); 3152 3153 subtree_index = ocfs2_find_subtree_root(inode, 3154 left_path, right_path); 3155 3156 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 3157 handle->h_buffer_credits, 3158 right_path); 3159 if (ret) { 3160 mlog_errno(ret); 3161 goto out; 3162 } 3163 3164 root_bh = left_path->p_node[subtree_index].bh; 3165 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3166 3167 ret = ocfs2_journal_access(handle, inode, root_bh, 3168 OCFS2_JOURNAL_ACCESS_WRITE); 3169 if (ret) { 3170 mlog_errno(ret); 3171 goto out; 3172 } 3173 3174 for (i = subtree_index + 1; 3175 i < path_num_items(right_path); i++) { 3176 ret = ocfs2_journal_access(handle, inode, 3177 right_path->p_node[i].bh, 3178 OCFS2_JOURNAL_ACCESS_WRITE); 3179 if (ret) { 3180 mlog_errno(ret); 3181 goto out; 3182 } 3183 3184 ret = ocfs2_journal_access(handle, inode, 3185 left_path->p_node[i].bh, 3186 OCFS2_JOURNAL_ACCESS_WRITE); 3187 if (ret) { 3188 mlog_errno(ret); 3189 goto out; 3190 } 3191 } 3192 3193 } else { 3194 BUG_ON(index == le16_to_cpu(el->l_next_free_rec) - 1); 3195 right_rec = &el->l_recs[index + 1]; 3196 } 3197 3198 ret = ocfs2_journal_access(handle, inode, bh, 3199 OCFS2_JOURNAL_ACCESS_WRITE); 3200 if (ret) { 3201 mlog_errno(ret); 3202 goto out; 3203 } 3204 3205 le16_add_cpu(&left_rec->e_leaf_clusters, -split_clusters); 3206 3207 le32_add_cpu(&right_rec->e_cpos, -split_clusters); 3208 le64_add_cpu(&right_rec->e_blkno, 3209 -ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); 3210 le16_add_cpu(&right_rec->e_leaf_clusters, split_clusters); 3211 3212 ocfs2_cleanup_merge(el, index); 3213 3214 ret = ocfs2_journal_dirty(handle, bh); 3215 if (ret) 3216 mlog_errno(ret); 3217 3218 if (right_path) { 3219 ret = ocfs2_journal_dirty(handle, path_leaf_bh(right_path)); 3220 if (ret) 3221 mlog_errno(ret); 3222 3223 ocfs2_complete_edge_insert(inode, handle, left_path, 3224 right_path, subtree_index); 3225 } 3226 out: 3227 if (right_path) 3228 ocfs2_free_path(right_path); 3229 return ret; 3230 } 3231 3232 static int ocfs2_get_left_path(struct inode *inode, 3233 struct ocfs2_path *right_path, 3234 struct ocfs2_path **ret_left_path) 3235 { 3236 int ret; 3237 u32 left_cpos; 3238 struct ocfs2_path *left_path = NULL; 3239 3240 *ret_left_path = NULL; 3241 3242 /* This function shouldn't be called for non-trees. */ 3243 BUG_ON(right_path->p_tree_depth == 0); 3244 3245 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 3246 right_path, &left_cpos); 3247 if (ret) { 3248 mlog_errno(ret); 3249 goto out; 3250 } 3251 3252 /* This function shouldn't be called for the leftmost leaf. */ 3253 BUG_ON(left_cpos == 0); 3254 3255 left_path = ocfs2_new_path_from_path(right_path); 3256 if (!left_path) { 3257 ret = -ENOMEM; 3258 mlog_errno(ret); 3259 goto out; 3260 } 3261 3262 ret = ocfs2_find_path(inode, left_path, left_cpos); 3263 if (ret) { 3264 mlog_errno(ret); 3265 goto out; 3266 } 3267 3268 *ret_left_path = left_path; 3269 out: 3270 if (ret) 3271 ocfs2_free_path(left_path); 3272 return ret; 3273 } 3274 3275 /* 3276 * Remove split_rec clusters from the record at index and merge them 3277 * onto the tail of the record "before" it. 3278 * For index > 0, the "before" means the extent rec at index - 1. 3279 * 3280 * For index == 0, the "before" means the last record of the previous 3281 * extent block. And there is also a situation that we may need to 3282 * remove the rightmost leaf extent block in the right_path and change 3283 * the right path to indicate the new rightmost path. 3284 */ 3285 static int ocfs2_merge_rec_left(struct inode *inode, 3286 struct ocfs2_path *right_path, 3287 handle_t *handle, 3288 struct ocfs2_extent_rec *split_rec, 3289 struct ocfs2_cached_dealloc_ctxt *dealloc, 3290 struct ocfs2_extent_tree *et, 3291 int index) 3292 { 3293 int ret, i, subtree_index = 0, has_empty_extent = 0; 3294 unsigned int split_clusters = le16_to_cpu(split_rec->e_leaf_clusters); 3295 struct ocfs2_extent_rec *left_rec; 3296 struct ocfs2_extent_rec *right_rec; 3297 struct ocfs2_extent_list *el = path_leaf_el(right_path); 3298 struct buffer_head *bh = path_leaf_bh(right_path); 3299 struct buffer_head *root_bh = NULL; 3300 struct ocfs2_path *left_path = NULL; 3301 struct ocfs2_extent_list *left_el; 3302 3303 BUG_ON(index < 0); 3304 3305 right_rec = &el->l_recs[index]; 3306 if (index == 0) { 3307 /* we meet with a cross extent block merge. */ 3308 ret = ocfs2_get_left_path(inode, right_path, &left_path); 3309 if (ret) { 3310 mlog_errno(ret); 3311 goto out; 3312 } 3313 3314 left_el = path_leaf_el(left_path); 3315 BUG_ON(le16_to_cpu(left_el->l_next_free_rec) != 3316 le16_to_cpu(left_el->l_count)); 3317 3318 left_rec = &left_el->l_recs[ 3319 le16_to_cpu(left_el->l_next_free_rec) - 1]; 3320 BUG_ON(le32_to_cpu(left_rec->e_cpos) + 3321 le16_to_cpu(left_rec->e_leaf_clusters) != 3322 le32_to_cpu(split_rec->e_cpos)); 3323 3324 subtree_index = ocfs2_find_subtree_root(inode, 3325 left_path, right_path); 3326 3327 ret = ocfs2_extend_rotate_transaction(handle, subtree_index, 3328 handle->h_buffer_credits, 3329 left_path); 3330 if (ret) { 3331 mlog_errno(ret); 3332 goto out; 3333 } 3334 3335 root_bh = left_path->p_node[subtree_index].bh; 3336 BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 3337 3338 ret = ocfs2_journal_access(handle, inode, root_bh, 3339 OCFS2_JOURNAL_ACCESS_WRITE); 3340 if (ret) { 3341 mlog_errno(ret); 3342 goto out; 3343 } 3344 3345 for (i = subtree_index + 1; 3346 i < path_num_items(right_path); i++) { 3347 ret = ocfs2_journal_access(handle, inode, 3348 right_path->p_node[i].bh, 3349 OCFS2_JOURNAL_ACCESS_WRITE); 3350 if (ret) { 3351 mlog_errno(ret); 3352 goto out; 3353 } 3354 3355 ret = ocfs2_journal_access(handle, inode, 3356 left_path->p_node[i].bh, 3357 OCFS2_JOURNAL_ACCESS_WRITE); 3358 if (ret) { 3359 mlog_errno(ret); 3360 goto out; 3361 } 3362 } 3363 } else { 3364 left_rec = &el->l_recs[index - 1]; 3365 if (ocfs2_is_empty_extent(&el->l_recs[0])) 3366 has_empty_extent = 1; 3367 } 3368 3369 ret = ocfs2_journal_access(handle, inode, bh, 3370 OCFS2_JOURNAL_ACCESS_WRITE); 3371 if (ret) { 3372 mlog_errno(ret); 3373 goto out; 3374 } 3375 3376 if (has_empty_extent && index == 1) { 3377 /* 3378 * The easy case - we can just plop the record right in. 3379 */ 3380 *left_rec = *split_rec; 3381 3382 has_empty_extent = 0; 3383 } else 3384 le16_add_cpu(&left_rec->e_leaf_clusters, split_clusters); 3385 3386 le32_add_cpu(&right_rec->e_cpos, split_clusters); 3387 le64_add_cpu(&right_rec->e_blkno, 3388 ocfs2_clusters_to_blocks(inode->i_sb, split_clusters)); 3389 le16_add_cpu(&right_rec->e_leaf_clusters, -split_clusters); 3390 3391 ocfs2_cleanup_merge(el, index); 3392 3393 ret = ocfs2_journal_dirty(handle, bh); 3394 if (ret) 3395 mlog_errno(ret); 3396 3397 if (left_path) { 3398 ret = ocfs2_journal_dirty(handle, path_leaf_bh(left_path)); 3399 if (ret) 3400 mlog_errno(ret); 3401 3402 /* 3403 * In the situation that the right_rec is empty and the extent 3404 * block is empty also, ocfs2_complete_edge_insert can't handle 3405 * it and we need to delete the right extent block. 3406 */ 3407 if (le16_to_cpu(right_rec->e_leaf_clusters) == 0 && 3408 le16_to_cpu(el->l_next_free_rec) == 1) { 3409 3410 ret = ocfs2_remove_rightmost_path(inode, handle, 3411 right_path, 3412 dealloc, et); 3413 if (ret) { 3414 mlog_errno(ret); 3415 goto out; 3416 } 3417 3418 /* Now the rightmost extent block has been deleted. 3419 * So we use the new rightmost path. 3420 */ 3421 ocfs2_mv_path(right_path, left_path); 3422 left_path = NULL; 3423 } else 3424 ocfs2_complete_edge_insert(inode, handle, left_path, 3425 right_path, subtree_index); 3426 } 3427 out: 3428 if (left_path) 3429 ocfs2_free_path(left_path); 3430 return ret; 3431 } 3432 3433 static int ocfs2_try_to_merge_extent(struct inode *inode, 3434 handle_t *handle, 3435 struct ocfs2_path *path, 3436 int split_index, 3437 struct ocfs2_extent_rec *split_rec, 3438 struct ocfs2_cached_dealloc_ctxt *dealloc, 3439 struct ocfs2_merge_ctxt *ctxt, 3440 struct ocfs2_extent_tree *et) 3441 3442 { 3443 int ret = 0; 3444 struct ocfs2_extent_list *el = path_leaf_el(path); 3445 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 3446 3447 BUG_ON(ctxt->c_contig_type == CONTIG_NONE); 3448 3449 if (ctxt->c_split_covers_rec && ctxt->c_has_empty_extent) { 3450 /* 3451 * The merge code will need to create an empty 3452 * extent to take the place of the newly 3453 * emptied slot. Remove any pre-existing empty 3454 * extents - having more than one in a leaf is 3455 * illegal. 3456 */ 3457 ret = ocfs2_rotate_tree_left(inode, handle, path, 3458 dealloc, et); 3459 if (ret) { 3460 mlog_errno(ret); 3461 goto out; 3462 } 3463 split_index--; 3464 rec = &el->l_recs[split_index]; 3465 } 3466 3467 if (ctxt->c_contig_type == CONTIG_LEFTRIGHT) { 3468 /* 3469 * Left-right contig implies this. 3470 */ 3471 BUG_ON(!ctxt->c_split_covers_rec); 3472 3473 /* 3474 * Since the leftright insert always covers the entire 3475 * extent, this call will delete the insert record 3476 * entirely, resulting in an empty extent record added to 3477 * the extent block. 3478 * 3479 * Since the adding of an empty extent shifts 3480 * everything back to the right, there's no need to 3481 * update split_index here. 3482 * 3483 * When the split_index is zero, we need to merge it to the 3484 * prevoius extent block. It is more efficient and easier 3485 * if we do merge_right first and merge_left later. 3486 */ 3487 ret = ocfs2_merge_rec_right(inode, path, 3488 handle, split_rec, 3489 split_index); 3490 if (ret) { 3491 mlog_errno(ret); 3492 goto out; 3493 } 3494 3495 /* 3496 * We can only get this from logic error above. 3497 */ 3498 BUG_ON(!ocfs2_is_empty_extent(&el->l_recs[0])); 3499 3500 /* The merge left us with an empty extent, remove it. */ 3501 ret = ocfs2_rotate_tree_left(inode, handle, path, 3502 dealloc, et); 3503 if (ret) { 3504 mlog_errno(ret); 3505 goto out; 3506 } 3507 3508 rec = &el->l_recs[split_index]; 3509 3510 /* 3511 * Note that we don't pass split_rec here on purpose - 3512 * we've merged it into the rec already. 3513 */ 3514 ret = ocfs2_merge_rec_left(inode, path, 3515 handle, rec, 3516 dealloc, et, 3517 split_index); 3518 3519 if (ret) { 3520 mlog_errno(ret); 3521 goto out; 3522 } 3523 3524 ret = ocfs2_rotate_tree_left(inode, handle, path, 3525 dealloc, et); 3526 /* 3527 * Error from this last rotate is not critical, so 3528 * print but don't bubble it up. 3529 */ 3530 if (ret) 3531 mlog_errno(ret); 3532 ret = 0; 3533 } else { 3534 /* 3535 * Merge a record to the left or right. 3536 * 3537 * 'contig_type' is relative to the existing record, 3538 * so for example, if we're "right contig", it's to 3539 * the record on the left (hence the left merge). 3540 */ 3541 if (ctxt->c_contig_type == CONTIG_RIGHT) { 3542 ret = ocfs2_merge_rec_left(inode, 3543 path, 3544 handle, split_rec, 3545 dealloc, et, 3546 split_index); 3547 if (ret) { 3548 mlog_errno(ret); 3549 goto out; 3550 } 3551 } else { 3552 ret = ocfs2_merge_rec_right(inode, 3553 path, 3554 handle, split_rec, 3555 split_index); 3556 if (ret) { 3557 mlog_errno(ret); 3558 goto out; 3559 } 3560 } 3561 3562 if (ctxt->c_split_covers_rec) { 3563 /* 3564 * The merge may have left an empty extent in 3565 * our leaf. Try to rotate it away. 3566 */ 3567 ret = ocfs2_rotate_tree_left(inode, handle, path, 3568 dealloc, et); 3569 if (ret) 3570 mlog_errno(ret); 3571 ret = 0; 3572 } 3573 } 3574 3575 out: 3576 return ret; 3577 } 3578 3579 static void ocfs2_subtract_from_rec(struct super_block *sb, 3580 enum ocfs2_split_type split, 3581 struct ocfs2_extent_rec *rec, 3582 struct ocfs2_extent_rec *split_rec) 3583 { 3584 u64 len_blocks; 3585 3586 len_blocks = ocfs2_clusters_to_blocks(sb, 3587 le16_to_cpu(split_rec->e_leaf_clusters)); 3588 3589 if (split == SPLIT_LEFT) { 3590 /* 3591 * Region is on the left edge of the existing 3592 * record. 3593 */ 3594 le32_add_cpu(&rec->e_cpos, 3595 le16_to_cpu(split_rec->e_leaf_clusters)); 3596 le64_add_cpu(&rec->e_blkno, len_blocks); 3597 le16_add_cpu(&rec->e_leaf_clusters, 3598 -le16_to_cpu(split_rec->e_leaf_clusters)); 3599 } else { 3600 /* 3601 * Region is on the right edge of the existing 3602 * record. 3603 */ 3604 le16_add_cpu(&rec->e_leaf_clusters, 3605 -le16_to_cpu(split_rec->e_leaf_clusters)); 3606 } 3607 } 3608 3609 /* 3610 * Do the final bits of extent record insertion at the target leaf 3611 * list. If this leaf is part of an allocation tree, it is assumed 3612 * that the tree above has been prepared. 3613 */ 3614 static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, 3615 struct ocfs2_extent_list *el, 3616 struct ocfs2_insert_type *insert, 3617 struct inode *inode) 3618 { 3619 int i = insert->ins_contig_index; 3620 unsigned int range; 3621 struct ocfs2_extent_rec *rec; 3622 3623 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 3624 3625 if (insert->ins_split != SPLIT_NONE) { 3626 i = ocfs2_search_extent_list(el, le32_to_cpu(insert_rec->e_cpos)); 3627 BUG_ON(i == -1); 3628 rec = &el->l_recs[i]; 3629 ocfs2_subtract_from_rec(inode->i_sb, insert->ins_split, rec, 3630 insert_rec); 3631 goto rotate; 3632 } 3633 3634 /* 3635 * Contiguous insert - either left or right. 3636 */ 3637 if (insert->ins_contig != CONTIG_NONE) { 3638 rec = &el->l_recs[i]; 3639 if (insert->ins_contig == CONTIG_LEFT) { 3640 rec->e_blkno = insert_rec->e_blkno; 3641 rec->e_cpos = insert_rec->e_cpos; 3642 } 3643 le16_add_cpu(&rec->e_leaf_clusters, 3644 le16_to_cpu(insert_rec->e_leaf_clusters)); 3645 return; 3646 } 3647 3648 /* 3649 * Handle insert into an empty leaf. 3650 */ 3651 if (le16_to_cpu(el->l_next_free_rec) == 0 || 3652 ((le16_to_cpu(el->l_next_free_rec) == 1) && 3653 ocfs2_is_empty_extent(&el->l_recs[0]))) { 3654 el->l_recs[0] = *insert_rec; 3655 el->l_next_free_rec = cpu_to_le16(1); 3656 return; 3657 } 3658 3659 /* 3660 * Appending insert. 3661 */ 3662 if (insert->ins_appending == APPEND_TAIL) { 3663 i = le16_to_cpu(el->l_next_free_rec) - 1; 3664 rec = &el->l_recs[i]; 3665 range = le32_to_cpu(rec->e_cpos) 3666 + le16_to_cpu(rec->e_leaf_clusters); 3667 BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); 3668 3669 mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= 3670 le16_to_cpu(el->l_count), 3671 "inode %lu, depth %u, count %u, next free %u, " 3672 "rec.cpos %u, rec.clusters %u, " 3673 "insert.cpos %u, insert.clusters %u\n", 3674 inode->i_ino, 3675 le16_to_cpu(el->l_tree_depth), 3676 le16_to_cpu(el->l_count), 3677 le16_to_cpu(el->l_next_free_rec), 3678 le32_to_cpu(el->l_recs[i].e_cpos), 3679 le16_to_cpu(el->l_recs[i].e_leaf_clusters), 3680 le32_to_cpu(insert_rec->e_cpos), 3681 le16_to_cpu(insert_rec->e_leaf_clusters)); 3682 i++; 3683 el->l_recs[i] = *insert_rec; 3684 le16_add_cpu(&el->l_next_free_rec, 1); 3685 return; 3686 } 3687 3688 rotate: 3689 /* 3690 * Ok, we have to rotate. 3691 * 3692 * At this point, it is safe to assume that inserting into an 3693 * empty leaf and appending to a leaf have both been handled 3694 * above. 3695 * 3696 * This leaf needs to have space, either by the empty 1st 3697 * extent record, or by virtue of an l_next_rec < l_count. 3698 */ 3699 ocfs2_rotate_leaf(el, insert_rec); 3700 } 3701 3702 static void ocfs2_adjust_rightmost_records(struct inode *inode, 3703 handle_t *handle, 3704 struct ocfs2_path *path, 3705 struct ocfs2_extent_rec *insert_rec) 3706 { 3707 int ret, i, next_free; 3708 struct buffer_head *bh; 3709 struct ocfs2_extent_list *el; 3710 struct ocfs2_extent_rec *rec; 3711 3712 /* 3713 * Update everything except the leaf block. 3714 */ 3715 for (i = 0; i < path->p_tree_depth; i++) { 3716 bh = path->p_node[i].bh; 3717 el = path->p_node[i].el; 3718 3719 next_free = le16_to_cpu(el->l_next_free_rec); 3720 if (next_free == 0) { 3721 ocfs2_error(inode->i_sb, 3722 "Dinode %llu has a bad extent list", 3723 (unsigned long long)OCFS2_I(inode)->ip_blkno); 3724 ret = -EIO; 3725 return; 3726 } 3727 3728 rec = &el->l_recs[next_free - 1]; 3729 3730 rec->e_int_clusters = insert_rec->e_cpos; 3731 le32_add_cpu(&rec->e_int_clusters, 3732 le16_to_cpu(insert_rec->e_leaf_clusters)); 3733 le32_add_cpu(&rec->e_int_clusters, 3734 -le32_to_cpu(rec->e_cpos)); 3735 3736 ret = ocfs2_journal_dirty(handle, bh); 3737 if (ret) 3738 mlog_errno(ret); 3739 3740 } 3741 } 3742 3743 static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 3744 struct ocfs2_extent_rec *insert_rec, 3745 struct ocfs2_path *right_path, 3746 struct ocfs2_path **ret_left_path) 3747 { 3748 int ret, next_free; 3749 struct ocfs2_extent_list *el; 3750 struct ocfs2_path *left_path = NULL; 3751 3752 *ret_left_path = NULL; 3753 3754 /* 3755 * This shouldn't happen for non-trees. The extent rec cluster 3756 * count manipulation below only works for interior nodes. 3757 */ 3758 BUG_ON(right_path->p_tree_depth == 0); 3759 3760 /* 3761 * If our appending insert is at the leftmost edge of a leaf, 3762 * then we might need to update the rightmost records of the 3763 * neighboring path. 3764 */ 3765 el = path_leaf_el(right_path); 3766 next_free = le16_to_cpu(el->l_next_free_rec); 3767 if (next_free == 0 || 3768 (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { 3769 u32 left_cpos; 3770 3771 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 3772 &left_cpos); 3773 if (ret) { 3774 mlog_errno(ret); 3775 goto out; 3776 } 3777 3778 mlog(0, "Append may need a left path update. cpos: %u, " 3779 "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), 3780 left_cpos); 3781 3782 /* 3783 * No need to worry if the append is already in the 3784 * leftmost leaf. 3785 */ 3786 if (left_cpos) { 3787 left_path = ocfs2_new_path_from_path(right_path); 3788 if (!left_path) { 3789 ret = -ENOMEM; 3790 mlog_errno(ret); 3791 goto out; 3792 } 3793 3794 ret = ocfs2_find_path(inode, left_path, left_cpos); 3795 if (ret) { 3796 mlog_errno(ret); 3797 goto out; 3798 } 3799 3800 /* 3801 * ocfs2_insert_path() will pass the left_path to the 3802 * journal for us. 3803 */ 3804 } 3805 } 3806 3807 ret = ocfs2_journal_access_path(inode, handle, right_path); 3808 if (ret) { 3809 mlog_errno(ret); 3810 goto out; 3811 } 3812 3813 ocfs2_adjust_rightmost_records(inode, handle, right_path, insert_rec); 3814 3815 *ret_left_path = left_path; 3816 ret = 0; 3817 out: 3818 if (ret != 0) 3819 ocfs2_free_path(left_path); 3820 3821 return ret; 3822 } 3823 3824 static void ocfs2_split_record(struct inode *inode, 3825 struct ocfs2_path *left_path, 3826 struct ocfs2_path *right_path, 3827 struct ocfs2_extent_rec *split_rec, 3828 enum ocfs2_split_type split) 3829 { 3830 int index; 3831 u32 cpos = le32_to_cpu(split_rec->e_cpos); 3832 struct ocfs2_extent_list *left_el = NULL, *right_el, *insert_el, *el; 3833 struct ocfs2_extent_rec *rec, *tmprec; 3834 3835 right_el = path_leaf_el(right_path);; 3836 if (left_path) 3837 left_el = path_leaf_el(left_path); 3838 3839 el = right_el; 3840 insert_el = right_el; 3841 index = ocfs2_search_extent_list(el, cpos); 3842 if (index != -1) { 3843 if (index == 0 && left_path) { 3844 BUG_ON(ocfs2_is_empty_extent(&el->l_recs[0])); 3845 3846 /* 3847 * This typically means that the record 3848 * started in the left path but moved to the 3849 * right as a result of rotation. We either 3850 * move the existing record to the left, or we 3851 * do the later insert there. 3852 * 3853 * In this case, the left path should always 3854 * exist as the rotate code will have passed 3855 * it back for a post-insert update. 3856 */ 3857 3858 if (split == SPLIT_LEFT) { 3859 /* 3860 * It's a left split. Since we know 3861 * that the rotate code gave us an 3862 * empty extent in the left path, we 3863 * can just do the insert there. 3864 */ 3865 insert_el = left_el; 3866 } else { 3867 /* 3868 * Right split - we have to move the 3869 * existing record over to the left 3870 * leaf. The insert will be into the 3871 * newly created empty extent in the 3872 * right leaf. 3873 */ 3874 tmprec = &right_el->l_recs[index]; 3875 ocfs2_rotate_leaf(left_el, tmprec); 3876 el = left_el; 3877 3878 memset(tmprec, 0, sizeof(*tmprec)); 3879 index = ocfs2_search_extent_list(left_el, cpos); 3880 BUG_ON(index == -1); 3881 } 3882 } 3883 } else { 3884 BUG_ON(!left_path); 3885 BUG_ON(!ocfs2_is_empty_extent(&left_el->l_recs[0])); 3886 /* 3887 * Left path is easy - we can just allow the insert to 3888 * happen. 3889 */ 3890 el = left_el; 3891 insert_el = left_el; 3892 index = ocfs2_search_extent_list(el, cpos); 3893 BUG_ON(index == -1); 3894 } 3895 3896 rec = &el->l_recs[index]; 3897 ocfs2_subtract_from_rec(inode->i_sb, split, rec, split_rec); 3898 ocfs2_rotate_leaf(insert_el, split_rec); 3899 } 3900 3901 /* 3902 * This function only does inserts on an allocation b-tree. For tree 3903 * depth = 0, ocfs2_insert_at_leaf() is called directly. 3904 * 3905 * right_path is the path we want to do the actual insert 3906 * in. left_path should only be passed in if we need to update that 3907 * portion of the tree after an edge insert. 3908 */ 3909 static int ocfs2_insert_path(struct inode *inode, 3910 handle_t *handle, 3911 struct ocfs2_path *left_path, 3912 struct ocfs2_path *right_path, 3913 struct ocfs2_extent_rec *insert_rec, 3914 struct ocfs2_insert_type *insert) 3915 { 3916 int ret, subtree_index; 3917 struct buffer_head *leaf_bh = path_leaf_bh(right_path); 3918 3919 if (left_path) { 3920 int credits = handle->h_buffer_credits; 3921 3922 /* 3923 * There's a chance that left_path got passed back to 3924 * us without being accounted for in the 3925 * journal. Extend our transaction here to be sure we 3926 * can change those blocks. 3927 */ 3928 credits += left_path->p_tree_depth; 3929 3930 ret = ocfs2_extend_trans(handle, credits); 3931 if (ret < 0) { 3932 mlog_errno(ret); 3933 goto out; 3934 } 3935 3936 ret = ocfs2_journal_access_path(inode, handle, left_path); 3937 if (ret < 0) { 3938 mlog_errno(ret); 3939 goto out; 3940 } 3941 } 3942 3943 /* 3944 * Pass both paths to the journal. The majority of inserts 3945 * will be touching all components anyway. 3946 */ 3947 ret = ocfs2_journal_access_path(inode, handle, right_path); 3948 if (ret < 0) { 3949 mlog_errno(ret); 3950 goto out; 3951 } 3952 3953 if (insert->ins_split != SPLIT_NONE) { 3954 /* 3955 * We could call ocfs2_insert_at_leaf() for some types 3956 * of splits, but it's easier to just let one separate 3957 * function sort it all out. 3958 */ 3959 ocfs2_split_record(inode, left_path, right_path, 3960 insert_rec, insert->ins_split); 3961 3962 /* 3963 * Split might have modified either leaf and we don't 3964 * have a guarantee that the later edge insert will 3965 * dirty this for us. 3966 */ 3967 if (left_path) 3968 ret = ocfs2_journal_dirty(handle, 3969 path_leaf_bh(left_path)); 3970 if (ret) 3971 mlog_errno(ret); 3972 } else 3973 ocfs2_insert_at_leaf(insert_rec, path_leaf_el(right_path), 3974 insert, inode); 3975 3976 ret = ocfs2_journal_dirty(handle, leaf_bh); 3977 if (ret) 3978 mlog_errno(ret); 3979 3980 if (left_path) { 3981 /* 3982 * The rotate code has indicated that we need to fix 3983 * up portions of the tree after the insert. 3984 * 3985 * XXX: Should we extend the transaction here? 3986 */ 3987 subtree_index = ocfs2_find_subtree_root(inode, left_path, 3988 right_path); 3989 ocfs2_complete_edge_insert(inode, handle, left_path, 3990 right_path, subtree_index); 3991 } 3992 3993 ret = 0; 3994 out: 3995 return ret; 3996 } 3997 3998 static int ocfs2_do_insert_extent(struct inode *inode, 3999 handle_t *handle, 4000 struct ocfs2_extent_tree *et, 4001 struct ocfs2_extent_rec *insert_rec, 4002 struct ocfs2_insert_type *type) 4003 { 4004 int ret, rotate = 0; 4005 u32 cpos; 4006 struct ocfs2_path *right_path = NULL; 4007 struct ocfs2_path *left_path = NULL; 4008 struct ocfs2_extent_list *el; 4009 4010 el = et->et_root_el; 4011 4012 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 4013 OCFS2_JOURNAL_ACCESS_WRITE); 4014 if (ret) { 4015 mlog_errno(ret); 4016 goto out; 4017 } 4018 4019 if (le16_to_cpu(el->l_tree_depth) == 0) { 4020 ocfs2_insert_at_leaf(insert_rec, el, type, inode); 4021 goto out_update_clusters; 4022 } 4023 4024 right_path = ocfs2_new_path_from_et(et); 4025 if (!right_path) { 4026 ret = -ENOMEM; 4027 mlog_errno(ret); 4028 goto out; 4029 } 4030 4031 /* 4032 * Determine the path to start with. Rotations need the 4033 * rightmost path, everything else can go directly to the 4034 * target leaf. 4035 */ 4036 cpos = le32_to_cpu(insert_rec->e_cpos); 4037 if (type->ins_appending == APPEND_NONE && 4038 type->ins_contig == CONTIG_NONE) { 4039 rotate = 1; 4040 cpos = UINT_MAX; 4041 } 4042 4043 ret = ocfs2_find_path(inode, right_path, cpos); 4044 if (ret) { 4045 mlog_errno(ret); 4046 goto out; 4047 } 4048 4049 /* 4050 * Rotations and appends need special treatment - they modify 4051 * parts of the tree's above them. 4052 * 4053 * Both might pass back a path immediate to the left of the 4054 * one being inserted to. This will be cause 4055 * ocfs2_insert_path() to modify the rightmost records of 4056 * left_path to account for an edge insert. 4057 * 4058 * XXX: When modifying this code, keep in mind that an insert 4059 * can wind up skipping both of these two special cases... 4060 */ 4061 if (rotate) { 4062 ret = ocfs2_rotate_tree_right(inode, handle, type->ins_split, 4063 le32_to_cpu(insert_rec->e_cpos), 4064 right_path, &left_path); 4065 if (ret) { 4066 mlog_errno(ret); 4067 goto out; 4068 } 4069 4070 /* 4071 * ocfs2_rotate_tree_right() might have extended the 4072 * transaction without re-journaling our tree root. 4073 */ 4074 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 4075 OCFS2_JOURNAL_ACCESS_WRITE); 4076 if (ret) { 4077 mlog_errno(ret); 4078 goto out; 4079 } 4080 } else if (type->ins_appending == APPEND_TAIL 4081 && type->ins_contig != CONTIG_LEFT) { 4082 ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, 4083 right_path, &left_path); 4084 if (ret) { 4085 mlog_errno(ret); 4086 goto out; 4087 } 4088 } 4089 4090 ret = ocfs2_insert_path(inode, handle, left_path, right_path, 4091 insert_rec, type); 4092 if (ret) { 4093 mlog_errno(ret); 4094 goto out; 4095 } 4096 4097 out_update_clusters: 4098 if (type->ins_split == SPLIT_NONE) 4099 ocfs2_et_update_clusters(inode, et, 4100 le16_to_cpu(insert_rec->e_leaf_clusters)); 4101 4102 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 4103 if (ret) 4104 mlog_errno(ret); 4105 4106 out: 4107 ocfs2_free_path(left_path); 4108 ocfs2_free_path(right_path); 4109 4110 return ret; 4111 } 4112 4113 static enum ocfs2_contig_type 4114 ocfs2_figure_merge_contig_type(struct inode *inode, struct ocfs2_path *path, 4115 struct ocfs2_extent_list *el, int index, 4116 struct ocfs2_extent_rec *split_rec) 4117 { 4118 int status; 4119 enum ocfs2_contig_type ret = CONTIG_NONE; 4120 u32 left_cpos, right_cpos; 4121 struct ocfs2_extent_rec *rec = NULL; 4122 struct ocfs2_extent_list *new_el; 4123 struct ocfs2_path *left_path = NULL, *right_path = NULL; 4124 struct buffer_head *bh; 4125 struct ocfs2_extent_block *eb; 4126 4127 if (index > 0) { 4128 rec = &el->l_recs[index - 1]; 4129 } else if (path->p_tree_depth > 0) { 4130 status = ocfs2_find_cpos_for_left_leaf(inode->i_sb, 4131 path, &left_cpos); 4132 if (status) 4133 goto out; 4134 4135 if (left_cpos != 0) { 4136 left_path = ocfs2_new_path_from_path(path); 4137 if (!left_path) 4138 goto out; 4139 4140 status = ocfs2_find_path(inode, left_path, left_cpos); 4141 if (status) 4142 goto out; 4143 4144 new_el = path_leaf_el(left_path); 4145 4146 if (le16_to_cpu(new_el->l_next_free_rec) != 4147 le16_to_cpu(new_el->l_count)) { 4148 bh = path_leaf_bh(left_path); 4149 eb = (struct ocfs2_extent_block *)bh->b_data; 4150 ocfs2_error(inode->i_sb, 4151 "Extent block #%llu has an " 4152 "invalid l_next_free_rec of " 4153 "%d. It should have " 4154 "matched the l_count of %d", 4155 (unsigned long long)le64_to_cpu(eb->h_blkno), 4156 le16_to_cpu(new_el->l_next_free_rec), 4157 le16_to_cpu(new_el->l_count)); 4158 status = -EINVAL; 4159 goto out; 4160 } 4161 rec = &new_el->l_recs[ 4162 le16_to_cpu(new_el->l_next_free_rec) - 1]; 4163 } 4164 } 4165 4166 /* 4167 * We're careful to check for an empty extent record here - 4168 * the merge code will know what to do if it sees one. 4169 */ 4170 if (rec) { 4171 if (index == 1 && ocfs2_is_empty_extent(rec)) { 4172 if (split_rec->e_cpos == el->l_recs[index].e_cpos) 4173 ret = CONTIG_RIGHT; 4174 } else { 4175 ret = ocfs2_extent_contig(inode, rec, split_rec); 4176 } 4177 } 4178 4179 rec = NULL; 4180 if (index < (le16_to_cpu(el->l_next_free_rec) - 1)) 4181 rec = &el->l_recs[index + 1]; 4182 else if (le16_to_cpu(el->l_next_free_rec) == le16_to_cpu(el->l_count) && 4183 path->p_tree_depth > 0) { 4184 status = ocfs2_find_cpos_for_right_leaf(inode->i_sb, 4185 path, &right_cpos); 4186 if (status) 4187 goto out; 4188 4189 if (right_cpos == 0) 4190 goto out; 4191 4192 right_path = ocfs2_new_path_from_path(path); 4193 if (!right_path) 4194 goto out; 4195 4196 status = ocfs2_find_path(inode, right_path, right_cpos); 4197 if (status) 4198 goto out; 4199 4200 new_el = path_leaf_el(right_path); 4201 rec = &new_el->l_recs[0]; 4202 if (ocfs2_is_empty_extent(rec)) { 4203 if (le16_to_cpu(new_el->l_next_free_rec) <= 1) { 4204 bh = path_leaf_bh(right_path); 4205 eb = (struct ocfs2_extent_block *)bh->b_data; 4206 ocfs2_error(inode->i_sb, 4207 "Extent block #%llu has an " 4208 "invalid l_next_free_rec of %d", 4209 (unsigned long long)le64_to_cpu(eb->h_blkno), 4210 le16_to_cpu(new_el->l_next_free_rec)); 4211 status = -EINVAL; 4212 goto out; 4213 } 4214 rec = &new_el->l_recs[1]; 4215 } 4216 } 4217 4218 if (rec) { 4219 enum ocfs2_contig_type contig_type; 4220 4221 contig_type = ocfs2_extent_contig(inode, rec, split_rec); 4222 4223 if (contig_type == CONTIG_LEFT && ret == CONTIG_RIGHT) 4224 ret = CONTIG_LEFTRIGHT; 4225 else if (ret == CONTIG_NONE) 4226 ret = contig_type; 4227 } 4228 4229 out: 4230 if (left_path) 4231 ocfs2_free_path(left_path); 4232 if (right_path) 4233 ocfs2_free_path(right_path); 4234 4235 return ret; 4236 } 4237 4238 static void ocfs2_figure_contig_type(struct inode *inode, 4239 struct ocfs2_insert_type *insert, 4240 struct ocfs2_extent_list *el, 4241 struct ocfs2_extent_rec *insert_rec, 4242 struct ocfs2_extent_tree *et) 4243 { 4244 int i; 4245 enum ocfs2_contig_type contig_type = CONTIG_NONE; 4246 4247 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 4248 4249 for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 4250 contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], 4251 insert_rec); 4252 if (contig_type != CONTIG_NONE) { 4253 insert->ins_contig_index = i; 4254 break; 4255 } 4256 } 4257 insert->ins_contig = contig_type; 4258 4259 if (insert->ins_contig != CONTIG_NONE) { 4260 struct ocfs2_extent_rec *rec = 4261 &el->l_recs[insert->ins_contig_index]; 4262 unsigned int len = le16_to_cpu(rec->e_leaf_clusters) + 4263 le16_to_cpu(insert_rec->e_leaf_clusters); 4264 4265 /* 4266 * Caller might want us to limit the size of extents, don't 4267 * calculate contiguousness if we might exceed that limit. 4268 */ 4269 if (et->et_max_leaf_clusters && 4270 (len > et->et_max_leaf_clusters)) 4271 insert->ins_contig = CONTIG_NONE; 4272 } 4273 } 4274 4275 /* 4276 * This should only be called against the righmost leaf extent list. 4277 * 4278 * ocfs2_figure_appending_type() will figure out whether we'll have to 4279 * insert at the tail of the rightmost leaf. 4280 * 4281 * This should also work against the root extent list for tree's with 0 4282 * depth. If we consider the root extent list to be the rightmost leaf node 4283 * then the logic here makes sense. 4284 */ 4285 static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, 4286 struct ocfs2_extent_list *el, 4287 struct ocfs2_extent_rec *insert_rec) 4288 { 4289 int i; 4290 u32 cpos = le32_to_cpu(insert_rec->e_cpos); 4291 struct ocfs2_extent_rec *rec; 4292 4293 insert->ins_appending = APPEND_NONE; 4294 4295 BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 4296 4297 if (!el->l_next_free_rec) 4298 goto set_tail_append; 4299 4300 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 4301 /* Were all records empty? */ 4302 if (le16_to_cpu(el->l_next_free_rec) == 1) 4303 goto set_tail_append; 4304 } 4305 4306 i = le16_to_cpu(el->l_next_free_rec) - 1; 4307 rec = &el->l_recs[i]; 4308 4309 if (cpos >= 4310 (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) 4311 goto set_tail_append; 4312 4313 return; 4314 4315 set_tail_append: 4316 insert->ins_appending = APPEND_TAIL; 4317 } 4318 4319 /* 4320 * Helper function called at the begining of an insert. 4321 * 4322 * This computes a few things that are commonly used in the process of 4323 * inserting into the btree: 4324 * - Whether the new extent is contiguous with an existing one. 4325 * - The current tree depth. 4326 * - Whether the insert is an appending one. 4327 * - The total # of free records in the tree. 4328 * 4329 * All of the information is stored on the ocfs2_insert_type 4330 * structure. 4331 */ 4332 static int ocfs2_figure_insert_type(struct inode *inode, 4333 struct ocfs2_extent_tree *et, 4334 struct buffer_head **last_eb_bh, 4335 struct ocfs2_extent_rec *insert_rec, 4336 int *free_records, 4337 struct ocfs2_insert_type *insert) 4338 { 4339 int ret; 4340 struct ocfs2_extent_block *eb; 4341 struct ocfs2_extent_list *el; 4342 struct ocfs2_path *path = NULL; 4343 struct buffer_head *bh = NULL; 4344 4345 insert->ins_split = SPLIT_NONE; 4346 4347 el = et->et_root_el; 4348 insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 4349 4350 if (el->l_tree_depth) { 4351 /* 4352 * If we have tree depth, we read in the 4353 * rightmost extent block ahead of time as 4354 * ocfs2_figure_insert_type() and ocfs2_add_branch() 4355 * may want it later. 4356 */ 4357 ret = ocfs2_read_extent_block(inode, 4358 ocfs2_et_get_last_eb_blk(et), 4359 &bh); 4360 if (ret) { 4361 mlog_exit(ret); 4362 goto out; 4363 } 4364 eb = (struct ocfs2_extent_block *) bh->b_data; 4365 el = &eb->h_list; 4366 } 4367 4368 /* 4369 * Unless we have a contiguous insert, we'll need to know if 4370 * there is room left in our allocation tree for another 4371 * extent record. 4372 * 4373 * XXX: This test is simplistic, we can search for empty 4374 * extent records too. 4375 */ 4376 *free_records = le16_to_cpu(el->l_count) - 4377 le16_to_cpu(el->l_next_free_rec); 4378 4379 if (!insert->ins_tree_depth) { 4380 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); 4381 ocfs2_figure_appending_type(insert, el, insert_rec); 4382 return 0; 4383 } 4384 4385 path = ocfs2_new_path_from_et(et); 4386 if (!path) { 4387 ret = -ENOMEM; 4388 mlog_errno(ret); 4389 goto out; 4390 } 4391 4392 /* 4393 * In the case that we're inserting past what the tree 4394 * currently accounts for, ocfs2_find_path() will return for 4395 * us the rightmost tree path. This is accounted for below in 4396 * the appending code. 4397 */ 4398 ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); 4399 if (ret) { 4400 mlog_errno(ret); 4401 goto out; 4402 } 4403 4404 el = path_leaf_el(path); 4405 4406 /* 4407 * Now that we have the path, there's two things we want to determine: 4408 * 1) Contiguousness (also set contig_index if this is so) 4409 * 4410 * 2) Are we doing an append? We can trivially break this up 4411 * into two types of appends: simple record append, or a 4412 * rotate inside the tail leaf. 4413 */ 4414 ocfs2_figure_contig_type(inode, insert, el, insert_rec, et); 4415 4416 /* 4417 * The insert code isn't quite ready to deal with all cases of 4418 * left contiguousness. Specifically, if it's an insert into 4419 * the 1st record in a leaf, it will require the adjustment of 4420 * cluster count on the last record of the path directly to it's 4421 * left. For now, just catch that case and fool the layers 4422 * above us. This works just fine for tree_depth == 0, which 4423 * is why we allow that above. 4424 */ 4425 if (insert->ins_contig == CONTIG_LEFT && 4426 insert->ins_contig_index == 0) 4427 insert->ins_contig = CONTIG_NONE; 4428 4429 /* 4430 * Ok, so we can simply compare against last_eb to figure out 4431 * whether the path doesn't exist. This will only happen in 4432 * the case that we're doing a tail append, so maybe we can 4433 * take advantage of that information somehow. 4434 */ 4435 if (ocfs2_et_get_last_eb_blk(et) == 4436 path_leaf_bh(path)->b_blocknr) { 4437 /* 4438 * Ok, ocfs2_find_path() returned us the rightmost 4439 * tree path. This might be an appending insert. There are 4440 * two cases: 4441 * 1) We're doing a true append at the tail: 4442 * -This might even be off the end of the leaf 4443 * 2) We're "appending" by rotating in the tail 4444 */ 4445 ocfs2_figure_appending_type(insert, el, insert_rec); 4446 } 4447 4448 out: 4449 ocfs2_free_path(path); 4450 4451 if (ret == 0) 4452 *last_eb_bh = bh; 4453 else 4454 brelse(bh); 4455 return ret; 4456 } 4457 4458 /* 4459 * Insert an extent into an inode btree. 4460 * 4461 * The caller needs to update fe->i_clusters 4462 */ 4463 int ocfs2_insert_extent(struct ocfs2_super *osb, 4464 handle_t *handle, 4465 struct inode *inode, 4466 struct ocfs2_extent_tree *et, 4467 u32 cpos, 4468 u64 start_blk, 4469 u32 new_clusters, 4470 u8 flags, 4471 struct ocfs2_alloc_context *meta_ac) 4472 { 4473 int status; 4474 int uninitialized_var(free_records); 4475 struct buffer_head *last_eb_bh = NULL; 4476 struct ocfs2_insert_type insert = {0, }; 4477 struct ocfs2_extent_rec rec; 4478 4479 mlog(0, "add %u clusters at position %u to inode %llu\n", 4480 new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4481 4482 memset(&rec, 0, sizeof(rec)); 4483 rec.e_cpos = cpu_to_le32(cpos); 4484 rec.e_blkno = cpu_to_le64(start_blk); 4485 rec.e_leaf_clusters = cpu_to_le16(new_clusters); 4486 rec.e_flags = flags; 4487 status = ocfs2_et_insert_check(inode, et, &rec); 4488 if (status) { 4489 mlog_errno(status); 4490 goto bail; 4491 } 4492 4493 status = ocfs2_figure_insert_type(inode, et, &last_eb_bh, &rec, 4494 &free_records, &insert); 4495 if (status < 0) { 4496 mlog_errno(status); 4497 goto bail; 4498 } 4499 4500 mlog(0, "Insert.appending: %u, Insert.Contig: %u, " 4501 "Insert.contig_index: %d, Insert.free_records: %d, " 4502 "Insert.tree_depth: %d\n", 4503 insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 4504 free_records, insert.ins_tree_depth); 4505 4506 if (insert.ins_contig == CONTIG_NONE && free_records == 0) { 4507 status = ocfs2_grow_tree(inode, handle, et, 4508 &insert.ins_tree_depth, &last_eb_bh, 4509 meta_ac); 4510 if (status) { 4511 mlog_errno(status); 4512 goto bail; 4513 } 4514 } 4515 4516 /* Finally, we can add clusters. This might rotate the tree for us. */ 4517 status = ocfs2_do_insert_extent(inode, handle, et, &rec, &insert); 4518 if (status < 0) 4519 mlog_errno(status); 4520 else if (et->et_ops == &ocfs2_dinode_et_ops) 4521 ocfs2_extent_map_insert_rec(inode, &rec); 4522 4523 bail: 4524 brelse(last_eb_bh); 4525 4526 mlog_exit(status); 4527 return status; 4528 } 4529 4530 /* 4531 * Allcate and add clusters into the extent b-tree. 4532 * The new clusters(clusters_to_add) will be inserted at logical_offset. 4533 * The extent b-tree's root is specified by et, and 4534 * it is not limited to the file storage. Any extent tree can use this 4535 * function if it implements the proper ocfs2_extent_tree. 4536 */ 4537 int ocfs2_add_clusters_in_btree(struct ocfs2_super *osb, 4538 struct inode *inode, 4539 u32 *logical_offset, 4540 u32 clusters_to_add, 4541 int mark_unwritten, 4542 struct ocfs2_extent_tree *et, 4543 handle_t *handle, 4544 struct ocfs2_alloc_context *data_ac, 4545 struct ocfs2_alloc_context *meta_ac, 4546 enum ocfs2_alloc_restarted *reason_ret) 4547 { 4548 int status = 0; 4549 int free_extents; 4550 enum ocfs2_alloc_restarted reason = RESTART_NONE; 4551 u32 bit_off, num_bits; 4552 u64 block; 4553 u8 flags = 0; 4554 4555 BUG_ON(!clusters_to_add); 4556 4557 if (mark_unwritten) 4558 flags = OCFS2_EXT_UNWRITTEN; 4559 4560 free_extents = ocfs2_num_free_extents(osb, inode, et); 4561 if (free_extents < 0) { 4562 status = free_extents; 4563 mlog_errno(status); 4564 goto leave; 4565 } 4566 4567 /* there are two cases which could cause us to EAGAIN in the 4568 * we-need-more-metadata case: 4569 * 1) we haven't reserved *any* 4570 * 2) we are so fragmented, we've needed to add metadata too 4571 * many times. */ 4572 if (!free_extents && !meta_ac) { 4573 mlog(0, "we haven't reserved any metadata!\n"); 4574 status = -EAGAIN; 4575 reason = RESTART_META; 4576 goto leave; 4577 } else if ((!free_extents) 4578 && (ocfs2_alloc_context_bits_left(meta_ac) 4579 < ocfs2_extend_meta_needed(et->et_root_el))) { 4580 mlog(0, "filesystem is really fragmented...\n"); 4581 status = -EAGAIN; 4582 reason = RESTART_META; 4583 goto leave; 4584 } 4585 4586 status = __ocfs2_claim_clusters(osb, handle, data_ac, 1, 4587 clusters_to_add, &bit_off, &num_bits); 4588 if (status < 0) { 4589 if (status != -ENOSPC) 4590 mlog_errno(status); 4591 goto leave; 4592 } 4593 4594 BUG_ON(num_bits > clusters_to_add); 4595 4596 /* reserve our write early -- insert_extent may update the inode */ 4597 status = ocfs2_journal_access(handle, inode, et->et_root_bh, 4598 OCFS2_JOURNAL_ACCESS_WRITE); 4599 if (status < 0) { 4600 mlog_errno(status); 4601 goto leave; 4602 } 4603 4604 block = ocfs2_clusters_to_blocks(osb->sb, bit_off); 4605 mlog(0, "Allocating %u clusters at block %u for inode %llu\n", 4606 num_bits, bit_off, (unsigned long long)OCFS2_I(inode)->ip_blkno); 4607 status = ocfs2_insert_extent(osb, handle, inode, et, 4608 *logical_offset, block, 4609 num_bits, flags, meta_ac); 4610 if (status < 0) { 4611 mlog_errno(status); 4612 goto leave; 4613 } 4614 4615 status = ocfs2_journal_dirty(handle, et->et_root_bh); 4616 if (status < 0) { 4617 mlog_errno(status); 4618 goto leave; 4619 } 4620 4621 clusters_to_add -= num_bits; 4622 *logical_offset += num_bits; 4623 4624 if (clusters_to_add) { 4625 mlog(0, "need to alloc once more, wanted = %u\n", 4626 clusters_to_add); 4627 status = -EAGAIN; 4628 reason = RESTART_TRANS; 4629 } 4630 4631 leave: 4632 mlog_exit(status); 4633 if (reason_ret) 4634 *reason_ret = reason; 4635 return status; 4636 } 4637 4638 static void ocfs2_make_right_split_rec(struct super_block *sb, 4639 struct ocfs2_extent_rec *split_rec, 4640 u32 cpos, 4641 struct ocfs2_extent_rec *rec) 4642 { 4643 u32 rec_cpos = le32_to_cpu(rec->e_cpos); 4644 u32 rec_range = rec_cpos + le16_to_cpu(rec->e_leaf_clusters); 4645 4646 memset(split_rec, 0, sizeof(struct ocfs2_extent_rec)); 4647 4648 split_rec->e_cpos = cpu_to_le32(cpos); 4649 split_rec->e_leaf_clusters = cpu_to_le16(rec_range - cpos); 4650 4651 split_rec->e_blkno = rec->e_blkno; 4652 le64_add_cpu(&split_rec->e_blkno, 4653 ocfs2_clusters_to_blocks(sb, cpos - rec_cpos)); 4654 4655 split_rec->e_flags = rec->e_flags; 4656 } 4657 4658 static int ocfs2_split_and_insert(struct inode *inode, 4659 handle_t *handle, 4660 struct ocfs2_path *path, 4661 struct ocfs2_extent_tree *et, 4662 struct buffer_head **last_eb_bh, 4663 int split_index, 4664 struct ocfs2_extent_rec *orig_split_rec, 4665 struct ocfs2_alloc_context *meta_ac) 4666 { 4667 int ret = 0, depth; 4668 unsigned int insert_range, rec_range, do_leftright = 0; 4669 struct ocfs2_extent_rec tmprec; 4670 struct ocfs2_extent_list *rightmost_el; 4671 struct ocfs2_extent_rec rec; 4672 struct ocfs2_extent_rec split_rec = *orig_split_rec; 4673 struct ocfs2_insert_type insert; 4674 struct ocfs2_extent_block *eb; 4675 4676 leftright: 4677 /* 4678 * Store a copy of the record on the stack - it might move 4679 * around as the tree is manipulated below. 4680 */ 4681 rec = path_leaf_el(path)->l_recs[split_index]; 4682 4683 rightmost_el = et->et_root_el; 4684 4685 depth = le16_to_cpu(rightmost_el->l_tree_depth); 4686 if (depth) { 4687 BUG_ON(!(*last_eb_bh)); 4688 eb = (struct ocfs2_extent_block *) (*last_eb_bh)->b_data; 4689 rightmost_el = &eb->h_list; 4690 } 4691 4692 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 4693 le16_to_cpu(rightmost_el->l_count)) { 4694 ret = ocfs2_grow_tree(inode, handle, et, 4695 &depth, last_eb_bh, meta_ac); 4696 if (ret) { 4697 mlog_errno(ret); 4698 goto out; 4699 } 4700 } 4701 4702 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 4703 insert.ins_appending = APPEND_NONE; 4704 insert.ins_contig = CONTIG_NONE; 4705 insert.ins_tree_depth = depth; 4706 4707 insert_range = le32_to_cpu(split_rec.e_cpos) + 4708 le16_to_cpu(split_rec.e_leaf_clusters); 4709 rec_range = le32_to_cpu(rec.e_cpos) + 4710 le16_to_cpu(rec.e_leaf_clusters); 4711 4712 if (split_rec.e_cpos == rec.e_cpos) { 4713 insert.ins_split = SPLIT_LEFT; 4714 } else if (insert_range == rec_range) { 4715 insert.ins_split = SPLIT_RIGHT; 4716 } else { 4717 /* 4718 * Left/right split. We fake this as a right split 4719 * first and then make a second pass as a left split. 4720 */ 4721 insert.ins_split = SPLIT_RIGHT; 4722 4723 ocfs2_make_right_split_rec(inode->i_sb, &tmprec, insert_range, 4724 &rec); 4725 4726 split_rec = tmprec; 4727 4728 BUG_ON(do_leftright); 4729 do_leftright = 1; 4730 } 4731 4732 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 4733 if (ret) { 4734 mlog_errno(ret); 4735 goto out; 4736 } 4737 4738 if (do_leftright == 1) { 4739 u32 cpos; 4740 struct ocfs2_extent_list *el; 4741 4742 do_leftright++; 4743 split_rec = *orig_split_rec; 4744 4745 ocfs2_reinit_path(path, 1); 4746 4747 cpos = le32_to_cpu(split_rec.e_cpos); 4748 ret = ocfs2_find_path(inode, path, cpos); 4749 if (ret) { 4750 mlog_errno(ret); 4751 goto out; 4752 } 4753 4754 el = path_leaf_el(path); 4755 split_index = ocfs2_search_extent_list(el, cpos); 4756 goto leftright; 4757 } 4758 out: 4759 4760 return ret; 4761 } 4762 4763 /* 4764 * Mark part or all of the extent record at split_index in the leaf 4765 * pointed to by path as written. This removes the unwritten 4766 * extent flag. 4767 * 4768 * Care is taken to handle contiguousness so as to not grow the tree. 4769 * 4770 * meta_ac is not strictly necessary - we only truly need it if growth 4771 * of the tree is required. All other cases will degrade into a less 4772 * optimal tree layout. 4773 * 4774 * last_eb_bh should be the rightmost leaf block for any extent 4775 * btree. Since a split may grow the tree or a merge might shrink it, 4776 * the caller cannot trust the contents of that buffer after this call. 4777 * 4778 * This code is optimized for readability - several passes might be 4779 * made over certain portions of the tree. All of those blocks will 4780 * have been brought into cache (and pinned via the journal), so the 4781 * extra overhead is not expressed in terms of disk reads. 4782 */ 4783 static int __ocfs2_mark_extent_written(struct inode *inode, 4784 struct ocfs2_extent_tree *et, 4785 handle_t *handle, 4786 struct ocfs2_path *path, 4787 int split_index, 4788 struct ocfs2_extent_rec *split_rec, 4789 struct ocfs2_alloc_context *meta_ac, 4790 struct ocfs2_cached_dealloc_ctxt *dealloc) 4791 { 4792 int ret = 0; 4793 struct ocfs2_extent_list *el = path_leaf_el(path); 4794 struct buffer_head *last_eb_bh = NULL; 4795 struct ocfs2_extent_rec *rec = &el->l_recs[split_index]; 4796 struct ocfs2_merge_ctxt ctxt; 4797 struct ocfs2_extent_list *rightmost_el; 4798 4799 if (!(rec->e_flags & OCFS2_EXT_UNWRITTEN)) { 4800 ret = -EIO; 4801 mlog_errno(ret); 4802 goto out; 4803 } 4804 4805 if (le32_to_cpu(rec->e_cpos) > le32_to_cpu(split_rec->e_cpos) || 4806 ((le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters)) < 4807 (le32_to_cpu(split_rec->e_cpos) + le16_to_cpu(split_rec->e_leaf_clusters)))) { 4808 ret = -EIO; 4809 mlog_errno(ret); 4810 goto out; 4811 } 4812 4813 ctxt.c_contig_type = ocfs2_figure_merge_contig_type(inode, path, el, 4814 split_index, 4815 split_rec); 4816 4817 /* 4818 * The core merge / split code wants to know how much room is 4819 * left in this inodes allocation tree, so we pass the 4820 * rightmost extent list. 4821 */ 4822 if (path->p_tree_depth) { 4823 struct ocfs2_extent_block *eb; 4824 4825 ret = ocfs2_read_extent_block(inode, 4826 ocfs2_et_get_last_eb_blk(et), 4827 &last_eb_bh); 4828 if (ret) { 4829 mlog_exit(ret); 4830 goto out; 4831 } 4832 4833 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4834 rightmost_el = &eb->h_list; 4835 } else 4836 rightmost_el = path_root_el(path); 4837 4838 if (rec->e_cpos == split_rec->e_cpos && 4839 rec->e_leaf_clusters == split_rec->e_leaf_clusters) 4840 ctxt.c_split_covers_rec = 1; 4841 else 4842 ctxt.c_split_covers_rec = 0; 4843 4844 ctxt.c_has_empty_extent = ocfs2_is_empty_extent(&el->l_recs[0]); 4845 4846 mlog(0, "index: %d, contig: %u, has_empty: %u, split_covers: %u\n", 4847 split_index, ctxt.c_contig_type, ctxt.c_has_empty_extent, 4848 ctxt.c_split_covers_rec); 4849 4850 if (ctxt.c_contig_type == CONTIG_NONE) { 4851 if (ctxt.c_split_covers_rec) 4852 el->l_recs[split_index] = *split_rec; 4853 else 4854 ret = ocfs2_split_and_insert(inode, handle, path, et, 4855 &last_eb_bh, split_index, 4856 split_rec, meta_ac); 4857 if (ret) 4858 mlog_errno(ret); 4859 } else { 4860 ret = ocfs2_try_to_merge_extent(inode, handle, path, 4861 split_index, split_rec, 4862 dealloc, &ctxt, et); 4863 if (ret) 4864 mlog_errno(ret); 4865 } 4866 4867 out: 4868 brelse(last_eb_bh); 4869 return ret; 4870 } 4871 4872 /* 4873 * Mark the already-existing extent at cpos as written for len clusters. 4874 * 4875 * If the existing extent is larger than the request, initiate a 4876 * split. An attempt will be made at merging with adjacent extents. 4877 * 4878 * The caller is responsible for passing down meta_ac if we'll need it. 4879 */ 4880 int ocfs2_mark_extent_written(struct inode *inode, 4881 struct ocfs2_extent_tree *et, 4882 handle_t *handle, u32 cpos, u32 len, u32 phys, 4883 struct ocfs2_alloc_context *meta_ac, 4884 struct ocfs2_cached_dealloc_ctxt *dealloc) 4885 { 4886 int ret, index; 4887 u64 start_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys); 4888 struct ocfs2_extent_rec split_rec; 4889 struct ocfs2_path *left_path = NULL; 4890 struct ocfs2_extent_list *el; 4891 4892 mlog(0, "Inode %lu cpos %u, len %u, phys %u (%llu)\n", 4893 inode->i_ino, cpos, len, phys, (unsigned long long)start_blkno); 4894 4895 if (!ocfs2_writes_unwritten_extents(OCFS2_SB(inode->i_sb))) { 4896 ocfs2_error(inode->i_sb, "Inode %llu has unwritten extents " 4897 "that are being written to, but the feature bit " 4898 "is not set in the super block.", 4899 (unsigned long long)OCFS2_I(inode)->ip_blkno); 4900 ret = -EROFS; 4901 goto out; 4902 } 4903 4904 /* 4905 * XXX: This should be fixed up so that we just re-insert the 4906 * next extent records. 4907 * 4908 * XXX: This is a hack on the extent tree, maybe it should be 4909 * an op? 4910 */ 4911 if (et->et_ops == &ocfs2_dinode_et_ops) 4912 ocfs2_extent_map_trunc(inode, 0); 4913 4914 left_path = ocfs2_new_path_from_et(et); 4915 if (!left_path) { 4916 ret = -ENOMEM; 4917 mlog_errno(ret); 4918 goto out; 4919 } 4920 4921 ret = ocfs2_find_path(inode, left_path, cpos); 4922 if (ret) { 4923 mlog_errno(ret); 4924 goto out; 4925 } 4926 el = path_leaf_el(left_path); 4927 4928 index = ocfs2_search_extent_list(el, cpos); 4929 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 4930 ocfs2_error(inode->i_sb, 4931 "Inode %llu has an extent at cpos %u which can no " 4932 "longer be found.\n", 4933 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 4934 ret = -EROFS; 4935 goto out; 4936 } 4937 4938 memset(&split_rec, 0, sizeof(struct ocfs2_extent_rec)); 4939 split_rec.e_cpos = cpu_to_le32(cpos); 4940 split_rec.e_leaf_clusters = cpu_to_le16(len); 4941 split_rec.e_blkno = cpu_to_le64(start_blkno); 4942 split_rec.e_flags = path_leaf_el(left_path)->l_recs[index].e_flags; 4943 split_rec.e_flags &= ~OCFS2_EXT_UNWRITTEN; 4944 4945 ret = __ocfs2_mark_extent_written(inode, et, handle, left_path, 4946 index, &split_rec, meta_ac, 4947 dealloc); 4948 if (ret) 4949 mlog_errno(ret); 4950 4951 out: 4952 ocfs2_free_path(left_path); 4953 return ret; 4954 } 4955 4956 static int ocfs2_split_tree(struct inode *inode, struct ocfs2_extent_tree *et, 4957 handle_t *handle, struct ocfs2_path *path, 4958 int index, u32 new_range, 4959 struct ocfs2_alloc_context *meta_ac) 4960 { 4961 int ret, depth, credits = handle->h_buffer_credits; 4962 struct buffer_head *last_eb_bh = NULL; 4963 struct ocfs2_extent_block *eb; 4964 struct ocfs2_extent_list *rightmost_el, *el; 4965 struct ocfs2_extent_rec split_rec; 4966 struct ocfs2_extent_rec *rec; 4967 struct ocfs2_insert_type insert; 4968 4969 /* 4970 * Setup the record to split before we grow the tree. 4971 */ 4972 el = path_leaf_el(path); 4973 rec = &el->l_recs[index]; 4974 ocfs2_make_right_split_rec(inode->i_sb, &split_rec, new_range, rec); 4975 4976 depth = path->p_tree_depth; 4977 if (depth > 0) { 4978 ret = ocfs2_read_extent_block(inode, 4979 ocfs2_et_get_last_eb_blk(et), 4980 &last_eb_bh); 4981 if (ret < 0) { 4982 mlog_errno(ret); 4983 goto out; 4984 } 4985 4986 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4987 rightmost_el = &eb->h_list; 4988 } else 4989 rightmost_el = path_leaf_el(path); 4990 4991 credits += path->p_tree_depth + 4992 ocfs2_extend_meta_needed(et->et_root_el); 4993 ret = ocfs2_extend_trans(handle, credits); 4994 if (ret) { 4995 mlog_errno(ret); 4996 goto out; 4997 } 4998 4999 if (le16_to_cpu(rightmost_el->l_next_free_rec) == 5000 le16_to_cpu(rightmost_el->l_count)) { 5001 ret = ocfs2_grow_tree(inode, handle, et, &depth, &last_eb_bh, 5002 meta_ac); 5003 if (ret) { 5004 mlog_errno(ret); 5005 goto out; 5006 } 5007 } 5008 5009 memset(&insert, 0, sizeof(struct ocfs2_insert_type)); 5010 insert.ins_appending = APPEND_NONE; 5011 insert.ins_contig = CONTIG_NONE; 5012 insert.ins_split = SPLIT_RIGHT; 5013 insert.ins_tree_depth = depth; 5014 5015 ret = ocfs2_do_insert_extent(inode, handle, et, &split_rec, &insert); 5016 if (ret) 5017 mlog_errno(ret); 5018 5019 out: 5020 brelse(last_eb_bh); 5021 return ret; 5022 } 5023 5024 static int ocfs2_truncate_rec(struct inode *inode, handle_t *handle, 5025 struct ocfs2_path *path, int index, 5026 struct ocfs2_cached_dealloc_ctxt *dealloc, 5027 u32 cpos, u32 len, 5028 struct ocfs2_extent_tree *et) 5029 { 5030 int ret; 5031 u32 left_cpos, rec_range, trunc_range; 5032 int wants_rotate = 0, is_rightmost_tree_rec = 0; 5033 struct super_block *sb = inode->i_sb; 5034 struct ocfs2_path *left_path = NULL; 5035 struct ocfs2_extent_list *el = path_leaf_el(path); 5036 struct ocfs2_extent_rec *rec; 5037 struct ocfs2_extent_block *eb; 5038 5039 if (ocfs2_is_empty_extent(&el->l_recs[0]) && index > 0) { 5040 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 5041 if (ret) { 5042 mlog_errno(ret); 5043 goto out; 5044 } 5045 5046 index--; 5047 } 5048 5049 if (index == (le16_to_cpu(el->l_next_free_rec) - 1) && 5050 path->p_tree_depth) { 5051 /* 5052 * Check whether this is the rightmost tree record. If 5053 * we remove all of this record or part of its right 5054 * edge then an update of the record lengths above it 5055 * will be required. 5056 */ 5057 eb = (struct ocfs2_extent_block *)path_leaf_bh(path)->b_data; 5058 if (eb->h_next_leaf_blk == 0) 5059 is_rightmost_tree_rec = 1; 5060 } 5061 5062 rec = &el->l_recs[index]; 5063 if (index == 0 && path->p_tree_depth && 5064 le32_to_cpu(rec->e_cpos) == cpos) { 5065 /* 5066 * Changing the leftmost offset (via partial or whole 5067 * record truncate) of an interior (or rightmost) path 5068 * means we have to update the subtree that is formed 5069 * by this leaf and the one to it's left. 5070 * 5071 * There are two cases we can skip: 5072 * 1) Path is the leftmost one in our inode tree. 5073 * 2) The leaf is rightmost and will be empty after 5074 * we remove the extent record - the rotate code 5075 * knows how to update the newly formed edge. 5076 */ 5077 5078 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, 5079 &left_cpos); 5080 if (ret) { 5081 mlog_errno(ret); 5082 goto out; 5083 } 5084 5085 if (left_cpos && le16_to_cpu(el->l_next_free_rec) > 1) { 5086 left_path = ocfs2_new_path_from_path(path); 5087 if (!left_path) { 5088 ret = -ENOMEM; 5089 mlog_errno(ret); 5090 goto out; 5091 } 5092 5093 ret = ocfs2_find_path(inode, left_path, left_cpos); 5094 if (ret) { 5095 mlog_errno(ret); 5096 goto out; 5097 } 5098 } 5099 } 5100 5101 ret = ocfs2_extend_rotate_transaction(handle, 0, 5102 handle->h_buffer_credits, 5103 path); 5104 if (ret) { 5105 mlog_errno(ret); 5106 goto out; 5107 } 5108 5109 ret = ocfs2_journal_access_path(inode, handle, path); 5110 if (ret) { 5111 mlog_errno(ret); 5112 goto out; 5113 } 5114 5115 ret = ocfs2_journal_access_path(inode, handle, left_path); 5116 if (ret) { 5117 mlog_errno(ret); 5118 goto out; 5119 } 5120 5121 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 5122 trunc_range = cpos + len; 5123 5124 if (le32_to_cpu(rec->e_cpos) == cpos && rec_range == trunc_range) { 5125 int next_free; 5126 5127 memset(rec, 0, sizeof(*rec)); 5128 ocfs2_cleanup_merge(el, index); 5129 wants_rotate = 1; 5130 5131 next_free = le16_to_cpu(el->l_next_free_rec); 5132 if (is_rightmost_tree_rec && next_free > 1) { 5133 /* 5134 * We skip the edge update if this path will 5135 * be deleted by the rotate code. 5136 */ 5137 rec = &el->l_recs[next_free - 1]; 5138 ocfs2_adjust_rightmost_records(inode, handle, path, 5139 rec); 5140 } 5141 } else if (le32_to_cpu(rec->e_cpos) == cpos) { 5142 /* Remove leftmost portion of the record. */ 5143 le32_add_cpu(&rec->e_cpos, len); 5144 le64_add_cpu(&rec->e_blkno, ocfs2_clusters_to_blocks(sb, len)); 5145 le16_add_cpu(&rec->e_leaf_clusters, -len); 5146 } else if (rec_range == trunc_range) { 5147 /* Remove rightmost portion of the record */ 5148 le16_add_cpu(&rec->e_leaf_clusters, -len); 5149 if (is_rightmost_tree_rec) 5150 ocfs2_adjust_rightmost_records(inode, handle, path, rec); 5151 } else { 5152 /* Caller should have trapped this. */ 5153 mlog(ML_ERROR, "Inode %llu: Invalid record truncate: (%u, %u) " 5154 "(%u, %u)\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, 5155 le32_to_cpu(rec->e_cpos), 5156 le16_to_cpu(rec->e_leaf_clusters), cpos, len); 5157 BUG(); 5158 } 5159 5160 if (left_path) { 5161 int subtree_index; 5162 5163 subtree_index = ocfs2_find_subtree_root(inode, left_path, path); 5164 ocfs2_complete_edge_insert(inode, handle, left_path, path, 5165 subtree_index); 5166 } 5167 5168 ocfs2_journal_dirty(handle, path_leaf_bh(path)); 5169 5170 ret = ocfs2_rotate_tree_left(inode, handle, path, dealloc, et); 5171 if (ret) { 5172 mlog_errno(ret); 5173 goto out; 5174 } 5175 5176 out: 5177 ocfs2_free_path(left_path); 5178 return ret; 5179 } 5180 5181 int ocfs2_remove_extent(struct inode *inode, 5182 struct ocfs2_extent_tree *et, 5183 u32 cpos, u32 len, handle_t *handle, 5184 struct ocfs2_alloc_context *meta_ac, 5185 struct ocfs2_cached_dealloc_ctxt *dealloc) 5186 { 5187 int ret, index; 5188 u32 rec_range, trunc_range; 5189 struct ocfs2_extent_rec *rec; 5190 struct ocfs2_extent_list *el; 5191 struct ocfs2_path *path = NULL; 5192 5193 ocfs2_extent_map_trunc(inode, 0); 5194 5195 path = ocfs2_new_path_from_et(et); 5196 if (!path) { 5197 ret = -ENOMEM; 5198 mlog_errno(ret); 5199 goto out; 5200 } 5201 5202 ret = ocfs2_find_path(inode, path, cpos); 5203 if (ret) { 5204 mlog_errno(ret); 5205 goto out; 5206 } 5207 5208 el = path_leaf_el(path); 5209 index = ocfs2_search_extent_list(el, cpos); 5210 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5211 ocfs2_error(inode->i_sb, 5212 "Inode %llu has an extent at cpos %u which can no " 5213 "longer be found.\n", 5214 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos); 5215 ret = -EROFS; 5216 goto out; 5217 } 5218 5219 /* 5220 * We have 3 cases of extent removal: 5221 * 1) Range covers the entire extent rec 5222 * 2) Range begins or ends on one edge of the extent rec 5223 * 3) Range is in the middle of the extent rec (no shared edges) 5224 * 5225 * For case 1 we remove the extent rec and left rotate to 5226 * fill the hole. 5227 * 5228 * For case 2 we just shrink the existing extent rec, with a 5229 * tree update if the shrinking edge is also the edge of an 5230 * extent block. 5231 * 5232 * For case 3 we do a right split to turn the extent rec into 5233 * something case 2 can handle. 5234 */ 5235 rec = &el->l_recs[index]; 5236 rec_range = le32_to_cpu(rec->e_cpos) + ocfs2_rec_clusters(el, rec); 5237 trunc_range = cpos + len; 5238 5239 BUG_ON(cpos < le32_to_cpu(rec->e_cpos) || trunc_range > rec_range); 5240 5241 mlog(0, "Inode %llu, remove (cpos %u, len %u). Existing index %d " 5242 "(cpos %u, len %u)\n", 5243 (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, len, index, 5244 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec)); 5245 5246 if (le32_to_cpu(rec->e_cpos) == cpos || rec_range == trunc_range) { 5247 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5248 cpos, len, et); 5249 if (ret) { 5250 mlog_errno(ret); 5251 goto out; 5252 } 5253 } else { 5254 ret = ocfs2_split_tree(inode, et, handle, path, index, 5255 trunc_range, meta_ac); 5256 if (ret) { 5257 mlog_errno(ret); 5258 goto out; 5259 } 5260 5261 /* 5262 * The split could have manipulated the tree enough to 5263 * move the record location, so we have to look for it again. 5264 */ 5265 ocfs2_reinit_path(path, 1); 5266 5267 ret = ocfs2_find_path(inode, path, cpos); 5268 if (ret) { 5269 mlog_errno(ret); 5270 goto out; 5271 } 5272 5273 el = path_leaf_el(path); 5274 index = ocfs2_search_extent_list(el, cpos); 5275 if (index == -1 || index >= le16_to_cpu(el->l_next_free_rec)) { 5276 ocfs2_error(inode->i_sb, 5277 "Inode %llu: split at cpos %u lost record.", 5278 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5279 cpos); 5280 ret = -EROFS; 5281 goto out; 5282 } 5283 5284 /* 5285 * Double check our values here. If anything is fishy, 5286 * it's easier to catch it at the top level. 5287 */ 5288 rec = &el->l_recs[index]; 5289 rec_range = le32_to_cpu(rec->e_cpos) + 5290 ocfs2_rec_clusters(el, rec); 5291 if (rec_range != trunc_range) { 5292 ocfs2_error(inode->i_sb, 5293 "Inode %llu: error after split at cpos %u" 5294 "trunc len %u, existing record is (%u,%u)", 5295 (unsigned long long)OCFS2_I(inode)->ip_blkno, 5296 cpos, len, le32_to_cpu(rec->e_cpos), 5297 ocfs2_rec_clusters(el, rec)); 5298 ret = -EROFS; 5299 goto out; 5300 } 5301 5302 ret = ocfs2_truncate_rec(inode, handle, path, index, dealloc, 5303 cpos, len, et); 5304 if (ret) { 5305 mlog_errno(ret); 5306 goto out; 5307 } 5308 } 5309 5310 out: 5311 ocfs2_free_path(path); 5312 return ret; 5313 } 5314 5315 int ocfs2_remove_btree_range(struct inode *inode, 5316 struct ocfs2_extent_tree *et, 5317 u32 cpos, u32 phys_cpos, u32 len, 5318 struct ocfs2_cached_dealloc_ctxt *dealloc) 5319 { 5320 int ret; 5321 u64 phys_blkno = ocfs2_clusters_to_blocks(inode->i_sb, phys_cpos); 5322 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 5323 struct inode *tl_inode = osb->osb_tl_inode; 5324 handle_t *handle; 5325 struct ocfs2_alloc_context *meta_ac = NULL; 5326 5327 ret = ocfs2_lock_allocators(inode, et, 0, 1, NULL, &meta_ac); 5328 if (ret) { 5329 mlog_errno(ret); 5330 return ret; 5331 } 5332 5333 mutex_lock(&tl_inode->i_mutex); 5334 5335 if (ocfs2_truncate_log_needs_flush(osb)) { 5336 ret = __ocfs2_flush_truncate_log(osb); 5337 if (ret < 0) { 5338 mlog_errno(ret); 5339 goto out; 5340 } 5341 } 5342 5343 handle = ocfs2_start_trans(osb, ocfs2_remove_extent_credits(osb->sb)); 5344 if (IS_ERR(handle)) { 5345 ret = PTR_ERR(handle); 5346 mlog_errno(ret); 5347 goto out; 5348 } 5349 5350 ret = ocfs2_journal_access(handle, inode, et->et_root_bh, 5351 OCFS2_JOURNAL_ACCESS_WRITE); 5352 if (ret) { 5353 mlog_errno(ret); 5354 goto out; 5355 } 5356 5357 ret = ocfs2_remove_extent(inode, et, cpos, len, handle, meta_ac, 5358 dealloc); 5359 if (ret) { 5360 mlog_errno(ret); 5361 goto out_commit; 5362 } 5363 5364 ocfs2_et_update_clusters(inode, et, -len); 5365 5366 ret = ocfs2_journal_dirty(handle, et->et_root_bh); 5367 if (ret) { 5368 mlog_errno(ret); 5369 goto out_commit; 5370 } 5371 5372 ret = ocfs2_truncate_log_append(osb, handle, phys_blkno, len); 5373 if (ret) 5374 mlog_errno(ret); 5375 5376 out_commit: 5377 ocfs2_commit_trans(osb, handle); 5378 out: 5379 mutex_unlock(&tl_inode->i_mutex); 5380 5381 if (meta_ac) 5382 ocfs2_free_alloc_context(meta_ac); 5383 5384 return ret; 5385 } 5386 5387 int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 5388 { 5389 struct buffer_head *tl_bh = osb->osb_tl_bh; 5390 struct ocfs2_dinode *di; 5391 struct ocfs2_truncate_log *tl; 5392 5393 di = (struct ocfs2_dinode *) tl_bh->b_data; 5394 tl = &di->id2.i_dealloc; 5395 5396 mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), 5397 "slot %d, invalid truncate log parameters: used = " 5398 "%u, count = %u\n", osb->slot_num, 5399 le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); 5400 return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); 5401 } 5402 5403 static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, 5404 unsigned int new_start) 5405 { 5406 unsigned int tail_index; 5407 unsigned int current_tail; 5408 5409 /* No records, nothing to coalesce */ 5410 if (!le16_to_cpu(tl->tl_used)) 5411 return 0; 5412 5413 tail_index = le16_to_cpu(tl->tl_used) - 1; 5414 current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); 5415 current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); 5416 5417 return current_tail == new_start; 5418 } 5419 5420 int ocfs2_truncate_log_append(struct ocfs2_super *osb, 5421 handle_t *handle, 5422 u64 start_blk, 5423 unsigned int num_clusters) 5424 { 5425 int status, index; 5426 unsigned int start_cluster, tl_count; 5427 struct inode *tl_inode = osb->osb_tl_inode; 5428 struct buffer_head *tl_bh = osb->osb_tl_bh; 5429 struct ocfs2_dinode *di; 5430 struct ocfs2_truncate_log *tl; 5431 5432 mlog_entry("start_blk = %llu, num_clusters = %u\n", 5433 (unsigned long long)start_blk, num_clusters); 5434 5435 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5436 5437 start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 5438 5439 di = (struct ocfs2_dinode *) tl_bh->b_data; 5440 5441 /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated 5442 * by the underlying call to ocfs2_read_inode_block(), so any 5443 * corruption is a code bug */ 5444 BUG_ON(!OCFS2_IS_VALID_DINODE(di)); 5445 5446 tl = &di->id2.i_dealloc; 5447 tl_count = le16_to_cpu(tl->tl_count); 5448 mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || 5449 tl_count == 0, 5450 "Truncate record count on #%llu invalid " 5451 "wanted %u, actual %u\n", 5452 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, 5453 ocfs2_truncate_recs_per_inode(osb->sb), 5454 le16_to_cpu(tl->tl_count)); 5455 5456 /* Caller should have known to flush before calling us. */ 5457 index = le16_to_cpu(tl->tl_used); 5458 if (index >= tl_count) { 5459 status = -ENOSPC; 5460 mlog_errno(status); 5461 goto bail; 5462 } 5463 5464 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5465 OCFS2_JOURNAL_ACCESS_WRITE); 5466 if (status < 0) { 5467 mlog_errno(status); 5468 goto bail; 5469 } 5470 5471 mlog(0, "Log truncate of %u clusters starting at cluster %u to " 5472 "%llu (index = %d)\n", num_clusters, start_cluster, 5473 (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index); 5474 5475 if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { 5476 /* 5477 * Move index back to the record we are coalescing with. 5478 * ocfs2_truncate_log_can_coalesce() guarantees nonzero 5479 */ 5480 index--; 5481 5482 num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); 5483 mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", 5484 index, le32_to_cpu(tl->tl_recs[index].t_start), 5485 num_clusters); 5486 } else { 5487 tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); 5488 tl->tl_used = cpu_to_le16(index + 1); 5489 } 5490 tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 5491 5492 status = ocfs2_journal_dirty(handle, tl_bh); 5493 if (status < 0) { 5494 mlog_errno(status); 5495 goto bail; 5496 } 5497 5498 bail: 5499 mlog_exit(status); 5500 return status; 5501 } 5502 5503 static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, 5504 handle_t *handle, 5505 struct inode *data_alloc_inode, 5506 struct buffer_head *data_alloc_bh) 5507 { 5508 int status = 0; 5509 int i; 5510 unsigned int num_clusters; 5511 u64 start_blk; 5512 struct ocfs2_truncate_rec rec; 5513 struct ocfs2_dinode *di; 5514 struct ocfs2_truncate_log *tl; 5515 struct inode *tl_inode = osb->osb_tl_inode; 5516 struct buffer_head *tl_bh = osb->osb_tl_bh; 5517 5518 mlog_entry_void(); 5519 5520 di = (struct ocfs2_dinode *) tl_bh->b_data; 5521 tl = &di->id2.i_dealloc; 5522 i = le16_to_cpu(tl->tl_used) - 1; 5523 while (i >= 0) { 5524 /* Caller has given us at least enough credits to 5525 * update the truncate log dinode */ 5526 status = ocfs2_journal_access(handle, tl_inode, tl_bh, 5527 OCFS2_JOURNAL_ACCESS_WRITE); 5528 if (status < 0) { 5529 mlog_errno(status); 5530 goto bail; 5531 } 5532 5533 tl->tl_used = cpu_to_le16(i); 5534 5535 status = ocfs2_journal_dirty(handle, tl_bh); 5536 if (status < 0) { 5537 mlog_errno(status); 5538 goto bail; 5539 } 5540 5541 /* TODO: Perhaps we can calculate the bulk of the 5542 * credits up front rather than extending like 5543 * this. */ 5544 status = ocfs2_extend_trans(handle, 5545 OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); 5546 if (status < 0) { 5547 mlog_errno(status); 5548 goto bail; 5549 } 5550 5551 rec = tl->tl_recs[i]; 5552 start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 5553 le32_to_cpu(rec.t_start)); 5554 num_clusters = le32_to_cpu(rec.t_clusters); 5555 5556 /* if start_blk is not set, we ignore the record as 5557 * invalid. */ 5558 if (start_blk) { 5559 mlog(0, "free record %d, start = %u, clusters = %u\n", 5560 i, le32_to_cpu(rec.t_start), num_clusters); 5561 5562 status = ocfs2_free_clusters(handle, data_alloc_inode, 5563 data_alloc_bh, start_blk, 5564 num_clusters); 5565 if (status < 0) { 5566 mlog_errno(status); 5567 goto bail; 5568 } 5569 } 5570 i--; 5571 } 5572 5573 bail: 5574 mlog_exit(status); 5575 return status; 5576 } 5577 5578 /* Expects you to already be holding tl_inode->i_mutex */ 5579 int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 5580 { 5581 int status; 5582 unsigned int num_to_flush; 5583 handle_t *handle; 5584 struct inode *tl_inode = osb->osb_tl_inode; 5585 struct inode *data_alloc_inode = NULL; 5586 struct buffer_head *tl_bh = osb->osb_tl_bh; 5587 struct buffer_head *data_alloc_bh = NULL; 5588 struct ocfs2_dinode *di; 5589 struct ocfs2_truncate_log *tl; 5590 5591 mlog_entry_void(); 5592 5593 BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 5594 5595 di = (struct ocfs2_dinode *) tl_bh->b_data; 5596 5597 /* tl_bh is loaded from ocfs2_truncate_log_init(). It's validated 5598 * by the underlying call to ocfs2_read_inode_block(), so any 5599 * corruption is a code bug */ 5600 BUG_ON(!OCFS2_IS_VALID_DINODE(di)); 5601 5602 tl = &di->id2.i_dealloc; 5603 num_to_flush = le16_to_cpu(tl->tl_used); 5604 mlog(0, "Flush %u records from truncate log #%llu\n", 5605 num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 5606 if (!num_to_flush) { 5607 status = 0; 5608 goto out; 5609 } 5610 5611 data_alloc_inode = ocfs2_get_system_file_inode(osb, 5612 GLOBAL_BITMAP_SYSTEM_INODE, 5613 OCFS2_INVALID_SLOT); 5614 if (!data_alloc_inode) { 5615 status = -EINVAL; 5616 mlog(ML_ERROR, "Could not get bitmap inode!\n"); 5617 goto out; 5618 } 5619 5620 mutex_lock(&data_alloc_inode->i_mutex); 5621 5622 status = ocfs2_inode_lock(data_alloc_inode, &data_alloc_bh, 1); 5623 if (status < 0) { 5624 mlog_errno(status); 5625 goto out_mutex; 5626 } 5627 5628 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 5629 if (IS_ERR(handle)) { 5630 status = PTR_ERR(handle); 5631 mlog_errno(status); 5632 goto out_unlock; 5633 } 5634 5635 status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, 5636 data_alloc_bh); 5637 if (status < 0) 5638 mlog_errno(status); 5639 5640 ocfs2_commit_trans(osb, handle); 5641 5642 out_unlock: 5643 brelse(data_alloc_bh); 5644 ocfs2_inode_unlock(data_alloc_inode, 1); 5645 5646 out_mutex: 5647 mutex_unlock(&data_alloc_inode->i_mutex); 5648 iput(data_alloc_inode); 5649 5650 out: 5651 mlog_exit(status); 5652 return status; 5653 } 5654 5655 int ocfs2_flush_truncate_log(struct ocfs2_super *osb) 5656 { 5657 int status; 5658 struct inode *tl_inode = osb->osb_tl_inode; 5659 5660 mutex_lock(&tl_inode->i_mutex); 5661 status = __ocfs2_flush_truncate_log(osb); 5662 mutex_unlock(&tl_inode->i_mutex); 5663 5664 return status; 5665 } 5666 5667 static void ocfs2_truncate_log_worker(struct work_struct *work) 5668 { 5669 int status; 5670 struct ocfs2_super *osb = 5671 container_of(work, struct ocfs2_super, 5672 osb_truncate_log_wq.work); 5673 5674 mlog_entry_void(); 5675 5676 status = ocfs2_flush_truncate_log(osb); 5677 if (status < 0) 5678 mlog_errno(status); 5679 else 5680 ocfs2_init_inode_steal_slot(osb); 5681 5682 mlog_exit(status); 5683 } 5684 5685 #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) 5686 void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, 5687 int cancel) 5688 { 5689 if (osb->osb_tl_inode) { 5690 /* We want to push off log flushes while truncates are 5691 * still running. */ 5692 if (cancel) 5693 cancel_delayed_work(&osb->osb_truncate_log_wq); 5694 5695 queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 5696 OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 5697 } 5698 } 5699 5700 static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, 5701 int slot_num, 5702 struct inode **tl_inode, 5703 struct buffer_head **tl_bh) 5704 { 5705 int status; 5706 struct inode *inode = NULL; 5707 struct buffer_head *bh = NULL; 5708 5709 inode = ocfs2_get_system_file_inode(osb, 5710 TRUNCATE_LOG_SYSTEM_INODE, 5711 slot_num); 5712 if (!inode) { 5713 status = -EINVAL; 5714 mlog(ML_ERROR, "Could not get load truncate log inode!\n"); 5715 goto bail; 5716 } 5717 5718 status = ocfs2_read_inode_block(inode, &bh); 5719 if (status < 0) { 5720 iput(inode); 5721 mlog_errno(status); 5722 goto bail; 5723 } 5724 5725 *tl_inode = inode; 5726 *tl_bh = bh; 5727 bail: 5728 mlog_exit(status); 5729 return status; 5730 } 5731 5732 /* called during the 1st stage of node recovery. we stamp a clean 5733 * truncate log and pass back a copy for processing later. if the 5734 * truncate log does not require processing, a *tl_copy is set to 5735 * NULL. */ 5736 int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, 5737 int slot_num, 5738 struct ocfs2_dinode **tl_copy) 5739 { 5740 int status; 5741 struct inode *tl_inode = NULL; 5742 struct buffer_head *tl_bh = NULL; 5743 struct ocfs2_dinode *di; 5744 struct ocfs2_truncate_log *tl; 5745 5746 *tl_copy = NULL; 5747 5748 mlog(0, "recover truncate log from slot %d\n", slot_num); 5749 5750 status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); 5751 if (status < 0) { 5752 mlog_errno(status); 5753 goto bail; 5754 } 5755 5756 di = (struct ocfs2_dinode *) tl_bh->b_data; 5757 5758 /* tl_bh is loaded from ocfs2_get_truncate_log_info(). It's 5759 * validated by the underlying call to ocfs2_read_inode_block(), 5760 * so any corruption is a code bug */ 5761 BUG_ON(!OCFS2_IS_VALID_DINODE(di)); 5762 5763 tl = &di->id2.i_dealloc; 5764 if (le16_to_cpu(tl->tl_used)) { 5765 mlog(0, "We'll have %u logs to recover\n", 5766 le16_to_cpu(tl->tl_used)); 5767 5768 *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); 5769 if (!(*tl_copy)) { 5770 status = -ENOMEM; 5771 mlog_errno(status); 5772 goto bail; 5773 } 5774 5775 /* Assuming the write-out below goes well, this copy 5776 * will be passed back to recovery for processing. */ 5777 memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); 5778 5779 /* All we need to do to clear the truncate log is set 5780 * tl_used. */ 5781 tl->tl_used = 0; 5782 5783 status = ocfs2_write_block(osb, tl_bh, tl_inode); 5784 if (status < 0) { 5785 mlog_errno(status); 5786 goto bail; 5787 } 5788 } 5789 5790 bail: 5791 if (tl_inode) 5792 iput(tl_inode); 5793 brelse(tl_bh); 5794 5795 if (status < 0 && (*tl_copy)) { 5796 kfree(*tl_copy); 5797 *tl_copy = NULL; 5798 } 5799 5800 mlog_exit(status); 5801 return status; 5802 } 5803 5804 int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 5805 struct ocfs2_dinode *tl_copy) 5806 { 5807 int status = 0; 5808 int i; 5809 unsigned int clusters, num_recs, start_cluster; 5810 u64 start_blk; 5811 handle_t *handle; 5812 struct inode *tl_inode = osb->osb_tl_inode; 5813 struct ocfs2_truncate_log *tl; 5814 5815 mlog_entry_void(); 5816 5817 if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { 5818 mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); 5819 return -EINVAL; 5820 } 5821 5822 tl = &tl_copy->id2.i_dealloc; 5823 num_recs = le16_to_cpu(tl->tl_used); 5824 mlog(0, "cleanup %u records from %llu\n", num_recs, 5825 (unsigned long long)le64_to_cpu(tl_copy->i_blkno)); 5826 5827 mutex_lock(&tl_inode->i_mutex); 5828 for(i = 0; i < num_recs; i++) { 5829 if (ocfs2_truncate_log_needs_flush(osb)) { 5830 status = __ocfs2_flush_truncate_log(osb); 5831 if (status < 0) { 5832 mlog_errno(status); 5833 goto bail_up; 5834 } 5835 } 5836 5837 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 5838 if (IS_ERR(handle)) { 5839 status = PTR_ERR(handle); 5840 mlog_errno(status); 5841 goto bail_up; 5842 } 5843 5844 clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); 5845 start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); 5846 start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); 5847 5848 status = ocfs2_truncate_log_append(osb, handle, 5849 start_blk, clusters); 5850 ocfs2_commit_trans(osb, handle); 5851 if (status < 0) { 5852 mlog_errno(status); 5853 goto bail_up; 5854 } 5855 } 5856 5857 bail_up: 5858 mutex_unlock(&tl_inode->i_mutex); 5859 5860 mlog_exit(status); 5861 return status; 5862 } 5863 5864 void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) 5865 { 5866 int status; 5867 struct inode *tl_inode = osb->osb_tl_inode; 5868 5869 mlog_entry_void(); 5870 5871 if (tl_inode) { 5872 cancel_delayed_work(&osb->osb_truncate_log_wq); 5873 flush_workqueue(ocfs2_wq); 5874 5875 status = ocfs2_flush_truncate_log(osb); 5876 if (status < 0) 5877 mlog_errno(status); 5878 5879 brelse(osb->osb_tl_bh); 5880 iput(osb->osb_tl_inode); 5881 } 5882 5883 mlog_exit_void(); 5884 } 5885 5886 int ocfs2_truncate_log_init(struct ocfs2_super *osb) 5887 { 5888 int status; 5889 struct inode *tl_inode = NULL; 5890 struct buffer_head *tl_bh = NULL; 5891 5892 mlog_entry_void(); 5893 5894 status = ocfs2_get_truncate_log_info(osb, 5895 osb->slot_num, 5896 &tl_inode, 5897 &tl_bh); 5898 if (status < 0) 5899 mlog_errno(status); 5900 5901 /* ocfs2_truncate_log_shutdown keys on the existence of 5902 * osb->osb_tl_inode so we don't set any of the osb variables 5903 * until we're sure all is well. */ 5904 INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, 5905 ocfs2_truncate_log_worker); 5906 osb->osb_tl_bh = tl_bh; 5907 osb->osb_tl_inode = tl_inode; 5908 5909 mlog_exit(status); 5910 return status; 5911 } 5912 5913 /* 5914 * Delayed de-allocation of suballocator blocks. 5915 * 5916 * Some sets of block de-allocations might involve multiple suballocator inodes. 5917 * 5918 * The locking for this can get extremely complicated, especially when 5919 * the suballocator inodes to delete from aren't known until deep 5920 * within an unrelated codepath. 5921 * 5922 * ocfs2_extent_block structures are a good example of this - an inode 5923 * btree could have been grown by any number of nodes each allocating 5924 * out of their own suballoc inode. 5925 * 5926 * These structures allow the delay of block de-allocation until a 5927 * later time, when locking of multiple cluster inodes won't cause 5928 * deadlock. 5929 */ 5930 5931 /* 5932 * Describe a single bit freed from a suballocator. For the block 5933 * suballocators, it represents one block. For the global cluster 5934 * allocator, it represents some clusters and free_bit indicates 5935 * clusters number. 5936 */ 5937 struct ocfs2_cached_block_free { 5938 struct ocfs2_cached_block_free *free_next; 5939 u64 free_blk; 5940 unsigned int free_bit; 5941 }; 5942 5943 struct ocfs2_per_slot_free_list { 5944 struct ocfs2_per_slot_free_list *f_next_suballocator; 5945 int f_inode_type; 5946 int f_slot; 5947 struct ocfs2_cached_block_free *f_first; 5948 }; 5949 5950 static int ocfs2_free_cached_blocks(struct ocfs2_super *osb, 5951 int sysfile_type, 5952 int slot, 5953 struct ocfs2_cached_block_free *head) 5954 { 5955 int ret; 5956 u64 bg_blkno; 5957 handle_t *handle; 5958 struct inode *inode; 5959 struct buffer_head *di_bh = NULL; 5960 struct ocfs2_cached_block_free *tmp; 5961 5962 inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot); 5963 if (!inode) { 5964 ret = -EINVAL; 5965 mlog_errno(ret); 5966 goto out; 5967 } 5968 5969 mutex_lock(&inode->i_mutex); 5970 5971 ret = ocfs2_inode_lock(inode, &di_bh, 1); 5972 if (ret) { 5973 mlog_errno(ret); 5974 goto out_mutex; 5975 } 5976 5977 handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); 5978 if (IS_ERR(handle)) { 5979 ret = PTR_ERR(handle); 5980 mlog_errno(ret); 5981 goto out_unlock; 5982 } 5983 5984 while (head) { 5985 bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 5986 head->free_bit); 5987 mlog(0, "Free bit: (bit %u, blkno %llu)\n", 5988 head->free_bit, (unsigned long long)head->free_blk); 5989 5990 ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, 5991 head->free_bit, bg_blkno, 1); 5992 if (ret) { 5993 mlog_errno(ret); 5994 goto out_journal; 5995 } 5996 5997 ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); 5998 if (ret) { 5999 mlog_errno(ret); 6000 goto out_journal; 6001 } 6002 6003 tmp = head; 6004 head = head->free_next; 6005 kfree(tmp); 6006 } 6007 6008 out_journal: 6009 ocfs2_commit_trans(osb, handle); 6010 6011 out_unlock: 6012 ocfs2_inode_unlock(inode, 1); 6013 brelse(di_bh); 6014 out_mutex: 6015 mutex_unlock(&inode->i_mutex); 6016 iput(inode); 6017 out: 6018 while(head) { 6019 /* Premature exit may have left some dangling items. */ 6020 tmp = head; 6021 head = head->free_next; 6022 kfree(tmp); 6023 } 6024 6025 return ret; 6026 } 6027 6028 int ocfs2_cache_cluster_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6029 u64 blkno, unsigned int bit) 6030 { 6031 int ret = 0; 6032 struct ocfs2_cached_block_free *item; 6033 6034 item = kmalloc(sizeof(*item), GFP_NOFS); 6035 if (item == NULL) { 6036 ret = -ENOMEM; 6037 mlog_errno(ret); 6038 return ret; 6039 } 6040 6041 mlog(0, "Insert clusters: (bit %u, blk %llu)\n", 6042 bit, (unsigned long long)blkno); 6043 6044 item->free_blk = blkno; 6045 item->free_bit = bit; 6046 item->free_next = ctxt->c_global_allocator; 6047 6048 ctxt->c_global_allocator = item; 6049 return ret; 6050 } 6051 6052 static int ocfs2_free_cached_clusters(struct ocfs2_super *osb, 6053 struct ocfs2_cached_block_free *head) 6054 { 6055 struct ocfs2_cached_block_free *tmp; 6056 struct inode *tl_inode = osb->osb_tl_inode; 6057 handle_t *handle; 6058 int ret = 0; 6059 6060 mutex_lock(&tl_inode->i_mutex); 6061 6062 while (head) { 6063 if (ocfs2_truncate_log_needs_flush(osb)) { 6064 ret = __ocfs2_flush_truncate_log(osb); 6065 if (ret < 0) { 6066 mlog_errno(ret); 6067 break; 6068 } 6069 } 6070 6071 handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 6072 if (IS_ERR(handle)) { 6073 ret = PTR_ERR(handle); 6074 mlog_errno(ret); 6075 break; 6076 } 6077 6078 ret = ocfs2_truncate_log_append(osb, handle, head->free_blk, 6079 head->free_bit); 6080 6081 ocfs2_commit_trans(osb, handle); 6082 tmp = head; 6083 head = head->free_next; 6084 kfree(tmp); 6085 6086 if (ret < 0) { 6087 mlog_errno(ret); 6088 break; 6089 } 6090 } 6091 6092 mutex_unlock(&tl_inode->i_mutex); 6093 6094 while (head) { 6095 /* Premature exit may have left some dangling items. */ 6096 tmp = head; 6097 head = head->free_next; 6098 kfree(tmp); 6099 } 6100 6101 return ret; 6102 } 6103 6104 int ocfs2_run_deallocs(struct ocfs2_super *osb, 6105 struct ocfs2_cached_dealloc_ctxt *ctxt) 6106 { 6107 int ret = 0, ret2; 6108 struct ocfs2_per_slot_free_list *fl; 6109 6110 if (!ctxt) 6111 return 0; 6112 6113 while (ctxt->c_first_suballocator) { 6114 fl = ctxt->c_first_suballocator; 6115 6116 if (fl->f_first) { 6117 mlog(0, "Free items: (type %u, slot %d)\n", 6118 fl->f_inode_type, fl->f_slot); 6119 ret2 = ocfs2_free_cached_blocks(osb, 6120 fl->f_inode_type, 6121 fl->f_slot, 6122 fl->f_first); 6123 if (ret2) 6124 mlog_errno(ret2); 6125 if (!ret) 6126 ret = ret2; 6127 } 6128 6129 ctxt->c_first_suballocator = fl->f_next_suballocator; 6130 kfree(fl); 6131 } 6132 6133 if (ctxt->c_global_allocator) { 6134 ret2 = ocfs2_free_cached_clusters(osb, 6135 ctxt->c_global_allocator); 6136 if (ret2) 6137 mlog_errno(ret2); 6138 if (!ret) 6139 ret = ret2; 6140 6141 ctxt->c_global_allocator = NULL; 6142 } 6143 6144 return ret; 6145 } 6146 6147 static struct ocfs2_per_slot_free_list * 6148 ocfs2_find_per_slot_free_list(int type, 6149 int slot, 6150 struct ocfs2_cached_dealloc_ctxt *ctxt) 6151 { 6152 struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; 6153 6154 while (fl) { 6155 if (fl->f_inode_type == type && fl->f_slot == slot) 6156 return fl; 6157 6158 fl = fl->f_next_suballocator; 6159 } 6160 6161 fl = kmalloc(sizeof(*fl), GFP_NOFS); 6162 if (fl) { 6163 fl->f_inode_type = type; 6164 fl->f_slot = slot; 6165 fl->f_first = NULL; 6166 fl->f_next_suballocator = ctxt->c_first_suballocator; 6167 6168 ctxt->c_first_suballocator = fl; 6169 } 6170 return fl; 6171 } 6172 6173 static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 6174 int type, int slot, u64 blkno, 6175 unsigned int bit) 6176 { 6177 int ret; 6178 struct ocfs2_per_slot_free_list *fl; 6179 struct ocfs2_cached_block_free *item; 6180 6181 fl = ocfs2_find_per_slot_free_list(type, slot, ctxt); 6182 if (fl == NULL) { 6183 ret = -ENOMEM; 6184 mlog_errno(ret); 6185 goto out; 6186 } 6187 6188 item = kmalloc(sizeof(*item), GFP_NOFS); 6189 if (item == NULL) { 6190 ret = -ENOMEM; 6191 mlog_errno(ret); 6192 goto out; 6193 } 6194 6195 mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 6196 type, slot, bit, (unsigned long long)blkno); 6197 6198 item->free_blk = blkno; 6199 item->free_bit = bit; 6200 item->free_next = fl->f_first; 6201 6202 fl->f_first = item; 6203 6204 ret = 0; 6205 out: 6206 return ret; 6207 } 6208 6209 static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 6210 struct ocfs2_extent_block *eb) 6211 { 6212 return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 6213 le16_to_cpu(eb->h_suballoc_slot), 6214 le64_to_cpu(eb->h_blkno), 6215 le16_to_cpu(eb->h_suballoc_bit)); 6216 } 6217 6218 /* This function will figure out whether the currently last extent 6219 * block will be deleted, and if it will, what the new last extent 6220 * block will be so we can update his h_next_leaf_blk field, as well 6221 * as the dinodes i_last_eb_blk */ 6222 static int ocfs2_find_new_last_ext_blk(struct inode *inode, 6223 unsigned int clusters_to_del, 6224 struct ocfs2_path *path, 6225 struct buffer_head **new_last_eb) 6226 { 6227 int next_free, ret = 0; 6228 u32 cpos; 6229 struct ocfs2_extent_rec *rec; 6230 struct ocfs2_extent_block *eb; 6231 struct ocfs2_extent_list *el; 6232 struct buffer_head *bh = NULL; 6233 6234 *new_last_eb = NULL; 6235 6236 /* we have no tree, so of course, no last_eb. */ 6237 if (!path->p_tree_depth) 6238 goto out; 6239 6240 /* trunc to zero special case - this makes tree_depth = 0 6241 * regardless of what it is. */ 6242 if (OCFS2_I(inode)->ip_clusters == clusters_to_del) 6243 goto out; 6244 6245 el = path_leaf_el(path); 6246 BUG_ON(!el->l_next_free_rec); 6247 6248 /* 6249 * Make sure that this extent list will actually be empty 6250 * after we clear away the data. We can shortcut out if 6251 * there's more than one non-empty extent in the 6252 * list. Otherwise, a check of the remaining extent is 6253 * necessary. 6254 */ 6255 next_free = le16_to_cpu(el->l_next_free_rec); 6256 rec = NULL; 6257 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 6258 if (next_free > 2) 6259 goto out; 6260 6261 /* We may have a valid extent in index 1, check it. */ 6262 if (next_free == 2) 6263 rec = &el->l_recs[1]; 6264 6265 /* 6266 * Fall through - no more nonempty extents, so we want 6267 * to delete this leaf. 6268 */ 6269 } else { 6270 if (next_free > 1) 6271 goto out; 6272 6273 rec = &el->l_recs[0]; 6274 } 6275 6276 if (rec) { 6277 /* 6278 * Check it we'll only be trimming off the end of this 6279 * cluster. 6280 */ 6281 if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) 6282 goto out; 6283 } 6284 6285 ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); 6286 if (ret) { 6287 mlog_errno(ret); 6288 goto out; 6289 } 6290 6291 ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); 6292 if (ret) { 6293 mlog_errno(ret); 6294 goto out; 6295 } 6296 6297 eb = (struct ocfs2_extent_block *) bh->b_data; 6298 el = &eb->h_list; 6299 6300 /* ocfs2_find_leaf() gets the eb from ocfs2_read_extent_block(). 6301 * Any corruption is a code bug. */ 6302 BUG_ON(!OCFS2_IS_VALID_EXTENT_BLOCK(eb)); 6303 6304 *new_last_eb = bh; 6305 get_bh(*new_last_eb); 6306 mlog(0, "returning block %llu, (cpos: %u)\n", 6307 (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); 6308 out: 6309 brelse(bh); 6310 6311 return ret; 6312 } 6313 6314 /* 6315 * Trim some clusters off the rightmost edge of a tree. Only called 6316 * during truncate. 6317 * 6318 * The caller needs to: 6319 * - start journaling of each path component. 6320 * - compute and fully set up any new last ext block 6321 */ 6322 static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, 6323 handle_t *handle, struct ocfs2_truncate_context *tc, 6324 u32 clusters_to_del, u64 *delete_start) 6325 { 6326 int ret, i, index = path->p_tree_depth; 6327 u32 new_edge = 0; 6328 u64 deleted_eb = 0; 6329 struct buffer_head *bh; 6330 struct ocfs2_extent_list *el; 6331 struct ocfs2_extent_rec *rec; 6332 6333 *delete_start = 0; 6334 6335 while (index >= 0) { 6336 bh = path->p_node[index].bh; 6337 el = path->p_node[index].el; 6338 6339 mlog(0, "traveling tree (index = %d, block = %llu)\n", 6340 index, (unsigned long long)bh->b_blocknr); 6341 6342 BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 6343 6344 if (index != 6345 (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { 6346 ocfs2_error(inode->i_sb, 6347 "Inode %lu has invalid ext. block %llu", 6348 inode->i_ino, 6349 (unsigned long long)bh->b_blocknr); 6350 ret = -EROFS; 6351 goto out; 6352 } 6353 6354 find_tail_record: 6355 i = le16_to_cpu(el->l_next_free_rec) - 1; 6356 rec = &el->l_recs[i]; 6357 6358 mlog(0, "Extent list before: record %d: (%u, %u, %llu), " 6359 "next = %u\n", i, le32_to_cpu(rec->e_cpos), 6360 ocfs2_rec_clusters(el, rec), 6361 (unsigned long long)le64_to_cpu(rec->e_blkno), 6362 le16_to_cpu(el->l_next_free_rec)); 6363 6364 BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); 6365 6366 if (le16_to_cpu(el->l_tree_depth) == 0) { 6367 /* 6368 * If the leaf block contains a single empty 6369 * extent and no records, we can just remove 6370 * the block. 6371 */ 6372 if (i == 0 && ocfs2_is_empty_extent(rec)) { 6373 memset(rec, 0, 6374 sizeof(struct ocfs2_extent_rec)); 6375 el->l_next_free_rec = cpu_to_le16(0); 6376 6377 goto delete; 6378 } 6379 6380 /* 6381 * Remove any empty extents by shifting things 6382 * left. That should make life much easier on 6383 * the code below. This condition is rare 6384 * enough that we shouldn't see a performance 6385 * hit. 6386 */ 6387 if (ocfs2_is_empty_extent(&el->l_recs[0])) { 6388 le16_add_cpu(&el->l_next_free_rec, -1); 6389 6390 for(i = 0; 6391 i < le16_to_cpu(el->l_next_free_rec); i++) 6392 el->l_recs[i] = el->l_recs[i + 1]; 6393 6394 memset(&el->l_recs[i], 0, 6395 sizeof(struct ocfs2_extent_rec)); 6396 6397 /* 6398 * We've modified our extent list. The 6399 * simplest way to handle this change 6400 * is to being the search from the 6401 * start again. 6402 */ 6403 goto find_tail_record; 6404 } 6405 6406 le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); 6407 6408 /* 6409 * We'll use "new_edge" on our way back up the 6410 * tree to know what our rightmost cpos is. 6411 */ 6412 new_edge = le16_to_cpu(rec->e_leaf_clusters); 6413 new_edge += le32_to_cpu(rec->e_cpos); 6414 6415 /* 6416 * The caller will use this to delete data blocks. 6417 */ 6418 *delete_start = le64_to_cpu(rec->e_blkno) 6419 + ocfs2_clusters_to_blocks(inode->i_sb, 6420 le16_to_cpu(rec->e_leaf_clusters)); 6421 6422 /* 6423 * If it's now empty, remove this record. 6424 */ 6425 if (le16_to_cpu(rec->e_leaf_clusters) == 0) { 6426 memset(rec, 0, 6427 sizeof(struct ocfs2_extent_rec)); 6428 le16_add_cpu(&el->l_next_free_rec, -1); 6429 } 6430 } else { 6431 if (le64_to_cpu(rec->e_blkno) == deleted_eb) { 6432 memset(rec, 0, 6433 sizeof(struct ocfs2_extent_rec)); 6434 le16_add_cpu(&el->l_next_free_rec, -1); 6435 6436 goto delete; 6437 } 6438 6439 /* Can this actually happen? */ 6440 if (le16_to_cpu(el->l_next_free_rec) == 0) 6441 goto delete; 6442 6443 /* 6444 * We never actually deleted any clusters 6445 * because our leaf was empty. There's no 6446 * reason to adjust the rightmost edge then. 6447 */ 6448 if (new_edge == 0) 6449 goto delete; 6450 6451 rec->e_int_clusters = cpu_to_le32(new_edge); 6452 le32_add_cpu(&rec->e_int_clusters, 6453 -le32_to_cpu(rec->e_cpos)); 6454 6455 /* 6456 * A deleted child record should have been 6457 * caught above. 6458 */ 6459 BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); 6460 } 6461 6462 delete: 6463 ret = ocfs2_journal_dirty(handle, bh); 6464 if (ret) { 6465 mlog_errno(ret); 6466 goto out; 6467 } 6468 6469 mlog(0, "extent list container %llu, after: record %d: " 6470 "(%u, %u, %llu), next = %u.\n", 6471 (unsigned long long)bh->b_blocknr, i, 6472 le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), 6473 (unsigned long long)le64_to_cpu(rec->e_blkno), 6474 le16_to_cpu(el->l_next_free_rec)); 6475 6476 /* 6477 * We must be careful to only attempt delete of an 6478 * extent block (and not the root inode block). 6479 */ 6480 if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { 6481 struct ocfs2_extent_block *eb = 6482 (struct ocfs2_extent_block *)bh->b_data; 6483 6484 /* 6485 * Save this for use when processing the 6486 * parent block. 6487 */ 6488 deleted_eb = le64_to_cpu(eb->h_blkno); 6489 6490 mlog(0, "deleting this extent block.\n"); 6491 6492 ocfs2_remove_from_cache(inode, bh); 6493 6494 BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); 6495 BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 6496 BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); 6497 6498 ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb); 6499 /* An error here is not fatal. */ 6500 if (ret < 0) 6501 mlog_errno(ret); 6502 } else { 6503 deleted_eb = 0; 6504 } 6505 6506 index--; 6507 } 6508 6509 ret = 0; 6510 out: 6511 return ret; 6512 } 6513 6514 static int ocfs2_do_truncate(struct ocfs2_super *osb, 6515 unsigned int clusters_to_del, 6516 struct inode *inode, 6517 struct buffer_head *fe_bh, 6518 handle_t *handle, 6519 struct ocfs2_truncate_context *tc, 6520 struct ocfs2_path *path) 6521 { 6522 int status; 6523 struct ocfs2_dinode *fe; 6524 struct ocfs2_extent_block *last_eb = NULL; 6525 struct ocfs2_extent_list *el; 6526 struct buffer_head *last_eb_bh = NULL; 6527 u64 delete_blk = 0; 6528 6529 fe = (struct ocfs2_dinode *) fe_bh->b_data; 6530 6531 status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, 6532 path, &last_eb_bh); 6533 if (status < 0) { 6534 mlog_errno(status); 6535 goto bail; 6536 } 6537 6538 /* 6539 * Each component will be touched, so we might as well journal 6540 * here to avoid having to handle errors later. 6541 */ 6542 status = ocfs2_journal_access_path(inode, handle, path); 6543 if (status < 0) { 6544 mlog_errno(status); 6545 goto bail; 6546 } 6547 6548 if (last_eb_bh) { 6549 status = ocfs2_journal_access(handle, inode, last_eb_bh, 6550 OCFS2_JOURNAL_ACCESS_WRITE); 6551 if (status < 0) { 6552 mlog_errno(status); 6553 goto bail; 6554 } 6555 6556 last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 6557 } 6558 6559 el = &(fe->id2.i_list); 6560 6561 /* 6562 * Lower levels depend on this never happening, but it's best 6563 * to check it up here before changing the tree. 6564 */ 6565 if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { 6566 ocfs2_error(inode->i_sb, 6567 "Inode %lu has an empty extent record, depth %u\n", 6568 inode->i_ino, le16_to_cpu(el->l_tree_depth)); 6569 status = -EROFS; 6570 goto bail; 6571 } 6572 6573 vfs_dq_free_space_nodirty(inode, 6574 ocfs2_clusters_to_bytes(osb->sb, clusters_to_del)); 6575 spin_lock(&OCFS2_I(inode)->ip_lock); 6576 OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 6577 clusters_to_del; 6578 spin_unlock(&OCFS2_I(inode)->ip_lock); 6579 le32_add_cpu(&fe->i_clusters, -clusters_to_del); 6580 inode->i_blocks = ocfs2_inode_sector_count(inode); 6581 6582 status = ocfs2_trim_tree(inode, path, handle, tc, 6583 clusters_to_del, &delete_blk); 6584 if (status) { 6585 mlog_errno(status); 6586 goto bail; 6587 } 6588 6589 if (le32_to_cpu(fe->i_clusters) == 0) { 6590 /* trunc to zero is a special case. */ 6591 el->l_tree_depth = 0; 6592 fe->i_last_eb_blk = 0; 6593 } else if (last_eb) 6594 fe->i_last_eb_blk = last_eb->h_blkno; 6595 6596 status = ocfs2_journal_dirty(handle, fe_bh); 6597 if (status < 0) { 6598 mlog_errno(status); 6599 goto bail; 6600 } 6601 6602 if (last_eb) { 6603 /* If there will be a new last extent block, then by 6604 * definition, there cannot be any leaves to the right of 6605 * him. */ 6606 last_eb->h_next_leaf_blk = 0; 6607 status = ocfs2_journal_dirty(handle, last_eb_bh); 6608 if (status < 0) { 6609 mlog_errno(status); 6610 goto bail; 6611 } 6612 } 6613 6614 if (delete_blk) { 6615 status = ocfs2_truncate_log_append(osb, handle, delete_blk, 6616 clusters_to_del); 6617 if (status < 0) { 6618 mlog_errno(status); 6619 goto bail; 6620 } 6621 } 6622 status = 0; 6623 bail: 6624 6625 mlog_exit(status); 6626 return status; 6627 } 6628 6629 static int ocfs2_zero_func(handle_t *handle, struct buffer_head *bh) 6630 { 6631 set_buffer_uptodate(bh); 6632 mark_buffer_dirty(bh); 6633 return 0; 6634 } 6635 6636 static void ocfs2_map_and_dirty_page(struct inode *inode, handle_t *handle, 6637 unsigned int from, unsigned int to, 6638 struct page *page, int zero, u64 *phys) 6639 { 6640 int ret, partial = 0; 6641 6642 ret = ocfs2_map_page_blocks(page, phys, inode, from, to, 0); 6643 if (ret) 6644 mlog_errno(ret); 6645 6646 if (zero) 6647 zero_user_segment(page, from, to); 6648 6649 /* 6650 * Need to set the buffers we zero'd into uptodate 6651 * here if they aren't - ocfs2_map_page_blocks() 6652 * might've skipped some 6653 */ 6654 ret = walk_page_buffers(handle, page_buffers(page), 6655 from, to, &partial, 6656 ocfs2_zero_func); 6657 if (ret < 0) 6658 mlog_errno(ret); 6659 else if (ocfs2_should_order_data(inode)) { 6660 ret = ocfs2_jbd2_file_inode(handle, inode); 6661 if (ret < 0) 6662 mlog_errno(ret); 6663 } 6664 6665 if (!partial) 6666 SetPageUptodate(page); 6667 6668 flush_dcache_page(page); 6669 } 6670 6671 static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t start, 6672 loff_t end, struct page **pages, 6673 int numpages, u64 phys, handle_t *handle) 6674 { 6675 int i; 6676 struct page *page; 6677 unsigned int from, to = PAGE_CACHE_SIZE; 6678 struct super_block *sb = inode->i_sb; 6679 6680 BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 6681 6682 if (numpages == 0) 6683 goto out; 6684 6685 to = PAGE_CACHE_SIZE; 6686 for(i = 0; i < numpages; i++) { 6687 page = pages[i]; 6688 6689 from = start & (PAGE_CACHE_SIZE - 1); 6690 if ((end >> PAGE_CACHE_SHIFT) == page->index) 6691 to = end & (PAGE_CACHE_SIZE - 1); 6692 6693 BUG_ON(from > PAGE_CACHE_SIZE); 6694 BUG_ON(to > PAGE_CACHE_SIZE); 6695 6696 ocfs2_map_and_dirty_page(inode, handle, from, to, page, 1, 6697 &phys); 6698 6699 start = (page->index + 1) << PAGE_CACHE_SHIFT; 6700 } 6701 out: 6702 if (pages) 6703 ocfs2_unlock_and_free_pages(pages, numpages); 6704 } 6705 6706 static int ocfs2_grab_eof_pages(struct inode *inode, loff_t start, loff_t end, 6707 struct page **pages, int *num) 6708 { 6709 int numpages, ret = 0; 6710 struct super_block *sb = inode->i_sb; 6711 struct address_space *mapping = inode->i_mapping; 6712 unsigned long index; 6713 loff_t last_page_bytes; 6714 6715 BUG_ON(start > end); 6716 6717 BUG_ON(start >> OCFS2_SB(sb)->s_clustersize_bits != 6718 (end - 1) >> OCFS2_SB(sb)->s_clustersize_bits); 6719 6720 numpages = 0; 6721 last_page_bytes = PAGE_ALIGN(end); 6722 index = start >> PAGE_CACHE_SHIFT; 6723 do { 6724 pages[numpages] = grab_cache_page(mapping, index); 6725 if (!pages[numpages]) { 6726 ret = -ENOMEM; 6727 mlog_errno(ret); 6728 goto out; 6729 } 6730 6731 numpages++; 6732 index++; 6733 } while (index < (last_page_bytes >> PAGE_CACHE_SHIFT)); 6734 6735 out: 6736 if (ret != 0) { 6737 if (pages) 6738 ocfs2_unlock_and_free_pages(pages, numpages); 6739 numpages = 0; 6740 } 6741 6742 *num = numpages; 6743 6744 return ret; 6745 } 6746 6747 /* 6748 * Zero the area past i_size but still within an allocated 6749 * cluster. This avoids exposing nonzero data on subsequent file 6750 * extends. 6751 * 6752 * We need to call this before i_size is updated on the inode because 6753 * otherwise block_write_full_page() will skip writeout of pages past 6754 * i_size. The new_i_size parameter is passed for this reason. 6755 */ 6756 int ocfs2_zero_range_for_truncate(struct inode *inode, handle_t *handle, 6757 u64 range_start, u64 range_end) 6758 { 6759 int ret = 0, numpages; 6760 struct page **pages = NULL; 6761 u64 phys; 6762 unsigned int ext_flags; 6763 struct super_block *sb = inode->i_sb; 6764 6765 /* 6766 * File systems which don't support sparse files zero on every 6767 * extend. 6768 */ 6769 if (!ocfs2_sparse_alloc(OCFS2_SB(sb))) 6770 return 0; 6771 6772 pages = kcalloc(ocfs2_pages_per_cluster(sb), 6773 sizeof(struct page *), GFP_NOFS); 6774 if (pages == NULL) { 6775 ret = -ENOMEM; 6776 mlog_errno(ret); 6777 goto out; 6778 } 6779 6780 if (range_start == range_end) 6781 goto out; 6782 6783 ret = ocfs2_extent_map_get_blocks(inode, 6784 range_start >> sb->s_blocksize_bits, 6785 &phys, NULL, &ext_flags); 6786 if (ret) { 6787 mlog_errno(ret); 6788 goto out; 6789 } 6790 6791 /* 6792 * Tail is a hole, or is marked unwritten. In either case, we 6793 * can count on read and write to return/push zero's. 6794 */ 6795 if (phys == 0 || ext_flags & OCFS2_EXT_UNWRITTEN) 6796 goto out; 6797 6798 ret = ocfs2_grab_eof_pages(inode, range_start, range_end, pages, 6799 &numpages); 6800 if (ret) { 6801 mlog_errno(ret); 6802 goto out; 6803 } 6804 6805 ocfs2_zero_cluster_pages(inode, range_start, range_end, pages, 6806 numpages, phys, handle); 6807 6808 /* 6809 * Initiate writeout of the pages we zero'd here. We don't 6810 * wait on them - the truncate_inode_pages() call later will 6811 * do that for us. 6812 */ 6813 ret = do_sync_mapping_range(inode->i_mapping, range_start, 6814 range_end - 1, SYNC_FILE_RANGE_WRITE); 6815 if (ret) 6816 mlog_errno(ret); 6817 6818 out: 6819 if (pages) 6820 kfree(pages); 6821 6822 return ret; 6823 } 6824 6825 static void ocfs2_zero_dinode_id2_with_xattr(struct inode *inode, 6826 struct ocfs2_dinode *di) 6827 { 6828 unsigned int blocksize = 1 << inode->i_sb->s_blocksize_bits; 6829 unsigned int xattrsize = le16_to_cpu(di->i_xattr_inline_size); 6830 6831 if (le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_XATTR_FL) 6832 memset(&di->id2, 0, blocksize - 6833 offsetof(struct ocfs2_dinode, id2) - 6834 xattrsize); 6835 else 6836 memset(&di->id2, 0, blocksize - 6837 offsetof(struct ocfs2_dinode, id2)); 6838 } 6839 6840 void ocfs2_dinode_new_extent_list(struct inode *inode, 6841 struct ocfs2_dinode *di) 6842 { 6843 ocfs2_zero_dinode_id2_with_xattr(inode, di); 6844 di->id2.i_list.l_tree_depth = 0; 6845 di->id2.i_list.l_next_free_rec = 0; 6846 di->id2.i_list.l_count = cpu_to_le16( 6847 ocfs2_extent_recs_per_inode_with_xattr(inode->i_sb, di)); 6848 } 6849 6850 void ocfs2_set_inode_data_inline(struct inode *inode, struct ocfs2_dinode *di) 6851 { 6852 struct ocfs2_inode_info *oi = OCFS2_I(inode); 6853 struct ocfs2_inline_data *idata = &di->id2.i_data; 6854 6855 spin_lock(&oi->ip_lock); 6856 oi->ip_dyn_features |= OCFS2_INLINE_DATA_FL; 6857 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 6858 spin_unlock(&oi->ip_lock); 6859 6860 /* 6861 * We clear the entire i_data structure here so that all 6862 * fields can be properly initialized. 6863 */ 6864 ocfs2_zero_dinode_id2_with_xattr(inode, di); 6865 6866 idata->id_count = cpu_to_le16( 6867 ocfs2_max_inline_data_with_xattr(inode->i_sb, di)); 6868 } 6869 6870 int ocfs2_convert_inline_data_to_extents(struct inode *inode, 6871 struct buffer_head *di_bh) 6872 { 6873 int ret, i, has_data, num_pages = 0; 6874 handle_t *handle; 6875 u64 uninitialized_var(block); 6876 struct ocfs2_inode_info *oi = OCFS2_I(inode); 6877 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 6878 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 6879 struct ocfs2_alloc_context *data_ac = NULL; 6880 struct page **pages = NULL; 6881 loff_t end = osb->s_clustersize; 6882 struct ocfs2_extent_tree et; 6883 int did_quota = 0; 6884 6885 has_data = i_size_read(inode) ? 1 : 0; 6886 6887 if (has_data) { 6888 pages = kcalloc(ocfs2_pages_per_cluster(osb->sb), 6889 sizeof(struct page *), GFP_NOFS); 6890 if (pages == NULL) { 6891 ret = -ENOMEM; 6892 mlog_errno(ret); 6893 goto out; 6894 } 6895 6896 ret = ocfs2_reserve_clusters(osb, 1, &data_ac); 6897 if (ret) { 6898 mlog_errno(ret); 6899 goto out; 6900 } 6901 } 6902 6903 handle = ocfs2_start_trans(osb, 6904 ocfs2_inline_to_extents_credits(osb->sb)); 6905 if (IS_ERR(handle)) { 6906 ret = PTR_ERR(handle); 6907 mlog_errno(ret); 6908 goto out_unlock; 6909 } 6910 6911 ret = ocfs2_journal_access(handle, inode, di_bh, 6912 OCFS2_JOURNAL_ACCESS_WRITE); 6913 if (ret) { 6914 mlog_errno(ret); 6915 goto out_commit; 6916 } 6917 6918 if (has_data) { 6919 u32 bit_off, num; 6920 unsigned int page_end; 6921 u64 phys; 6922 6923 if (vfs_dq_alloc_space_nodirty(inode, 6924 ocfs2_clusters_to_bytes(osb->sb, 1))) { 6925 ret = -EDQUOT; 6926 goto out_commit; 6927 } 6928 did_quota = 1; 6929 6930 ret = ocfs2_claim_clusters(osb, handle, data_ac, 1, &bit_off, 6931 &num); 6932 if (ret) { 6933 mlog_errno(ret); 6934 goto out_commit; 6935 } 6936 6937 /* 6938 * Save two copies, one for insert, and one that can 6939 * be changed by ocfs2_map_and_dirty_page() below. 6940 */ 6941 block = phys = ocfs2_clusters_to_blocks(inode->i_sb, bit_off); 6942 6943 /* 6944 * Non sparse file systems zero on extend, so no need 6945 * to do that now. 6946 */ 6947 if (!ocfs2_sparse_alloc(osb) && 6948 PAGE_CACHE_SIZE < osb->s_clustersize) 6949 end = PAGE_CACHE_SIZE; 6950 6951 ret = ocfs2_grab_eof_pages(inode, 0, end, pages, &num_pages); 6952 if (ret) { 6953 mlog_errno(ret); 6954 goto out_commit; 6955 } 6956 6957 /* 6958 * This should populate the 1st page for us and mark 6959 * it up to date. 6960 */ 6961 ret = ocfs2_read_inline_data(inode, pages[0], di_bh); 6962 if (ret) { 6963 mlog_errno(ret); 6964 goto out_commit; 6965 } 6966 6967 page_end = PAGE_CACHE_SIZE; 6968 if (PAGE_CACHE_SIZE > osb->s_clustersize) 6969 page_end = osb->s_clustersize; 6970 6971 for (i = 0; i < num_pages; i++) 6972 ocfs2_map_and_dirty_page(inode, handle, 0, page_end, 6973 pages[i], i > 0, &phys); 6974 } 6975 6976 spin_lock(&oi->ip_lock); 6977 oi->ip_dyn_features &= ~OCFS2_INLINE_DATA_FL; 6978 di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features); 6979 spin_unlock(&oi->ip_lock); 6980 6981 ocfs2_dinode_new_extent_list(inode, di); 6982 6983 ocfs2_journal_dirty(handle, di_bh); 6984 6985 if (has_data) { 6986 /* 6987 * An error at this point should be extremely rare. If 6988 * this proves to be false, we could always re-build 6989 * the in-inode data from our pages. 6990 */ 6991 ocfs2_init_dinode_extent_tree(&et, inode, di_bh); 6992 ret = ocfs2_insert_extent(osb, handle, inode, &et, 6993 0, block, 1, 0, NULL); 6994 if (ret) { 6995 mlog_errno(ret); 6996 goto out_commit; 6997 } 6998 6999 inode->i_blocks = ocfs2_inode_sector_count(inode); 7000 } 7001 7002 out_commit: 7003 if (ret < 0 && did_quota) 7004 vfs_dq_free_space_nodirty(inode, 7005 ocfs2_clusters_to_bytes(osb->sb, 1)); 7006 7007 ocfs2_commit_trans(osb, handle); 7008 7009 out_unlock: 7010 if (data_ac) 7011 ocfs2_free_alloc_context(data_ac); 7012 7013 out: 7014 if (pages) { 7015 ocfs2_unlock_and_free_pages(pages, num_pages); 7016 kfree(pages); 7017 } 7018 7019 return ret; 7020 } 7021 7022 /* 7023 * It is expected, that by the time you call this function, 7024 * inode->i_size and fe->i_size have been adjusted. 7025 * 7026 * WARNING: This will kfree the truncate context 7027 */ 7028 int ocfs2_commit_truncate(struct ocfs2_super *osb, 7029 struct inode *inode, 7030 struct buffer_head *fe_bh, 7031 struct ocfs2_truncate_context *tc) 7032 { 7033 int status, i, credits, tl_sem = 0; 7034 u32 clusters_to_del, new_highest_cpos, range; 7035 struct ocfs2_extent_list *el; 7036 handle_t *handle = NULL; 7037 struct inode *tl_inode = osb->osb_tl_inode; 7038 struct ocfs2_path *path = NULL; 7039 struct ocfs2_dinode *di = (struct ocfs2_dinode *)fe_bh->b_data; 7040 7041 mlog_entry_void(); 7042 7043 new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 7044 i_size_read(inode)); 7045 7046 path = ocfs2_new_path(fe_bh, &di->id2.i_list); 7047 if (!path) { 7048 status = -ENOMEM; 7049 mlog_errno(status); 7050 goto bail; 7051 } 7052 7053 ocfs2_extent_map_trunc(inode, new_highest_cpos); 7054 7055 start: 7056 /* 7057 * Check that we still have allocation to delete. 7058 */ 7059 if (OCFS2_I(inode)->ip_clusters == 0) { 7060 status = 0; 7061 goto bail; 7062 } 7063 7064 /* 7065 * Truncate always works against the rightmost tree branch. 7066 */ 7067 status = ocfs2_find_path(inode, path, UINT_MAX); 7068 if (status) { 7069 mlog_errno(status); 7070 goto bail; 7071 } 7072 7073 mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", 7074 OCFS2_I(inode)->ip_clusters, path->p_tree_depth); 7075 7076 /* 7077 * By now, el will point to the extent list on the bottom most 7078 * portion of this tree. Only the tail record is considered in 7079 * each pass. 7080 * 7081 * We handle the following cases, in order: 7082 * - empty extent: delete the remaining branch 7083 * - remove the entire record 7084 * - remove a partial record 7085 * - no record needs to be removed (truncate has completed) 7086 */ 7087 el = path_leaf_el(path); 7088 if (le16_to_cpu(el->l_next_free_rec) == 0) { 7089 ocfs2_error(inode->i_sb, 7090 "Inode %llu has empty extent block at %llu\n", 7091 (unsigned long long)OCFS2_I(inode)->ip_blkno, 7092 (unsigned long long)path_leaf_bh(path)->b_blocknr); 7093 status = -EROFS; 7094 goto bail; 7095 } 7096 7097 i = le16_to_cpu(el->l_next_free_rec) - 1; 7098 range = le32_to_cpu(el->l_recs[i].e_cpos) + 7099 ocfs2_rec_clusters(el, &el->l_recs[i]); 7100 if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 7101 clusters_to_del = 0; 7102 } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 7103 clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 7104 } else if (range > new_highest_cpos) { 7105 clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 7106 le32_to_cpu(el->l_recs[i].e_cpos)) - 7107 new_highest_cpos; 7108 } else { 7109 status = 0; 7110 goto bail; 7111 } 7112 7113 mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 7114 clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); 7115 7116 mutex_lock(&tl_inode->i_mutex); 7117 tl_sem = 1; 7118 /* ocfs2_truncate_log_needs_flush guarantees us at least one 7119 * record is free for use. If there isn't any, we flush to get 7120 * an empty truncate log. */ 7121 if (ocfs2_truncate_log_needs_flush(osb)) { 7122 status = __ocfs2_flush_truncate_log(osb); 7123 if (status < 0) { 7124 mlog_errno(status); 7125 goto bail; 7126 } 7127 } 7128 7129 credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 7130 (struct ocfs2_dinode *)fe_bh->b_data, 7131 el); 7132 handle = ocfs2_start_trans(osb, credits); 7133 if (IS_ERR(handle)) { 7134 status = PTR_ERR(handle); 7135 handle = NULL; 7136 mlog_errno(status); 7137 goto bail; 7138 } 7139 7140 status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, 7141 tc, path); 7142 if (status < 0) { 7143 mlog_errno(status); 7144 goto bail; 7145 } 7146 7147 mutex_unlock(&tl_inode->i_mutex); 7148 tl_sem = 0; 7149 7150 ocfs2_commit_trans(osb, handle); 7151 handle = NULL; 7152 7153 ocfs2_reinit_path(path, 1); 7154 7155 /* 7156 * The check above will catch the case where we've truncated 7157 * away all allocation. 7158 */ 7159 goto start; 7160 7161 bail: 7162 7163 ocfs2_schedule_truncate_log_flush(osb, 1); 7164 7165 if (tl_sem) 7166 mutex_unlock(&tl_inode->i_mutex); 7167 7168 if (handle) 7169 ocfs2_commit_trans(osb, handle); 7170 7171 ocfs2_run_deallocs(osb, &tc->tc_dealloc); 7172 7173 ocfs2_free_path(path); 7174 7175 /* This will drop the ext_alloc cluster lock for us */ 7176 ocfs2_free_truncate_context(tc); 7177 7178 mlog_exit(status); 7179 return status; 7180 } 7181 7182 /* 7183 * Expects the inode to already be locked. 7184 */ 7185 int ocfs2_prepare_truncate(struct ocfs2_super *osb, 7186 struct inode *inode, 7187 struct buffer_head *fe_bh, 7188 struct ocfs2_truncate_context **tc) 7189 { 7190 int status; 7191 unsigned int new_i_clusters; 7192 struct ocfs2_dinode *fe; 7193 struct ocfs2_extent_block *eb; 7194 struct buffer_head *last_eb_bh = NULL; 7195 7196 mlog_entry_void(); 7197 7198 *tc = NULL; 7199 7200 new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, 7201 i_size_read(inode)); 7202 fe = (struct ocfs2_dinode *) fe_bh->b_data; 7203 7204 mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" 7205 "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters, 7206 (unsigned long long)le64_to_cpu(fe->i_size)); 7207 7208 *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); 7209 if (!(*tc)) { 7210 status = -ENOMEM; 7211 mlog_errno(status); 7212 goto bail; 7213 } 7214 ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 7215 7216 if (fe->id2.i_list.l_tree_depth) { 7217 status = ocfs2_read_extent_block(inode, 7218 le64_to_cpu(fe->i_last_eb_blk), 7219 &last_eb_bh); 7220 if (status < 0) { 7221 mlog_errno(status); 7222 goto bail; 7223 } 7224 eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 7225 } 7226 7227 (*tc)->tc_last_eb_bh = last_eb_bh; 7228 7229 status = 0; 7230 bail: 7231 if (status < 0) { 7232 if (*tc) 7233 ocfs2_free_truncate_context(*tc); 7234 *tc = NULL; 7235 } 7236 mlog_exit_void(); 7237 return status; 7238 } 7239 7240 /* 7241 * 'start' is inclusive, 'end' is not. 7242 */ 7243 int ocfs2_truncate_inline(struct inode *inode, struct buffer_head *di_bh, 7244 unsigned int start, unsigned int end, int trunc) 7245 { 7246 int ret; 7247 unsigned int numbytes; 7248 handle_t *handle; 7249 struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 7250 struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 7251 struct ocfs2_inline_data *idata = &di->id2.i_data; 7252 7253 if (end > i_size_read(inode)) 7254 end = i_size_read(inode); 7255 7256 BUG_ON(start >= end); 7257 7258 if (!(OCFS2_I(inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) || 7259 !(le16_to_cpu(di->i_dyn_features) & OCFS2_INLINE_DATA_FL) || 7260 !ocfs2_supports_inline_data(osb)) { 7261 ocfs2_error(inode->i_sb, 7262 "Inline data flags for inode %llu don't agree! " 7263 "Disk: 0x%x, Memory: 0x%x, Superblock: 0x%x\n", 7264 (unsigned long long)OCFS2_I(inode)->ip_blkno, 7265 le16_to_cpu(di->i_dyn_features), 7266 OCFS2_I(inode)->ip_dyn_features, 7267 osb->s_feature_incompat); 7268 ret = -EROFS; 7269 goto out; 7270 } 7271 7272 handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS); 7273 if (IS_ERR(handle)) { 7274 ret = PTR_ERR(handle); 7275 mlog_errno(ret); 7276 goto out; 7277 } 7278 7279 ret = ocfs2_journal_access(handle, inode, di_bh, 7280 OCFS2_JOURNAL_ACCESS_WRITE); 7281 if (ret) { 7282 mlog_errno(ret); 7283 goto out_commit; 7284 } 7285 7286 numbytes = end - start; 7287 memset(idata->id_data + start, 0, numbytes); 7288 7289 /* 7290 * No need to worry about the data page here - it's been 7291 * truncated already and inline data doesn't need it for 7292 * pushing zero's to disk, so we'll let readpage pick it up 7293 * later. 7294 */ 7295 if (trunc) { 7296 i_size_write(inode, start); 7297 di->i_size = cpu_to_le64(start); 7298 } 7299 7300 inode->i_blocks = ocfs2_inode_sector_count(inode); 7301 inode->i_ctime = inode->i_mtime = CURRENT_TIME; 7302 7303 di->i_ctime = di->i_mtime = cpu_to_le64(inode->i_ctime.tv_sec); 7304 di->i_ctime_nsec = di->i_mtime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec); 7305 7306 ocfs2_journal_dirty(handle, di_bh); 7307 7308 out_commit: 7309 ocfs2_commit_trans(osb, handle); 7310 7311 out: 7312 return ret; 7313 } 7314 7315 static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 7316 { 7317 /* 7318 * The caller is responsible for completing deallocation 7319 * before freeing the context. 7320 */ 7321 if (tc->tc_dealloc.c_first_suballocator != NULL) 7322 mlog(ML_NOTICE, 7323 "Truncate completion has non-empty dealloc context\n"); 7324 7325 brelse(tc->tc_last_eb_bh); 7326 7327 kfree(tc); 7328 } 7329