1ccd979bdSMark Fasheh /* -*- mode: c; c-basic-offset: 8; -*- 2ccd979bdSMark Fasheh * vim: noexpandtab sw=8 ts=8 sts=0: 3ccd979bdSMark Fasheh * 4ccd979bdSMark Fasheh * alloc.c 5ccd979bdSMark Fasheh * 6ccd979bdSMark Fasheh * Extent allocs and frees 7ccd979bdSMark Fasheh * 8ccd979bdSMark Fasheh * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9ccd979bdSMark Fasheh * 10ccd979bdSMark Fasheh * This program is free software; you can redistribute it and/or 11ccd979bdSMark Fasheh * modify it under the terms of the GNU General Public 12ccd979bdSMark Fasheh * License as published by the Free Software Foundation; either 13ccd979bdSMark Fasheh * version 2 of the License, or (at your option) any later version. 14ccd979bdSMark Fasheh * 15ccd979bdSMark Fasheh * This program is distributed in the hope that it will be useful, 16ccd979bdSMark Fasheh * but WITHOUT ANY WARRANTY; without even the implied warranty of 17ccd979bdSMark Fasheh * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18ccd979bdSMark Fasheh * General Public License for more details. 19ccd979bdSMark Fasheh * 20ccd979bdSMark Fasheh * You should have received a copy of the GNU General Public 21ccd979bdSMark Fasheh * License along with this program; if not, write to the 22ccd979bdSMark Fasheh * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23ccd979bdSMark Fasheh * Boston, MA 021110-1307, USA. 24ccd979bdSMark Fasheh */ 25ccd979bdSMark Fasheh 26ccd979bdSMark Fasheh #include <linux/fs.h> 27ccd979bdSMark Fasheh #include <linux/types.h> 28ccd979bdSMark Fasheh #include <linux/slab.h> 29ccd979bdSMark Fasheh #include <linux/highmem.h> 3060b11392SMark Fasheh #include <linux/swap.h> 31ccd979bdSMark Fasheh 32ccd979bdSMark Fasheh #define MLOG_MASK_PREFIX ML_DISK_ALLOC 33ccd979bdSMark Fasheh #include <cluster/masklog.h> 34ccd979bdSMark Fasheh 35ccd979bdSMark Fasheh #include "ocfs2.h" 36ccd979bdSMark Fasheh 37ccd979bdSMark Fasheh #include "alloc.h" 3860b11392SMark Fasheh #include "aops.h" 39ccd979bdSMark Fasheh #include "dlmglue.h" 40ccd979bdSMark Fasheh #include "extent_map.h" 41ccd979bdSMark Fasheh #include "inode.h" 42ccd979bdSMark Fasheh #include "journal.h" 43ccd979bdSMark Fasheh #include "localalloc.h" 44ccd979bdSMark Fasheh #include "suballoc.h" 45ccd979bdSMark Fasheh #include "sysfile.h" 46ccd979bdSMark Fasheh #include "file.h" 47ccd979bdSMark Fasheh #include "super.h" 48ccd979bdSMark Fasheh #include "uptodate.h" 49ccd979bdSMark Fasheh 50ccd979bdSMark Fasheh #include "buffer_head_io.h" 51ccd979bdSMark Fasheh 52ccd979bdSMark Fasheh static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 53ccd979bdSMark Fasheh 54dcd0538fSMark Fasheh /* 55dcd0538fSMark Fasheh * Structures which describe a path through a btree, and functions to 56dcd0538fSMark Fasheh * manipulate them. 57dcd0538fSMark Fasheh * 58dcd0538fSMark Fasheh * The idea here is to be as generic as possible with the tree 59dcd0538fSMark Fasheh * manipulation code. 60dcd0538fSMark Fasheh */ 61dcd0538fSMark Fasheh struct ocfs2_path_item { 62dcd0538fSMark Fasheh struct buffer_head *bh; 63dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 64dcd0538fSMark Fasheh }; 65dcd0538fSMark Fasheh 66dcd0538fSMark Fasheh #define OCFS2_MAX_PATH_DEPTH 5 67dcd0538fSMark Fasheh 68dcd0538fSMark Fasheh struct ocfs2_path { 69dcd0538fSMark Fasheh int p_tree_depth; 70dcd0538fSMark Fasheh struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; 71dcd0538fSMark Fasheh }; 72dcd0538fSMark Fasheh 73dcd0538fSMark Fasheh #define path_root_bh(_path) ((_path)->p_node[0].bh) 74dcd0538fSMark Fasheh #define path_root_el(_path) ((_path)->p_node[0].el) 75dcd0538fSMark Fasheh #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) 76dcd0538fSMark Fasheh #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) 77dcd0538fSMark Fasheh #define path_num_items(_path) ((_path)->p_tree_depth + 1) 78dcd0538fSMark Fasheh 79dcd0538fSMark Fasheh /* 80dcd0538fSMark Fasheh * Reset the actual path elements so that we can re-use the structure 81dcd0538fSMark Fasheh * to build another path. Generally, this involves freeing the buffer 82dcd0538fSMark Fasheh * heads. 83dcd0538fSMark Fasheh */ 84dcd0538fSMark Fasheh static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) 85dcd0538fSMark Fasheh { 86dcd0538fSMark Fasheh int i, start = 0, depth = 0; 87dcd0538fSMark Fasheh struct ocfs2_path_item *node; 88dcd0538fSMark Fasheh 89dcd0538fSMark Fasheh if (keep_root) 90dcd0538fSMark Fasheh start = 1; 91dcd0538fSMark Fasheh 92dcd0538fSMark Fasheh for(i = start; i < path_num_items(path); i++) { 93dcd0538fSMark Fasheh node = &path->p_node[i]; 94dcd0538fSMark Fasheh 95dcd0538fSMark Fasheh brelse(node->bh); 96dcd0538fSMark Fasheh node->bh = NULL; 97dcd0538fSMark Fasheh node->el = NULL; 98dcd0538fSMark Fasheh } 99dcd0538fSMark Fasheh 100dcd0538fSMark Fasheh /* 101dcd0538fSMark Fasheh * Tree depth may change during truncate, or insert. If we're 102dcd0538fSMark Fasheh * keeping the root extent list, then make sure that our path 103dcd0538fSMark Fasheh * structure reflects the proper depth. 104dcd0538fSMark Fasheh */ 105dcd0538fSMark Fasheh if (keep_root) 106dcd0538fSMark Fasheh depth = le16_to_cpu(path_root_el(path)->l_tree_depth); 107dcd0538fSMark Fasheh 108dcd0538fSMark Fasheh path->p_tree_depth = depth; 109dcd0538fSMark Fasheh } 110dcd0538fSMark Fasheh 111dcd0538fSMark Fasheh static void ocfs2_free_path(struct ocfs2_path *path) 112dcd0538fSMark Fasheh { 113dcd0538fSMark Fasheh if (path) { 114dcd0538fSMark Fasheh ocfs2_reinit_path(path, 0); 115dcd0538fSMark Fasheh kfree(path); 116dcd0538fSMark Fasheh } 117dcd0538fSMark Fasheh } 118dcd0538fSMark Fasheh 119dcd0538fSMark Fasheh /* 120dcd0538fSMark Fasheh * Make the *dest path the same as src and re-initialize src path to 121dcd0538fSMark Fasheh * have a root only. 122dcd0538fSMark Fasheh */ 123dcd0538fSMark Fasheh static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) 124dcd0538fSMark Fasheh { 125dcd0538fSMark Fasheh int i; 126dcd0538fSMark Fasheh 127dcd0538fSMark Fasheh BUG_ON(path_root_bh(dest) != path_root_bh(src)); 128dcd0538fSMark Fasheh 129dcd0538fSMark Fasheh for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 130dcd0538fSMark Fasheh brelse(dest->p_node[i].bh); 131dcd0538fSMark Fasheh 132dcd0538fSMark Fasheh dest->p_node[i].bh = src->p_node[i].bh; 133dcd0538fSMark Fasheh dest->p_node[i].el = src->p_node[i].el; 134dcd0538fSMark Fasheh 135dcd0538fSMark Fasheh src->p_node[i].bh = NULL; 136dcd0538fSMark Fasheh src->p_node[i].el = NULL; 137dcd0538fSMark Fasheh } 138dcd0538fSMark Fasheh } 139dcd0538fSMark Fasheh 140dcd0538fSMark Fasheh /* 141dcd0538fSMark Fasheh * Insert an extent block at given index. 142dcd0538fSMark Fasheh * 143dcd0538fSMark Fasheh * This will not take an additional reference on eb_bh. 144dcd0538fSMark Fasheh */ 145dcd0538fSMark Fasheh static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, 146dcd0538fSMark Fasheh struct buffer_head *eb_bh) 147dcd0538fSMark Fasheh { 148dcd0538fSMark Fasheh struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; 149dcd0538fSMark Fasheh 150dcd0538fSMark Fasheh /* 151dcd0538fSMark Fasheh * Right now, no root bh is an extent block, so this helps 152dcd0538fSMark Fasheh * catch code errors with dinode trees. The assertion can be 153dcd0538fSMark Fasheh * safely removed if we ever need to insert extent block 154dcd0538fSMark Fasheh * structures at the root. 155dcd0538fSMark Fasheh */ 156dcd0538fSMark Fasheh BUG_ON(index == 0); 157dcd0538fSMark Fasheh 158dcd0538fSMark Fasheh path->p_node[index].bh = eb_bh; 159dcd0538fSMark Fasheh path->p_node[index].el = &eb->h_list; 160dcd0538fSMark Fasheh } 161dcd0538fSMark Fasheh 162dcd0538fSMark Fasheh static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, 163dcd0538fSMark Fasheh struct ocfs2_extent_list *root_el) 164dcd0538fSMark Fasheh { 165dcd0538fSMark Fasheh struct ocfs2_path *path; 166dcd0538fSMark Fasheh 167dcd0538fSMark Fasheh BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); 168dcd0538fSMark Fasheh 169dcd0538fSMark Fasheh path = kzalloc(sizeof(*path), GFP_NOFS); 170dcd0538fSMark Fasheh if (path) { 171dcd0538fSMark Fasheh path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); 172dcd0538fSMark Fasheh get_bh(root_bh); 173dcd0538fSMark Fasheh path_root_bh(path) = root_bh; 174dcd0538fSMark Fasheh path_root_el(path) = root_el; 175dcd0538fSMark Fasheh } 176dcd0538fSMark Fasheh 177dcd0538fSMark Fasheh return path; 178dcd0538fSMark Fasheh } 179dcd0538fSMark Fasheh 180dcd0538fSMark Fasheh /* 181dcd0538fSMark Fasheh * Allocate and initialize a new path based on a disk inode tree. 182dcd0538fSMark Fasheh */ 183dcd0538fSMark Fasheh static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) 184dcd0538fSMark Fasheh { 185dcd0538fSMark Fasheh struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 186dcd0538fSMark Fasheh struct ocfs2_extent_list *el = &di->id2.i_list; 187dcd0538fSMark Fasheh 188dcd0538fSMark Fasheh return ocfs2_new_path(di_bh, el); 189dcd0538fSMark Fasheh } 190dcd0538fSMark Fasheh 191dcd0538fSMark Fasheh /* 192dcd0538fSMark Fasheh * Convenience function to journal all components in a path. 193dcd0538fSMark Fasheh */ 194dcd0538fSMark Fasheh static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 195dcd0538fSMark Fasheh struct ocfs2_path *path) 196dcd0538fSMark Fasheh { 197dcd0538fSMark Fasheh int i, ret = 0; 198dcd0538fSMark Fasheh 199dcd0538fSMark Fasheh if (!path) 200dcd0538fSMark Fasheh goto out; 201dcd0538fSMark Fasheh 202dcd0538fSMark Fasheh for(i = 0; i < path_num_items(path); i++) { 203dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, 204dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 205dcd0538fSMark Fasheh if (ret < 0) { 206dcd0538fSMark Fasheh mlog_errno(ret); 207dcd0538fSMark Fasheh goto out; 208dcd0538fSMark Fasheh } 209dcd0538fSMark Fasheh } 210dcd0538fSMark Fasheh 211dcd0538fSMark Fasheh out: 212dcd0538fSMark Fasheh return ret; 213dcd0538fSMark Fasheh } 214dcd0538fSMark Fasheh 215dcd0538fSMark Fasheh enum ocfs2_contig_type { 216dcd0538fSMark Fasheh CONTIG_NONE = 0, 217dcd0538fSMark Fasheh CONTIG_LEFT, 218dcd0538fSMark Fasheh CONTIG_RIGHT 219dcd0538fSMark Fasheh }; 220dcd0538fSMark Fasheh 221e48edee2SMark Fasheh 222e48edee2SMark Fasheh /* 223e48edee2SMark Fasheh * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and 224e48edee2SMark Fasheh * ocfs2_extent_contig only work properly against leaf nodes! 225e48edee2SMark Fasheh */ 226dcd0538fSMark Fasheh static int ocfs2_block_extent_contig(struct super_block *sb, 227ccd979bdSMark Fasheh struct ocfs2_extent_rec *ext, 228ccd979bdSMark Fasheh u64 blkno) 229ccd979bdSMark Fasheh { 230e48edee2SMark Fasheh u64 blk_end = le64_to_cpu(ext->e_blkno); 231e48edee2SMark Fasheh 232e48edee2SMark Fasheh blk_end += ocfs2_clusters_to_blocks(sb, 233e48edee2SMark Fasheh le16_to_cpu(ext->e_leaf_clusters)); 234e48edee2SMark Fasheh 235e48edee2SMark Fasheh return blkno == blk_end; 236ccd979bdSMark Fasheh } 237ccd979bdSMark Fasheh 238dcd0538fSMark Fasheh static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, 239dcd0538fSMark Fasheh struct ocfs2_extent_rec *right) 240dcd0538fSMark Fasheh { 241e48edee2SMark Fasheh u32 left_range; 242e48edee2SMark Fasheh 243e48edee2SMark Fasheh left_range = le32_to_cpu(left->e_cpos) + 244e48edee2SMark Fasheh le16_to_cpu(left->e_leaf_clusters); 245e48edee2SMark Fasheh 246e48edee2SMark Fasheh return (left_range == le32_to_cpu(right->e_cpos)); 247dcd0538fSMark Fasheh } 248dcd0538fSMark Fasheh 249dcd0538fSMark Fasheh static enum ocfs2_contig_type 250dcd0538fSMark Fasheh ocfs2_extent_contig(struct inode *inode, 251dcd0538fSMark Fasheh struct ocfs2_extent_rec *ext, 252dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 253dcd0538fSMark Fasheh { 254dcd0538fSMark Fasheh u64 blkno = le64_to_cpu(insert_rec->e_blkno); 255dcd0538fSMark Fasheh 256dcd0538fSMark Fasheh if (ocfs2_extents_adjacent(ext, insert_rec) && 257dcd0538fSMark Fasheh ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 258dcd0538fSMark Fasheh return CONTIG_RIGHT; 259dcd0538fSMark Fasheh 260dcd0538fSMark Fasheh blkno = le64_to_cpu(ext->e_blkno); 261dcd0538fSMark Fasheh if (ocfs2_extents_adjacent(insert_rec, ext) && 262dcd0538fSMark Fasheh ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) 263dcd0538fSMark Fasheh return CONTIG_LEFT; 264dcd0538fSMark Fasheh 265dcd0538fSMark Fasheh return CONTIG_NONE; 266dcd0538fSMark Fasheh } 267dcd0538fSMark Fasheh 268dcd0538fSMark Fasheh /* 269dcd0538fSMark Fasheh * NOTE: We can have pretty much any combination of contiguousness and 270dcd0538fSMark Fasheh * appending. 271dcd0538fSMark Fasheh * 272dcd0538fSMark Fasheh * The usefulness of APPEND_TAIL is more in that it lets us know that 273dcd0538fSMark Fasheh * we'll have to update the path to that leaf. 274dcd0538fSMark Fasheh */ 275dcd0538fSMark Fasheh enum ocfs2_append_type { 276dcd0538fSMark Fasheh APPEND_NONE = 0, 277dcd0538fSMark Fasheh APPEND_TAIL, 278dcd0538fSMark Fasheh }; 279dcd0538fSMark Fasheh 280dcd0538fSMark Fasheh struct ocfs2_insert_type { 281dcd0538fSMark Fasheh enum ocfs2_append_type ins_appending; 282dcd0538fSMark Fasheh enum ocfs2_contig_type ins_contig; 283dcd0538fSMark Fasheh int ins_contig_index; 284dcd0538fSMark Fasheh int ins_free_records; 285dcd0538fSMark Fasheh int ins_tree_depth; 286dcd0538fSMark Fasheh }; 287dcd0538fSMark Fasheh 288ccd979bdSMark Fasheh /* 289ccd979bdSMark Fasheh * How many free extents have we got before we need more meta data? 290ccd979bdSMark Fasheh */ 291ccd979bdSMark Fasheh int ocfs2_num_free_extents(struct ocfs2_super *osb, 292ccd979bdSMark Fasheh struct inode *inode, 293ccd979bdSMark Fasheh struct ocfs2_dinode *fe) 294ccd979bdSMark Fasheh { 295ccd979bdSMark Fasheh int retval; 296ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 297ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 298ccd979bdSMark Fasheh struct buffer_head *eb_bh = NULL; 299ccd979bdSMark Fasheh 300ccd979bdSMark Fasheh mlog_entry_void(); 301ccd979bdSMark Fasheh 302ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(fe)) { 303ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 304ccd979bdSMark Fasheh retval = -EIO; 305ccd979bdSMark Fasheh goto bail; 306ccd979bdSMark Fasheh } 307ccd979bdSMark Fasheh 308ccd979bdSMark Fasheh if (fe->i_last_eb_blk) { 309ccd979bdSMark Fasheh retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 310ccd979bdSMark Fasheh &eb_bh, OCFS2_BH_CACHED, inode); 311ccd979bdSMark Fasheh if (retval < 0) { 312ccd979bdSMark Fasheh mlog_errno(retval); 313ccd979bdSMark Fasheh goto bail; 314ccd979bdSMark Fasheh } 315ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) eb_bh->b_data; 316ccd979bdSMark Fasheh el = &eb->h_list; 317ccd979bdSMark Fasheh } else 318ccd979bdSMark Fasheh el = &fe->id2.i_list; 319ccd979bdSMark Fasheh 320ccd979bdSMark Fasheh BUG_ON(el->l_tree_depth != 0); 321ccd979bdSMark Fasheh 322ccd979bdSMark Fasheh retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); 323ccd979bdSMark Fasheh bail: 324ccd979bdSMark Fasheh if (eb_bh) 325ccd979bdSMark Fasheh brelse(eb_bh); 326ccd979bdSMark Fasheh 327ccd979bdSMark Fasheh mlog_exit(retval); 328ccd979bdSMark Fasheh return retval; 329ccd979bdSMark Fasheh } 330ccd979bdSMark Fasheh 331ccd979bdSMark Fasheh /* expects array to already be allocated 332ccd979bdSMark Fasheh * 333ccd979bdSMark Fasheh * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and 334ccd979bdSMark Fasheh * l_count for you 335ccd979bdSMark Fasheh */ 336ccd979bdSMark Fasheh static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 3371fabe148SMark Fasheh handle_t *handle, 338ccd979bdSMark Fasheh struct inode *inode, 339ccd979bdSMark Fasheh int wanted, 340ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac, 341ccd979bdSMark Fasheh struct buffer_head *bhs[]) 342ccd979bdSMark Fasheh { 343ccd979bdSMark Fasheh int count, status, i; 344ccd979bdSMark Fasheh u16 suballoc_bit_start; 345ccd979bdSMark Fasheh u32 num_got; 346ccd979bdSMark Fasheh u64 first_blkno; 347ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 348ccd979bdSMark Fasheh 349ccd979bdSMark Fasheh mlog_entry_void(); 350ccd979bdSMark Fasheh 351ccd979bdSMark Fasheh count = 0; 352ccd979bdSMark Fasheh while (count < wanted) { 353ccd979bdSMark Fasheh status = ocfs2_claim_metadata(osb, 354ccd979bdSMark Fasheh handle, 355ccd979bdSMark Fasheh meta_ac, 356ccd979bdSMark Fasheh wanted - count, 357ccd979bdSMark Fasheh &suballoc_bit_start, 358ccd979bdSMark Fasheh &num_got, 359ccd979bdSMark Fasheh &first_blkno); 360ccd979bdSMark Fasheh if (status < 0) { 361ccd979bdSMark Fasheh mlog_errno(status); 362ccd979bdSMark Fasheh goto bail; 363ccd979bdSMark Fasheh } 364ccd979bdSMark Fasheh 365ccd979bdSMark Fasheh for(i = count; i < (num_got + count); i++) { 366ccd979bdSMark Fasheh bhs[i] = sb_getblk(osb->sb, first_blkno); 367ccd979bdSMark Fasheh if (bhs[i] == NULL) { 368ccd979bdSMark Fasheh status = -EIO; 369ccd979bdSMark Fasheh mlog_errno(status); 370ccd979bdSMark Fasheh goto bail; 371ccd979bdSMark Fasheh } 372ccd979bdSMark Fasheh ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 373ccd979bdSMark Fasheh 374ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, bhs[i], 375ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_CREATE); 376ccd979bdSMark Fasheh if (status < 0) { 377ccd979bdSMark Fasheh mlog_errno(status); 378ccd979bdSMark Fasheh goto bail; 379ccd979bdSMark Fasheh } 380ccd979bdSMark Fasheh 381ccd979bdSMark Fasheh memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); 382ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bhs[i]->b_data; 383ccd979bdSMark Fasheh /* Ok, setup the minimal stuff here. */ 384ccd979bdSMark Fasheh strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 385ccd979bdSMark Fasheh eb->h_blkno = cpu_to_le64(first_blkno); 386ccd979bdSMark Fasheh eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 387ccd979bdSMark Fasheh 388ccd979bdSMark Fasheh #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS 389ccd979bdSMark Fasheh /* we always use slot zero's suballocator */ 390ccd979bdSMark Fasheh eb->h_suballoc_slot = 0; 391ccd979bdSMark Fasheh #else 392ccd979bdSMark Fasheh eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 393ccd979bdSMark Fasheh #endif 394ccd979bdSMark Fasheh eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 395ccd979bdSMark Fasheh eb->h_list.l_count = 396ccd979bdSMark Fasheh cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 397ccd979bdSMark Fasheh 398ccd979bdSMark Fasheh suballoc_bit_start++; 399ccd979bdSMark Fasheh first_blkno++; 400ccd979bdSMark Fasheh 401ccd979bdSMark Fasheh /* We'll also be dirtied by the caller, so 402ccd979bdSMark Fasheh * this isn't absolutely necessary. */ 403ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, bhs[i]); 404ccd979bdSMark Fasheh if (status < 0) { 405ccd979bdSMark Fasheh mlog_errno(status); 406ccd979bdSMark Fasheh goto bail; 407ccd979bdSMark Fasheh } 408ccd979bdSMark Fasheh } 409ccd979bdSMark Fasheh 410ccd979bdSMark Fasheh count += num_got; 411ccd979bdSMark Fasheh } 412ccd979bdSMark Fasheh 413ccd979bdSMark Fasheh status = 0; 414ccd979bdSMark Fasheh bail: 415ccd979bdSMark Fasheh if (status < 0) { 416ccd979bdSMark Fasheh for(i = 0; i < wanted; i++) { 417ccd979bdSMark Fasheh if (bhs[i]) 418ccd979bdSMark Fasheh brelse(bhs[i]); 419ccd979bdSMark Fasheh bhs[i] = NULL; 420ccd979bdSMark Fasheh } 421ccd979bdSMark Fasheh } 422ccd979bdSMark Fasheh mlog_exit(status); 423ccd979bdSMark Fasheh return status; 424ccd979bdSMark Fasheh } 425ccd979bdSMark Fasheh 426ccd979bdSMark Fasheh /* 427dcd0538fSMark Fasheh * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). 428dcd0538fSMark Fasheh * 429dcd0538fSMark Fasheh * Returns the sum of the rightmost extent rec logical offset and 430dcd0538fSMark Fasheh * cluster count. 431dcd0538fSMark Fasheh * 432dcd0538fSMark Fasheh * ocfs2_add_branch() uses this to determine what logical cluster 433dcd0538fSMark Fasheh * value should be populated into the leftmost new branch records. 434dcd0538fSMark Fasheh * 435dcd0538fSMark Fasheh * ocfs2_shift_tree_depth() uses this to determine the # clusters 436dcd0538fSMark Fasheh * value for the new topmost tree record. 437dcd0538fSMark Fasheh */ 438dcd0538fSMark Fasheh static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) 439dcd0538fSMark Fasheh { 440dcd0538fSMark Fasheh int i; 441dcd0538fSMark Fasheh 442dcd0538fSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 443dcd0538fSMark Fasheh 444dcd0538fSMark Fasheh return le32_to_cpu(el->l_recs[i].e_cpos) + 445e48edee2SMark Fasheh ocfs2_rec_clusters(el, &el->l_recs[i]); 446dcd0538fSMark Fasheh } 447dcd0538fSMark Fasheh 448dcd0538fSMark Fasheh /* 449ccd979bdSMark Fasheh * Add an entire tree branch to our inode. eb_bh is the extent block 450ccd979bdSMark Fasheh * to start at, if we don't want to start the branch at the dinode 451ccd979bdSMark Fasheh * structure. 452ccd979bdSMark Fasheh * 453ccd979bdSMark Fasheh * last_eb_bh is required as we have to update it's next_leaf pointer 454ccd979bdSMark Fasheh * for the new last extent block. 455ccd979bdSMark Fasheh * 456ccd979bdSMark Fasheh * the new branch will be 'empty' in the sense that every block will 457e48edee2SMark Fasheh * contain a single record with cluster count == 0. 458ccd979bdSMark Fasheh */ 459ccd979bdSMark Fasheh static int ocfs2_add_branch(struct ocfs2_super *osb, 4601fabe148SMark Fasheh handle_t *handle, 461ccd979bdSMark Fasheh struct inode *inode, 462ccd979bdSMark Fasheh struct buffer_head *fe_bh, 463ccd979bdSMark Fasheh struct buffer_head *eb_bh, 464ccd979bdSMark Fasheh struct buffer_head *last_eb_bh, 465ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac) 466ccd979bdSMark Fasheh { 467ccd979bdSMark Fasheh int status, new_blocks, i; 468ccd979bdSMark Fasheh u64 next_blkno, new_last_eb_blk; 469ccd979bdSMark Fasheh struct buffer_head *bh; 470ccd979bdSMark Fasheh struct buffer_head **new_eb_bhs = NULL; 471ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 472ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 473ccd979bdSMark Fasheh struct ocfs2_extent_list *eb_el; 474ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 475dcd0538fSMark Fasheh u32 new_cpos; 476ccd979bdSMark Fasheh 477ccd979bdSMark Fasheh mlog_entry_void(); 478ccd979bdSMark Fasheh 479ccd979bdSMark Fasheh BUG_ON(!last_eb_bh); 480ccd979bdSMark Fasheh 481ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 482ccd979bdSMark Fasheh 483ccd979bdSMark Fasheh if (eb_bh) { 484ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) eb_bh->b_data; 485ccd979bdSMark Fasheh el = &eb->h_list; 486ccd979bdSMark Fasheh } else 487ccd979bdSMark Fasheh el = &fe->id2.i_list; 488ccd979bdSMark Fasheh 489ccd979bdSMark Fasheh /* we never add a branch to a leaf. */ 490ccd979bdSMark Fasheh BUG_ON(!el->l_tree_depth); 491ccd979bdSMark Fasheh 492ccd979bdSMark Fasheh new_blocks = le16_to_cpu(el->l_tree_depth); 493ccd979bdSMark Fasheh 494ccd979bdSMark Fasheh /* allocate the number of new eb blocks we need */ 495ccd979bdSMark Fasheh new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), 496ccd979bdSMark Fasheh GFP_KERNEL); 497ccd979bdSMark Fasheh if (!new_eb_bhs) { 498ccd979bdSMark Fasheh status = -ENOMEM; 499ccd979bdSMark Fasheh mlog_errno(status); 500ccd979bdSMark Fasheh goto bail; 501ccd979bdSMark Fasheh } 502ccd979bdSMark Fasheh 503ccd979bdSMark Fasheh status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, 504ccd979bdSMark Fasheh meta_ac, new_eb_bhs); 505ccd979bdSMark Fasheh if (status < 0) { 506ccd979bdSMark Fasheh mlog_errno(status); 507ccd979bdSMark Fasheh goto bail; 508ccd979bdSMark Fasheh } 509ccd979bdSMark Fasheh 510dcd0538fSMark Fasheh eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; 511dcd0538fSMark Fasheh new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 512dcd0538fSMark Fasheh 513ccd979bdSMark Fasheh /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 514ccd979bdSMark Fasheh * linked with the rest of the tree. 515ccd979bdSMark Fasheh * conversly, new_eb_bhs[0] is the new bottommost leaf. 516ccd979bdSMark Fasheh * 517ccd979bdSMark Fasheh * when we leave the loop, new_last_eb_blk will point to the 518ccd979bdSMark Fasheh * newest leaf, and next_blkno will point to the topmost extent 519ccd979bdSMark Fasheh * block. */ 520ccd979bdSMark Fasheh next_blkno = new_last_eb_blk = 0; 521ccd979bdSMark Fasheh for(i = 0; i < new_blocks; i++) { 522ccd979bdSMark Fasheh bh = new_eb_bhs[i]; 523ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 524ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 525ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 526ccd979bdSMark Fasheh status = -EIO; 527ccd979bdSMark Fasheh goto bail; 528ccd979bdSMark Fasheh } 529ccd979bdSMark Fasheh eb_el = &eb->h_list; 530ccd979bdSMark Fasheh 531ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, bh, 532ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_CREATE); 533ccd979bdSMark Fasheh if (status < 0) { 534ccd979bdSMark Fasheh mlog_errno(status); 535ccd979bdSMark Fasheh goto bail; 536ccd979bdSMark Fasheh } 537ccd979bdSMark Fasheh 538ccd979bdSMark Fasheh eb->h_next_leaf_blk = 0; 539ccd979bdSMark Fasheh eb_el->l_tree_depth = cpu_to_le16(i); 540ccd979bdSMark Fasheh eb_el->l_next_free_rec = cpu_to_le16(1); 541dcd0538fSMark Fasheh /* 542dcd0538fSMark Fasheh * This actually counts as an empty extent as 543dcd0538fSMark Fasheh * c_clusters == 0 544dcd0538fSMark Fasheh */ 545dcd0538fSMark Fasheh eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); 546ccd979bdSMark Fasheh eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); 547e48edee2SMark Fasheh /* 548e48edee2SMark Fasheh * eb_el isn't always an interior node, but even leaf 549e48edee2SMark Fasheh * nodes want a zero'd flags and reserved field so 550e48edee2SMark Fasheh * this gets the whole 32 bits regardless of use. 551e48edee2SMark Fasheh */ 552e48edee2SMark Fasheh eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); 553ccd979bdSMark Fasheh if (!eb_el->l_tree_depth) 554ccd979bdSMark Fasheh new_last_eb_blk = le64_to_cpu(eb->h_blkno); 555ccd979bdSMark Fasheh 556ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, bh); 557ccd979bdSMark Fasheh if (status < 0) { 558ccd979bdSMark Fasheh mlog_errno(status); 559ccd979bdSMark Fasheh goto bail; 560ccd979bdSMark Fasheh } 561ccd979bdSMark Fasheh 562ccd979bdSMark Fasheh next_blkno = le64_to_cpu(eb->h_blkno); 563ccd979bdSMark Fasheh } 564ccd979bdSMark Fasheh 565ccd979bdSMark Fasheh /* This is a bit hairy. We want to update up to three blocks 566ccd979bdSMark Fasheh * here without leaving any of them in an inconsistent state 567ccd979bdSMark Fasheh * in case of error. We don't have to worry about 568ccd979bdSMark Fasheh * journal_dirty erroring as it won't unless we've aborted the 569ccd979bdSMark Fasheh * handle (in which case we would never be here) so reserving 570ccd979bdSMark Fasheh * the write with journal_access is all we need to do. */ 571ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, last_eb_bh, 572ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 573ccd979bdSMark Fasheh if (status < 0) { 574ccd979bdSMark Fasheh mlog_errno(status); 575ccd979bdSMark Fasheh goto bail; 576ccd979bdSMark Fasheh } 577ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, fe_bh, 578ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 579ccd979bdSMark Fasheh if (status < 0) { 580ccd979bdSMark Fasheh mlog_errno(status); 581ccd979bdSMark Fasheh goto bail; 582ccd979bdSMark Fasheh } 583ccd979bdSMark Fasheh if (eb_bh) { 584ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, eb_bh, 585ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 586ccd979bdSMark Fasheh if (status < 0) { 587ccd979bdSMark Fasheh mlog_errno(status); 588ccd979bdSMark Fasheh goto bail; 589ccd979bdSMark Fasheh } 590ccd979bdSMark Fasheh } 591ccd979bdSMark Fasheh 592ccd979bdSMark Fasheh /* Link the new branch into the rest of the tree (el will 593ccd979bdSMark Fasheh * either be on the fe, or the extent block passed in. */ 594ccd979bdSMark Fasheh i = le16_to_cpu(el->l_next_free_rec); 595ccd979bdSMark Fasheh el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 596dcd0538fSMark Fasheh el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 597e48edee2SMark Fasheh el->l_recs[i].e_int_clusters = 0; 598ccd979bdSMark Fasheh le16_add_cpu(&el->l_next_free_rec, 1); 599ccd979bdSMark Fasheh 600ccd979bdSMark Fasheh /* fe needs a new last extent block pointer, as does the 601ccd979bdSMark Fasheh * next_leaf on the previously last-extent-block. */ 602ccd979bdSMark Fasheh fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 603ccd979bdSMark Fasheh 604ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 605ccd979bdSMark Fasheh eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 606ccd979bdSMark Fasheh 607ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, last_eb_bh); 608ccd979bdSMark Fasheh if (status < 0) 609ccd979bdSMark Fasheh mlog_errno(status); 610ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, fe_bh); 611ccd979bdSMark Fasheh if (status < 0) 612ccd979bdSMark Fasheh mlog_errno(status); 613ccd979bdSMark Fasheh if (eb_bh) { 614ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, eb_bh); 615ccd979bdSMark Fasheh if (status < 0) 616ccd979bdSMark Fasheh mlog_errno(status); 617ccd979bdSMark Fasheh } 618ccd979bdSMark Fasheh 619ccd979bdSMark Fasheh status = 0; 620ccd979bdSMark Fasheh bail: 621ccd979bdSMark Fasheh if (new_eb_bhs) { 622ccd979bdSMark Fasheh for (i = 0; i < new_blocks; i++) 623ccd979bdSMark Fasheh if (new_eb_bhs[i]) 624ccd979bdSMark Fasheh brelse(new_eb_bhs[i]); 625ccd979bdSMark Fasheh kfree(new_eb_bhs); 626ccd979bdSMark Fasheh } 627ccd979bdSMark Fasheh 628ccd979bdSMark Fasheh mlog_exit(status); 629ccd979bdSMark Fasheh return status; 630ccd979bdSMark Fasheh } 631ccd979bdSMark Fasheh 632ccd979bdSMark Fasheh /* 633ccd979bdSMark Fasheh * adds another level to the allocation tree. 634ccd979bdSMark Fasheh * returns back the new extent block so you can add a branch to it 635ccd979bdSMark Fasheh * after this call. 636ccd979bdSMark Fasheh */ 637ccd979bdSMark Fasheh static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 6381fabe148SMark Fasheh handle_t *handle, 639ccd979bdSMark Fasheh struct inode *inode, 640ccd979bdSMark Fasheh struct buffer_head *fe_bh, 641ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac, 642ccd979bdSMark Fasheh struct buffer_head **ret_new_eb_bh) 643ccd979bdSMark Fasheh { 644ccd979bdSMark Fasheh int status, i; 645dcd0538fSMark Fasheh u32 new_clusters; 646ccd979bdSMark Fasheh struct buffer_head *new_eb_bh = NULL; 647ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 648ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 649ccd979bdSMark Fasheh struct ocfs2_extent_list *fe_el; 650ccd979bdSMark Fasheh struct ocfs2_extent_list *eb_el; 651ccd979bdSMark Fasheh 652ccd979bdSMark Fasheh mlog_entry_void(); 653ccd979bdSMark Fasheh 654ccd979bdSMark Fasheh status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, 655ccd979bdSMark Fasheh &new_eb_bh); 656ccd979bdSMark Fasheh if (status < 0) { 657ccd979bdSMark Fasheh mlog_errno(status); 658ccd979bdSMark Fasheh goto bail; 659ccd979bdSMark Fasheh } 660ccd979bdSMark Fasheh 661ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; 662ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 663ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 664ccd979bdSMark Fasheh status = -EIO; 665ccd979bdSMark Fasheh goto bail; 666ccd979bdSMark Fasheh } 667ccd979bdSMark Fasheh 668ccd979bdSMark Fasheh eb_el = &eb->h_list; 669ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 670ccd979bdSMark Fasheh fe_el = &fe->id2.i_list; 671ccd979bdSMark Fasheh 672ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, new_eb_bh, 673ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_CREATE); 674ccd979bdSMark Fasheh if (status < 0) { 675ccd979bdSMark Fasheh mlog_errno(status); 676ccd979bdSMark Fasheh goto bail; 677ccd979bdSMark Fasheh } 678ccd979bdSMark Fasheh 679ccd979bdSMark Fasheh /* copy the fe data into the new extent block */ 680ccd979bdSMark Fasheh eb_el->l_tree_depth = fe_el->l_tree_depth; 681ccd979bdSMark Fasheh eb_el->l_next_free_rec = fe_el->l_next_free_rec; 682e48edee2SMark Fasheh for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 683e48edee2SMark Fasheh eb_el->l_recs[i] = fe_el->l_recs[i]; 684ccd979bdSMark Fasheh 685ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, new_eb_bh); 686ccd979bdSMark Fasheh if (status < 0) { 687ccd979bdSMark Fasheh mlog_errno(status); 688ccd979bdSMark Fasheh goto bail; 689ccd979bdSMark Fasheh } 690ccd979bdSMark Fasheh 691ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, fe_bh, 692ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 693ccd979bdSMark Fasheh if (status < 0) { 694ccd979bdSMark Fasheh mlog_errno(status); 695ccd979bdSMark Fasheh goto bail; 696ccd979bdSMark Fasheh } 697ccd979bdSMark Fasheh 698dcd0538fSMark Fasheh new_clusters = ocfs2_sum_rightmost_rec(eb_el); 699dcd0538fSMark Fasheh 700ccd979bdSMark Fasheh /* update fe now */ 701ccd979bdSMark Fasheh le16_add_cpu(&fe_el->l_tree_depth, 1); 702ccd979bdSMark Fasheh fe_el->l_recs[0].e_cpos = 0; 703ccd979bdSMark Fasheh fe_el->l_recs[0].e_blkno = eb->h_blkno; 704e48edee2SMark Fasheh fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); 705e48edee2SMark Fasheh for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 706e48edee2SMark Fasheh memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 707ccd979bdSMark Fasheh fe_el->l_next_free_rec = cpu_to_le16(1); 708ccd979bdSMark Fasheh 709ccd979bdSMark Fasheh /* If this is our 1st tree depth shift, then last_eb_blk 710ccd979bdSMark Fasheh * becomes the allocated extent block */ 711ccd979bdSMark Fasheh if (fe_el->l_tree_depth == cpu_to_le16(1)) 712ccd979bdSMark Fasheh fe->i_last_eb_blk = eb->h_blkno; 713ccd979bdSMark Fasheh 714ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, fe_bh); 715ccd979bdSMark Fasheh if (status < 0) { 716ccd979bdSMark Fasheh mlog_errno(status); 717ccd979bdSMark Fasheh goto bail; 718ccd979bdSMark Fasheh } 719ccd979bdSMark Fasheh 720ccd979bdSMark Fasheh *ret_new_eb_bh = new_eb_bh; 721ccd979bdSMark Fasheh new_eb_bh = NULL; 722ccd979bdSMark Fasheh status = 0; 723ccd979bdSMark Fasheh bail: 724ccd979bdSMark Fasheh if (new_eb_bh) 725ccd979bdSMark Fasheh brelse(new_eb_bh); 726ccd979bdSMark Fasheh 727ccd979bdSMark Fasheh mlog_exit(status); 728ccd979bdSMark Fasheh return status; 729ccd979bdSMark Fasheh } 730ccd979bdSMark Fasheh 731ccd979bdSMark Fasheh /* 732ccd979bdSMark Fasheh * Should only be called when there is no space left in any of the 733ccd979bdSMark Fasheh * leaf nodes. What we want to do is find the lowest tree depth 734ccd979bdSMark Fasheh * non-leaf extent block with room for new records. There are three 735ccd979bdSMark Fasheh * valid results of this search: 736ccd979bdSMark Fasheh * 737ccd979bdSMark Fasheh * 1) a lowest extent block is found, then we pass it back in 738ccd979bdSMark Fasheh * *lowest_eb_bh and return '0' 739ccd979bdSMark Fasheh * 740ccd979bdSMark Fasheh * 2) the search fails to find anything, but the dinode has room. We 741ccd979bdSMark Fasheh * pass NULL back in *lowest_eb_bh, but still return '0' 742ccd979bdSMark Fasheh * 743ccd979bdSMark Fasheh * 3) the search fails to find anything AND the dinode is full, in 744ccd979bdSMark Fasheh * which case we return > 0 745ccd979bdSMark Fasheh * 746ccd979bdSMark Fasheh * return status < 0 indicates an error. 747ccd979bdSMark Fasheh */ 748ccd979bdSMark Fasheh static int ocfs2_find_branch_target(struct ocfs2_super *osb, 749ccd979bdSMark Fasheh struct inode *inode, 750ccd979bdSMark Fasheh struct buffer_head *fe_bh, 751ccd979bdSMark Fasheh struct buffer_head **target_bh) 752ccd979bdSMark Fasheh { 753ccd979bdSMark Fasheh int status = 0, i; 754ccd979bdSMark Fasheh u64 blkno; 755ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 756ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 757ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 758ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 759ccd979bdSMark Fasheh struct buffer_head *lowest_bh = NULL; 760ccd979bdSMark Fasheh 761ccd979bdSMark Fasheh mlog_entry_void(); 762ccd979bdSMark Fasheh 763ccd979bdSMark Fasheh *target_bh = NULL; 764ccd979bdSMark Fasheh 765ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 766ccd979bdSMark Fasheh el = &fe->id2.i_list; 767ccd979bdSMark Fasheh 768ccd979bdSMark Fasheh while(le16_to_cpu(el->l_tree_depth) > 1) { 769ccd979bdSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) { 770b0697053SMark Fasheh ocfs2_error(inode->i_sb, "Dinode %llu has empty " 771ccd979bdSMark Fasheh "extent list (next_free_rec == 0)", 772b0697053SMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno); 773ccd979bdSMark Fasheh status = -EIO; 774ccd979bdSMark Fasheh goto bail; 775ccd979bdSMark Fasheh } 776ccd979bdSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 777ccd979bdSMark Fasheh blkno = le64_to_cpu(el->l_recs[i].e_blkno); 778ccd979bdSMark Fasheh if (!blkno) { 779b0697053SMark Fasheh ocfs2_error(inode->i_sb, "Dinode %llu has extent " 780ccd979bdSMark Fasheh "list where extent # %d has no physical " 781ccd979bdSMark Fasheh "block start", 782b0697053SMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, i); 783ccd979bdSMark Fasheh status = -EIO; 784ccd979bdSMark Fasheh goto bail; 785ccd979bdSMark Fasheh } 786ccd979bdSMark Fasheh 787ccd979bdSMark Fasheh if (bh) { 788ccd979bdSMark Fasheh brelse(bh); 789ccd979bdSMark Fasheh bh = NULL; 790ccd979bdSMark Fasheh } 791ccd979bdSMark Fasheh 792ccd979bdSMark Fasheh status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, 793ccd979bdSMark Fasheh inode); 794ccd979bdSMark Fasheh if (status < 0) { 795ccd979bdSMark Fasheh mlog_errno(status); 796ccd979bdSMark Fasheh goto bail; 797ccd979bdSMark Fasheh } 798ccd979bdSMark Fasheh 799ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 800ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 801ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 802ccd979bdSMark Fasheh status = -EIO; 803ccd979bdSMark Fasheh goto bail; 804ccd979bdSMark Fasheh } 805ccd979bdSMark Fasheh el = &eb->h_list; 806ccd979bdSMark Fasheh 807ccd979bdSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) < 808ccd979bdSMark Fasheh le16_to_cpu(el->l_count)) { 809ccd979bdSMark Fasheh if (lowest_bh) 810ccd979bdSMark Fasheh brelse(lowest_bh); 811ccd979bdSMark Fasheh lowest_bh = bh; 812ccd979bdSMark Fasheh get_bh(lowest_bh); 813ccd979bdSMark Fasheh } 814ccd979bdSMark Fasheh } 815ccd979bdSMark Fasheh 816ccd979bdSMark Fasheh /* If we didn't find one and the fe doesn't have any room, 817ccd979bdSMark Fasheh * then return '1' */ 818ccd979bdSMark Fasheh if (!lowest_bh 819ccd979bdSMark Fasheh && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) 820ccd979bdSMark Fasheh status = 1; 821ccd979bdSMark Fasheh 822ccd979bdSMark Fasheh *target_bh = lowest_bh; 823ccd979bdSMark Fasheh bail: 824ccd979bdSMark Fasheh if (bh) 825ccd979bdSMark Fasheh brelse(bh); 826ccd979bdSMark Fasheh 827ccd979bdSMark Fasheh mlog_exit(status); 828ccd979bdSMark Fasheh return status; 829ccd979bdSMark Fasheh } 830ccd979bdSMark Fasheh 831e48edee2SMark Fasheh /* 832e48edee2SMark Fasheh * This is only valid for leaf nodes, which are the only ones that can 833e48edee2SMark Fasheh * have empty extents anyway. 834e48edee2SMark Fasheh */ 835dcd0538fSMark Fasheh static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) 836dcd0538fSMark Fasheh { 837e48edee2SMark Fasheh return !rec->e_leaf_clusters; 838dcd0538fSMark Fasheh } 839dcd0538fSMark Fasheh 840dcd0538fSMark Fasheh /* 841dcd0538fSMark Fasheh * This function will discard the rightmost extent record. 842dcd0538fSMark Fasheh */ 843dcd0538fSMark Fasheh static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) 844dcd0538fSMark Fasheh { 845dcd0538fSMark Fasheh int next_free = le16_to_cpu(el->l_next_free_rec); 846dcd0538fSMark Fasheh int count = le16_to_cpu(el->l_count); 847dcd0538fSMark Fasheh unsigned int num_bytes; 848dcd0538fSMark Fasheh 849dcd0538fSMark Fasheh BUG_ON(!next_free); 850dcd0538fSMark Fasheh /* This will cause us to go off the end of our extent list. */ 851dcd0538fSMark Fasheh BUG_ON(next_free >= count); 852dcd0538fSMark Fasheh 853dcd0538fSMark Fasheh num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; 854dcd0538fSMark Fasheh 855dcd0538fSMark Fasheh memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); 856dcd0538fSMark Fasheh } 857dcd0538fSMark Fasheh 858dcd0538fSMark Fasheh static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, 859dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 860dcd0538fSMark Fasheh { 861dcd0538fSMark Fasheh int i, insert_index, next_free, has_empty, num_bytes; 862dcd0538fSMark Fasheh u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); 863dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 864dcd0538fSMark Fasheh 865dcd0538fSMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 866dcd0538fSMark Fasheh has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); 867dcd0538fSMark Fasheh 868dcd0538fSMark Fasheh BUG_ON(!next_free); 869dcd0538fSMark Fasheh 870dcd0538fSMark Fasheh /* The tree code before us didn't allow enough room in the leaf. */ 871dcd0538fSMark Fasheh if (el->l_next_free_rec == el->l_count && !has_empty) 872dcd0538fSMark Fasheh BUG(); 873dcd0538fSMark Fasheh 874dcd0538fSMark Fasheh /* 875dcd0538fSMark Fasheh * The easiest way to approach this is to just remove the 876dcd0538fSMark Fasheh * empty extent and temporarily decrement next_free. 877dcd0538fSMark Fasheh */ 878dcd0538fSMark Fasheh if (has_empty) { 879dcd0538fSMark Fasheh /* 880dcd0538fSMark Fasheh * If next_free was 1 (only an empty extent), this 881dcd0538fSMark Fasheh * loop won't execute, which is fine. We still want 882dcd0538fSMark Fasheh * the decrement above to happen. 883dcd0538fSMark Fasheh */ 884dcd0538fSMark Fasheh for(i = 0; i < (next_free - 1); i++) 885dcd0538fSMark Fasheh el->l_recs[i] = el->l_recs[i+1]; 886dcd0538fSMark Fasheh 887dcd0538fSMark Fasheh next_free--; 888dcd0538fSMark Fasheh } 889dcd0538fSMark Fasheh 890dcd0538fSMark Fasheh /* 891dcd0538fSMark Fasheh * Figure out what the new record index should be. 892dcd0538fSMark Fasheh */ 893dcd0538fSMark Fasheh for(i = 0; i < next_free; i++) { 894dcd0538fSMark Fasheh rec = &el->l_recs[i]; 895dcd0538fSMark Fasheh 896dcd0538fSMark Fasheh if (insert_cpos < le32_to_cpu(rec->e_cpos)) 897dcd0538fSMark Fasheh break; 898dcd0538fSMark Fasheh } 899dcd0538fSMark Fasheh insert_index = i; 900dcd0538fSMark Fasheh 901dcd0538fSMark Fasheh mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", 902dcd0538fSMark Fasheh insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); 903dcd0538fSMark Fasheh 904dcd0538fSMark Fasheh BUG_ON(insert_index < 0); 905dcd0538fSMark Fasheh BUG_ON(insert_index >= le16_to_cpu(el->l_count)); 906dcd0538fSMark Fasheh BUG_ON(insert_index > next_free); 907dcd0538fSMark Fasheh 908dcd0538fSMark Fasheh /* 909dcd0538fSMark Fasheh * No need to memmove if we're just adding to the tail. 910dcd0538fSMark Fasheh */ 911dcd0538fSMark Fasheh if (insert_index != next_free) { 912dcd0538fSMark Fasheh BUG_ON(next_free >= le16_to_cpu(el->l_count)); 913dcd0538fSMark Fasheh 914dcd0538fSMark Fasheh num_bytes = next_free - insert_index; 915dcd0538fSMark Fasheh num_bytes *= sizeof(struct ocfs2_extent_rec); 916dcd0538fSMark Fasheh memmove(&el->l_recs[insert_index + 1], 917dcd0538fSMark Fasheh &el->l_recs[insert_index], 918dcd0538fSMark Fasheh num_bytes); 919dcd0538fSMark Fasheh } 920dcd0538fSMark Fasheh 921dcd0538fSMark Fasheh /* 922dcd0538fSMark Fasheh * Either we had an empty extent, and need to re-increment or 923dcd0538fSMark Fasheh * there was no empty extent on a non full rightmost leaf node, 924dcd0538fSMark Fasheh * in which case we still need to increment. 925dcd0538fSMark Fasheh */ 926dcd0538fSMark Fasheh next_free++; 927dcd0538fSMark Fasheh el->l_next_free_rec = cpu_to_le16(next_free); 928dcd0538fSMark Fasheh /* 929dcd0538fSMark Fasheh * Make sure none of the math above just messed up our tree. 930dcd0538fSMark Fasheh */ 931dcd0538fSMark Fasheh BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); 932dcd0538fSMark Fasheh 933dcd0538fSMark Fasheh el->l_recs[insert_index] = *insert_rec; 934dcd0538fSMark Fasheh 935dcd0538fSMark Fasheh } 936dcd0538fSMark Fasheh 937dcd0538fSMark Fasheh /* 938dcd0538fSMark Fasheh * Create an empty extent record . 939dcd0538fSMark Fasheh * 940dcd0538fSMark Fasheh * l_next_free_rec may be updated. 941dcd0538fSMark Fasheh * 942dcd0538fSMark Fasheh * If an empty extent already exists do nothing. 943dcd0538fSMark Fasheh */ 944dcd0538fSMark Fasheh static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) 945dcd0538fSMark Fasheh { 946dcd0538fSMark Fasheh int next_free = le16_to_cpu(el->l_next_free_rec); 947dcd0538fSMark Fasheh 948e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 949e48edee2SMark Fasheh 950dcd0538fSMark Fasheh if (next_free == 0) 951dcd0538fSMark Fasheh goto set_and_inc; 952dcd0538fSMark Fasheh 953dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) 954dcd0538fSMark Fasheh return; 955dcd0538fSMark Fasheh 956dcd0538fSMark Fasheh mlog_bug_on_msg(el->l_count == el->l_next_free_rec, 957dcd0538fSMark Fasheh "Asked to create an empty extent in a full list:\n" 958dcd0538fSMark Fasheh "count = %u, tree depth = %u", 959dcd0538fSMark Fasheh le16_to_cpu(el->l_count), 960dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth)); 961dcd0538fSMark Fasheh 962dcd0538fSMark Fasheh ocfs2_shift_records_right(el); 963dcd0538fSMark Fasheh 964dcd0538fSMark Fasheh set_and_inc: 965dcd0538fSMark Fasheh le16_add_cpu(&el->l_next_free_rec, 1); 966dcd0538fSMark Fasheh memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 967dcd0538fSMark Fasheh } 968dcd0538fSMark Fasheh 969dcd0538fSMark Fasheh /* 970dcd0538fSMark Fasheh * For a rotation which involves two leaf nodes, the "root node" is 971dcd0538fSMark Fasheh * the lowest level tree node which contains a path to both leafs. This 972dcd0538fSMark Fasheh * resulting set of information can be used to form a complete "subtree" 973dcd0538fSMark Fasheh * 974dcd0538fSMark Fasheh * This function is passed two full paths from the dinode down to a 975dcd0538fSMark Fasheh * pair of adjacent leaves. It's task is to figure out which path 976dcd0538fSMark Fasheh * index contains the subtree root - this can be the root index itself 977dcd0538fSMark Fasheh * in a worst-case rotation. 978dcd0538fSMark Fasheh * 979dcd0538fSMark Fasheh * The array index of the subtree root is passed back. 980dcd0538fSMark Fasheh */ 981dcd0538fSMark Fasheh static int ocfs2_find_subtree_root(struct inode *inode, 982dcd0538fSMark Fasheh struct ocfs2_path *left, 983dcd0538fSMark Fasheh struct ocfs2_path *right) 984dcd0538fSMark Fasheh { 985dcd0538fSMark Fasheh int i = 0; 986dcd0538fSMark Fasheh 987dcd0538fSMark Fasheh /* 988dcd0538fSMark Fasheh * Check that the caller passed in two paths from the same tree. 989dcd0538fSMark Fasheh */ 990dcd0538fSMark Fasheh BUG_ON(path_root_bh(left) != path_root_bh(right)); 991dcd0538fSMark Fasheh 992dcd0538fSMark Fasheh do { 993dcd0538fSMark Fasheh i++; 994dcd0538fSMark Fasheh 995dcd0538fSMark Fasheh /* 996dcd0538fSMark Fasheh * The caller didn't pass two adjacent paths. 997dcd0538fSMark Fasheh */ 998dcd0538fSMark Fasheh mlog_bug_on_msg(i > left->p_tree_depth, 999dcd0538fSMark Fasheh "Inode %lu, left depth %u, right depth %u\n" 1000dcd0538fSMark Fasheh "left leaf blk %llu, right leaf blk %llu\n", 1001dcd0538fSMark Fasheh inode->i_ino, left->p_tree_depth, 1002dcd0538fSMark Fasheh right->p_tree_depth, 1003dcd0538fSMark Fasheh (unsigned long long)path_leaf_bh(left)->b_blocknr, 1004dcd0538fSMark Fasheh (unsigned long long)path_leaf_bh(right)->b_blocknr); 1005dcd0538fSMark Fasheh } while (left->p_node[i].bh->b_blocknr == 1006dcd0538fSMark Fasheh right->p_node[i].bh->b_blocknr); 1007dcd0538fSMark Fasheh 1008dcd0538fSMark Fasheh return i - 1; 1009dcd0538fSMark Fasheh } 1010dcd0538fSMark Fasheh 1011dcd0538fSMark Fasheh typedef void (path_insert_t)(void *, struct buffer_head *); 1012dcd0538fSMark Fasheh 1013dcd0538fSMark Fasheh /* 1014dcd0538fSMark Fasheh * Traverse a btree path in search of cpos, starting at root_el. 1015dcd0538fSMark Fasheh * 1016dcd0538fSMark Fasheh * This code can be called with a cpos larger than the tree, in which 1017dcd0538fSMark Fasheh * case it will return the rightmost path. 1018dcd0538fSMark Fasheh */ 1019dcd0538fSMark Fasheh static int __ocfs2_find_path(struct inode *inode, 1020dcd0538fSMark Fasheh struct ocfs2_extent_list *root_el, u32 cpos, 1021dcd0538fSMark Fasheh path_insert_t *func, void *data) 1022dcd0538fSMark Fasheh { 1023dcd0538fSMark Fasheh int i, ret = 0; 1024dcd0538fSMark Fasheh u32 range; 1025dcd0538fSMark Fasheh u64 blkno; 1026dcd0538fSMark Fasheh struct buffer_head *bh = NULL; 1027dcd0538fSMark Fasheh struct ocfs2_extent_block *eb; 1028dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 1029dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 1030dcd0538fSMark Fasheh struct ocfs2_inode_info *oi = OCFS2_I(inode); 1031dcd0538fSMark Fasheh 1032dcd0538fSMark Fasheh el = root_el; 1033dcd0538fSMark Fasheh while (el->l_tree_depth) { 1034dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) { 1035dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1036dcd0538fSMark Fasheh "Inode %llu has empty extent list at " 1037dcd0538fSMark Fasheh "depth %u\n", 1038dcd0538fSMark Fasheh (unsigned long long)oi->ip_blkno, 1039dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth)); 1040dcd0538fSMark Fasheh ret = -EROFS; 1041dcd0538fSMark Fasheh goto out; 1042dcd0538fSMark Fasheh 1043dcd0538fSMark Fasheh } 1044dcd0538fSMark Fasheh 1045dcd0538fSMark Fasheh for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { 1046dcd0538fSMark Fasheh rec = &el->l_recs[i]; 1047dcd0538fSMark Fasheh 1048dcd0538fSMark Fasheh /* 1049dcd0538fSMark Fasheh * In the case that cpos is off the allocation 1050dcd0538fSMark Fasheh * tree, this should just wind up returning the 1051dcd0538fSMark Fasheh * rightmost record. 1052dcd0538fSMark Fasheh */ 1053dcd0538fSMark Fasheh range = le32_to_cpu(rec->e_cpos) + 1054e48edee2SMark Fasheh ocfs2_rec_clusters(el, rec); 1055dcd0538fSMark Fasheh if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 1056dcd0538fSMark Fasheh break; 1057dcd0538fSMark Fasheh } 1058dcd0538fSMark Fasheh 1059dcd0538fSMark Fasheh blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1060dcd0538fSMark Fasheh if (blkno == 0) { 1061dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1062dcd0538fSMark Fasheh "Inode %llu has bad blkno in extent list " 1063dcd0538fSMark Fasheh "at depth %u (index %d)\n", 1064dcd0538fSMark Fasheh (unsigned long long)oi->ip_blkno, 1065dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth), i); 1066dcd0538fSMark Fasheh ret = -EROFS; 1067dcd0538fSMark Fasheh goto out; 1068dcd0538fSMark Fasheh } 1069dcd0538fSMark Fasheh 1070dcd0538fSMark Fasheh brelse(bh); 1071dcd0538fSMark Fasheh bh = NULL; 1072dcd0538fSMark Fasheh ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, 1073dcd0538fSMark Fasheh &bh, OCFS2_BH_CACHED, inode); 1074dcd0538fSMark Fasheh if (ret) { 1075dcd0538fSMark Fasheh mlog_errno(ret); 1076dcd0538fSMark Fasheh goto out; 1077dcd0538fSMark Fasheh } 1078dcd0538fSMark Fasheh 1079dcd0538fSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 1080dcd0538fSMark Fasheh el = &eb->h_list; 1081dcd0538fSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1082dcd0538fSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1083dcd0538fSMark Fasheh ret = -EIO; 1084dcd0538fSMark Fasheh goto out; 1085dcd0538fSMark Fasheh } 1086dcd0538fSMark Fasheh 1087dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) > 1088dcd0538fSMark Fasheh le16_to_cpu(el->l_count)) { 1089dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1090dcd0538fSMark Fasheh "Inode %llu has bad count in extent list " 1091dcd0538fSMark Fasheh "at block %llu (next free=%u, count=%u)\n", 1092dcd0538fSMark Fasheh (unsigned long long)oi->ip_blkno, 1093dcd0538fSMark Fasheh (unsigned long long)bh->b_blocknr, 1094dcd0538fSMark Fasheh le16_to_cpu(el->l_next_free_rec), 1095dcd0538fSMark Fasheh le16_to_cpu(el->l_count)); 1096dcd0538fSMark Fasheh ret = -EROFS; 1097dcd0538fSMark Fasheh goto out; 1098dcd0538fSMark Fasheh } 1099dcd0538fSMark Fasheh 1100dcd0538fSMark Fasheh if (func) 1101dcd0538fSMark Fasheh func(data, bh); 1102dcd0538fSMark Fasheh } 1103dcd0538fSMark Fasheh 1104dcd0538fSMark Fasheh out: 1105dcd0538fSMark Fasheh /* 1106dcd0538fSMark Fasheh * Catch any trailing bh that the loop didn't handle. 1107dcd0538fSMark Fasheh */ 1108dcd0538fSMark Fasheh brelse(bh); 1109dcd0538fSMark Fasheh 1110dcd0538fSMark Fasheh return ret; 1111dcd0538fSMark Fasheh } 1112dcd0538fSMark Fasheh 1113dcd0538fSMark Fasheh /* 1114dcd0538fSMark Fasheh * Given an initialized path (that is, it has a valid root extent 1115dcd0538fSMark Fasheh * list), this function will traverse the btree in search of the path 1116dcd0538fSMark Fasheh * which would contain cpos. 1117dcd0538fSMark Fasheh * 1118dcd0538fSMark Fasheh * The path traveled is recorded in the path structure. 1119dcd0538fSMark Fasheh * 1120dcd0538fSMark Fasheh * Note that this will not do any comparisons on leaf node extent 1121dcd0538fSMark Fasheh * records, so it will work fine in the case that we just added a tree 1122dcd0538fSMark Fasheh * branch. 1123dcd0538fSMark Fasheh */ 1124dcd0538fSMark Fasheh struct find_path_data { 1125dcd0538fSMark Fasheh int index; 1126dcd0538fSMark Fasheh struct ocfs2_path *path; 1127dcd0538fSMark Fasheh }; 1128dcd0538fSMark Fasheh static void find_path_ins(void *data, struct buffer_head *bh) 1129dcd0538fSMark Fasheh { 1130dcd0538fSMark Fasheh struct find_path_data *fp = data; 1131dcd0538fSMark Fasheh 1132dcd0538fSMark Fasheh get_bh(bh); 1133dcd0538fSMark Fasheh ocfs2_path_insert_eb(fp->path, fp->index, bh); 1134dcd0538fSMark Fasheh fp->index++; 1135dcd0538fSMark Fasheh } 1136dcd0538fSMark Fasheh static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, 1137dcd0538fSMark Fasheh u32 cpos) 1138dcd0538fSMark Fasheh { 1139dcd0538fSMark Fasheh struct find_path_data data; 1140dcd0538fSMark Fasheh 1141dcd0538fSMark Fasheh data.index = 1; 1142dcd0538fSMark Fasheh data.path = path; 1143dcd0538fSMark Fasheh return __ocfs2_find_path(inode, path_root_el(path), cpos, 1144dcd0538fSMark Fasheh find_path_ins, &data); 1145dcd0538fSMark Fasheh } 1146dcd0538fSMark Fasheh 1147dcd0538fSMark Fasheh static void find_leaf_ins(void *data, struct buffer_head *bh) 1148dcd0538fSMark Fasheh { 1149dcd0538fSMark Fasheh struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; 1150dcd0538fSMark Fasheh struct ocfs2_extent_list *el = &eb->h_list; 1151dcd0538fSMark Fasheh struct buffer_head **ret = data; 1152dcd0538fSMark Fasheh 1153dcd0538fSMark Fasheh /* We want to retain only the leaf block. */ 1154dcd0538fSMark Fasheh if (le16_to_cpu(el->l_tree_depth) == 0) { 1155dcd0538fSMark Fasheh get_bh(bh); 1156dcd0538fSMark Fasheh *ret = bh; 1157dcd0538fSMark Fasheh } 1158dcd0538fSMark Fasheh } 1159dcd0538fSMark Fasheh /* 1160dcd0538fSMark Fasheh * Find the leaf block in the tree which would contain cpos. No 1161dcd0538fSMark Fasheh * checking of the actual leaf is done. 1162dcd0538fSMark Fasheh * 1163dcd0538fSMark Fasheh * Some paths want to call this instead of allocating a path structure 1164dcd0538fSMark Fasheh * and calling ocfs2_find_path(). 1165dcd0538fSMark Fasheh * 1166dcd0538fSMark Fasheh * This function doesn't handle non btree extent lists. 1167dcd0538fSMark Fasheh */ 1168363041a5SMark Fasheh int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 1169363041a5SMark Fasheh u32 cpos, struct buffer_head **leaf_bh) 1170dcd0538fSMark Fasheh { 1171dcd0538fSMark Fasheh int ret; 1172dcd0538fSMark Fasheh struct buffer_head *bh = NULL; 1173dcd0538fSMark Fasheh 1174dcd0538fSMark Fasheh ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); 1175dcd0538fSMark Fasheh if (ret) { 1176dcd0538fSMark Fasheh mlog_errno(ret); 1177dcd0538fSMark Fasheh goto out; 1178dcd0538fSMark Fasheh } 1179dcd0538fSMark Fasheh 1180dcd0538fSMark Fasheh *leaf_bh = bh; 1181dcd0538fSMark Fasheh out: 1182dcd0538fSMark Fasheh return ret; 1183dcd0538fSMark Fasheh } 1184dcd0538fSMark Fasheh 1185dcd0538fSMark Fasheh /* 1186dcd0538fSMark Fasheh * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. 1187dcd0538fSMark Fasheh * 1188dcd0538fSMark Fasheh * Basically, we've moved stuff around at the bottom of the tree and 1189dcd0538fSMark Fasheh * we need to fix up the extent records above the changes to reflect 1190dcd0538fSMark Fasheh * the new changes. 1191dcd0538fSMark Fasheh * 1192dcd0538fSMark Fasheh * left_rec: the record on the left. 1193dcd0538fSMark Fasheh * left_child_el: is the child list pointed to by left_rec 1194dcd0538fSMark Fasheh * right_rec: the record to the right of left_rec 1195dcd0538fSMark Fasheh * right_child_el: is the child list pointed to by right_rec 1196dcd0538fSMark Fasheh * 1197dcd0538fSMark Fasheh * By definition, this only works on interior nodes. 1198dcd0538fSMark Fasheh */ 1199dcd0538fSMark Fasheh static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, 1200dcd0538fSMark Fasheh struct ocfs2_extent_list *left_child_el, 1201dcd0538fSMark Fasheh struct ocfs2_extent_rec *right_rec, 1202dcd0538fSMark Fasheh struct ocfs2_extent_list *right_child_el) 1203dcd0538fSMark Fasheh { 1204dcd0538fSMark Fasheh u32 left_clusters, right_end; 1205dcd0538fSMark Fasheh 1206dcd0538fSMark Fasheh /* 1207dcd0538fSMark Fasheh * Interior nodes never have holes. Their cpos is the cpos of 1208dcd0538fSMark Fasheh * the leftmost record in their child list. Their cluster 1209dcd0538fSMark Fasheh * count covers the full theoretical range of their child list 1210dcd0538fSMark Fasheh * - the range between their cpos and the cpos of the record 1211dcd0538fSMark Fasheh * immediately to their right. 1212dcd0538fSMark Fasheh */ 1213dcd0538fSMark Fasheh left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1214dcd0538fSMark Fasheh left_clusters -= le32_to_cpu(left_rec->e_cpos); 1215e48edee2SMark Fasheh left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1216dcd0538fSMark Fasheh 1217dcd0538fSMark Fasheh /* 1218dcd0538fSMark Fasheh * Calculate the rightmost cluster count boundary before 1219e48edee2SMark Fasheh * moving cpos - we will need to adjust clusters after 1220dcd0538fSMark Fasheh * updating e_cpos to keep the same highest cluster count. 1221dcd0538fSMark Fasheh */ 1222dcd0538fSMark Fasheh right_end = le32_to_cpu(right_rec->e_cpos); 1223e48edee2SMark Fasheh right_end += le32_to_cpu(right_rec->e_int_clusters); 1224dcd0538fSMark Fasheh 1225dcd0538fSMark Fasheh right_rec->e_cpos = left_rec->e_cpos; 1226dcd0538fSMark Fasheh le32_add_cpu(&right_rec->e_cpos, left_clusters); 1227dcd0538fSMark Fasheh 1228dcd0538fSMark Fasheh right_end -= le32_to_cpu(right_rec->e_cpos); 1229e48edee2SMark Fasheh right_rec->e_int_clusters = cpu_to_le32(right_end); 1230dcd0538fSMark Fasheh } 1231dcd0538fSMark Fasheh 1232dcd0538fSMark Fasheh /* 1233dcd0538fSMark Fasheh * Adjust the adjacent root node records involved in a 1234dcd0538fSMark Fasheh * rotation. left_el_blkno is passed in as a key so that we can easily 1235dcd0538fSMark Fasheh * find it's index in the root list. 1236dcd0538fSMark Fasheh */ 1237dcd0538fSMark Fasheh static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, 1238dcd0538fSMark Fasheh struct ocfs2_extent_list *left_el, 1239dcd0538fSMark Fasheh struct ocfs2_extent_list *right_el, 1240dcd0538fSMark Fasheh u64 left_el_blkno) 1241dcd0538fSMark Fasheh { 1242dcd0538fSMark Fasheh int i; 1243dcd0538fSMark Fasheh 1244dcd0538fSMark Fasheh BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= 1245dcd0538fSMark Fasheh le16_to_cpu(left_el->l_tree_depth)); 1246dcd0538fSMark Fasheh 1247dcd0538fSMark Fasheh for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { 1248dcd0538fSMark Fasheh if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) 1249dcd0538fSMark Fasheh break; 1250dcd0538fSMark Fasheh } 1251dcd0538fSMark Fasheh 1252dcd0538fSMark Fasheh /* 1253dcd0538fSMark Fasheh * The path walking code should have never returned a root and 1254dcd0538fSMark Fasheh * two paths which are not adjacent. 1255dcd0538fSMark Fasheh */ 1256dcd0538fSMark Fasheh BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); 1257dcd0538fSMark Fasheh 1258dcd0538fSMark Fasheh ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, 1259dcd0538fSMark Fasheh &root_el->l_recs[i + 1], right_el); 1260dcd0538fSMark Fasheh } 1261dcd0538fSMark Fasheh 1262dcd0538fSMark Fasheh /* 1263dcd0538fSMark Fasheh * We've changed a leaf block (in right_path) and need to reflect that 1264dcd0538fSMark Fasheh * change back up the subtree. 1265dcd0538fSMark Fasheh * 1266dcd0538fSMark Fasheh * This happens in multiple places: 1267dcd0538fSMark Fasheh * - When we've moved an extent record from the left path leaf to the right 1268dcd0538fSMark Fasheh * path leaf to make room for an empty extent in the left path leaf. 1269dcd0538fSMark Fasheh * - When our insert into the right path leaf is at the leftmost edge 1270dcd0538fSMark Fasheh * and requires an update of the path immediately to it's left. This 1271dcd0538fSMark Fasheh * can occur at the end of some types of rotation and appending inserts. 1272dcd0538fSMark Fasheh */ 1273dcd0538fSMark Fasheh static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 1274dcd0538fSMark Fasheh struct ocfs2_path *left_path, 1275dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1276dcd0538fSMark Fasheh int subtree_index) 1277dcd0538fSMark Fasheh { 1278dcd0538fSMark Fasheh int ret, i, idx; 1279dcd0538fSMark Fasheh struct ocfs2_extent_list *el, *left_el, *right_el; 1280dcd0538fSMark Fasheh struct ocfs2_extent_rec *left_rec, *right_rec; 1281dcd0538fSMark Fasheh struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 1282dcd0538fSMark Fasheh 1283dcd0538fSMark Fasheh /* 1284dcd0538fSMark Fasheh * Update the counts and position values within all the 1285dcd0538fSMark Fasheh * interior nodes to reflect the leaf rotation we just did. 1286dcd0538fSMark Fasheh * 1287dcd0538fSMark Fasheh * The root node is handled below the loop. 1288dcd0538fSMark Fasheh * 1289dcd0538fSMark Fasheh * We begin the loop with right_el and left_el pointing to the 1290dcd0538fSMark Fasheh * leaf lists and work our way up. 1291dcd0538fSMark Fasheh * 1292dcd0538fSMark Fasheh * NOTE: within this loop, left_el and right_el always refer 1293dcd0538fSMark Fasheh * to the *child* lists. 1294dcd0538fSMark Fasheh */ 1295dcd0538fSMark Fasheh left_el = path_leaf_el(left_path); 1296dcd0538fSMark Fasheh right_el = path_leaf_el(right_path); 1297dcd0538fSMark Fasheh for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { 1298dcd0538fSMark Fasheh mlog(0, "Adjust records at index %u\n", i); 1299dcd0538fSMark Fasheh 1300dcd0538fSMark Fasheh /* 1301dcd0538fSMark Fasheh * One nice property of knowing that all of these 1302dcd0538fSMark Fasheh * nodes are below the root is that we only deal with 1303dcd0538fSMark Fasheh * the leftmost right node record and the rightmost 1304dcd0538fSMark Fasheh * left node record. 1305dcd0538fSMark Fasheh */ 1306dcd0538fSMark Fasheh el = left_path->p_node[i].el; 1307dcd0538fSMark Fasheh idx = le16_to_cpu(left_el->l_next_free_rec) - 1; 1308dcd0538fSMark Fasheh left_rec = &el->l_recs[idx]; 1309dcd0538fSMark Fasheh 1310dcd0538fSMark Fasheh el = right_path->p_node[i].el; 1311dcd0538fSMark Fasheh right_rec = &el->l_recs[0]; 1312dcd0538fSMark Fasheh 1313dcd0538fSMark Fasheh ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 1314dcd0538fSMark Fasheh right_el); 1315dcd0538fSMark Fasheh 1316dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 1317dcd0538fSMark Fasheh if (ret) 1318dcd0538fSMark Fasheh mlog_errno(ret); 1319dcd0538fSMark Fasheh 1320dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); 1321dcd0538fSMark Fasheh if (ret) 1322dcd0538fSMark Fasheh mlog_errno(ret); 1323dcd0538fSMark Fasheh 1324dcd0538fSMark Fasheh /* 1325dcd0538fSMark Fasheh * Setup our list pointers now so that the current 1326dcd0538fSMark Fasheh * parents become children in the next iteration. 1327dcd0538fSMark Fasheh */ 1328dcd0538fSMark Fasheh left_el = left_path->p_node[i].el; 1329dcd0538fSMark Fasheh right_el = right_path->p_node[i].el; 1330dcd0538fSMark Fasheh } 1331dcd0538fSMark Fasheh 1332dcd0538fSMark Fasheh /* 1333dcd0538fSMark Fasheh * At the root node, adjust the two adjacent records which 1334dcd0538fSMark Fasheh * begin our path to the leaves. 1335dcd0538fSMark Fasheh */ 1336dcd0538fSMark Fasheh 1337dcd0538fSMark Fasheh el = left_path->p_node[subtree_index].el; 1338dcd0538fSMark Fasheh left_el = left_path->p_node[subtree_index + 1].el; 1339dcd0538fSMark Fasheh right_el = right_path->p_node[subtree_index + 1].el; 1340dcd0538fSMark Fasheh 1341dcd0538fSMark Fasheh ocfs2_adjust_root_records(el, left_el, right_el, 1342dcd0538fSMark Fasheh left_path->p_node[subtree_index + 1].bh->b_blocknr); 1343dcd0538fSMark Fasheh 1344dcd0538fSMark Fasheh root_bh = left_path->p_node[subtree_index].bh; 1345dcd0538fSMark Fasheh 1346dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, root_bh); 1347dcd0538fSMark Fasheh if (ret) 1348dcd0538fSMark Fasheh mlog_errno(ret); 1349dcd0538fSMark Fasheh } 1350dcd0538fSMark Fasheh 1351dcd0538fSMark Fasheh static int ocfs2_rotate_subtree_right(struct inode *inode, 1352dcd0538fSMark Fasheh handle_t *handle, 1353dcd0538fSMark Fasheh struct ocfs2_path *left_path, 1354dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1355dcd0538fSMark Fasheh int subtree_index) 1356dcd0538fSMark Fasheh { 1357dcd0538fSMark Fasheh int ret, i; 1358dcd0538fSMark Fasheh struct buffer_head *right_leaf_bh; 1359dcd0538fSMark Fasheh struct buffer_head *left_leaf_bh = NULL; 1360dcd0538fSMark Fasheh struct buffer_head *root_bh; 1361dcd0538fSMark Fasheh struct ocfs2_extent_list *right_el, *left_el; 1362dcd0538fSMark Fasheh struct ocfs2_extent_rec move_rec; 1363dcd0538fSMark Fasheh 1364dcd0538fSMark Fasheh left_leaf_bh = path_leaf_bh(left_path); 1365dcd0538fSMark Fasheh left_el = path_leaf_el(left_path); 1366dcd0538fSMark Fasheh 1367dcd0538fSMark Fasheh if (left_el->l_next_free_rec != left_el->l_count) { 1368dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1369dcd0538fSMark Fasheh "Inode %llu has non-full interior leaf node %llu" 1370dcd0538fSMark Fasheh "(next free = %u)", 1371dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, 1372dcd0538fSMark Fasheh (unsigned long long)left_leaf_bh->b_blocknr, 1373dcd0538fSMark Fasheh le16_to_cpu(left_el->l_next_free_rec)); 1374dcd0538fSMark Fasheh return -EROFS; 1375dcd0538fSMark Fasheh } 1376dcd0538fSMark Fasheh 1377dcd0538fSMark Fasheh /* 1378dcd0538fSMark Fasheh * This extent block may already have an empty record, so we 1379dcd0538fSMark Fasheh * return early if so. 1380dcd0538fSMark Fasheh */ 1381dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&left_el->l_recs[0])) 1382dcd0538fSMark Fasheh return 0; 1383dcd0538fSMark Fasheh 1384dcd0538fSMark Fasheh root_bh = left_path->p_node[subtree_index].bh; 1385dcd0538fSMark Fasheh BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 1386dcd0538fSMark Fasheh 1387dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, root_bh, 1388dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 1389dcd0538fSMark Fasheh if (ret) { 1390dcd0538fSMark Fasheh mlog_errno(ret); 1391dcd0538fSMark Fasheh goto out; 1392dcd0538fSMark Fasheh } 1393dcd0538fSMark Fasheh 1394dcd0538fSMark Fasheh for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 1395dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, 1396dcd0538fSMark Fasheh right_path->p_node[i].bh, 1397dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 1398dcd0538fSMark Fasheh if (ret) { 1399dcd0538fSMark Fasheh mlog_errno(ret); 1400dcd0538fSMark Fasheh goto out; 1401dcd0538fSMark Fasheh } 1402dcd0538fSMark Fasheh 1403dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, 1404dcd0538fSMark Fasheh left_path->p_node[i].bh, 1405dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 1406dcd0538fSMark Fasheh if (ret) { 1407dcd0538fSMark Fasheh mlog_errno(ret); 1408dcd0538fSMark Fasheh goto out; 1409dcd0538fSMark Fasheh } 1410dcd0538fSMark Fasheh } 1411dcd0538fSMark Fasheh 1412dcd0538fSMark Fasheh right_leaf_bh = path_leaf_bh(right_path); 1413dcd0538fSMark Fasheh right_el = path_leaf_el(right_path); 1414dcd0538fSMark Fasheh 1415dcd0538fSMark Fasheh /* This is a code error, not a disk corruption. */ 1416dcd0538fSMark Fasheh mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " 1417dcd0538fSMark Fasheh "because rightmost leaf block %llu is empty\n", 1418dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, 1419dcd0538fSMark Fasheh (unsigned long long)right_leaf_bh->b_blocknr); 1420dcd0538fSMark Fasheh 1421dcd0538fSMark Fasheh ocfs2_create_empty_extent(right_el); 1422dcd0538fSMark Fasheh 1423dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, right_leaf_bh); 1424dcd0538fSMark Fasheh if (ret) { 1425dcd0538fSMark Fasheh mlog_errno(ret); 1426dcd0538fSMark Fasheh goto out; 1427dcd0538fSMark Fasheh } 1428dcd0538fSMark Fasheh 1429dcd0538fSMark Fasheh /* Do the copy now. */ 1430dcd0538fSMark Fasheh i = le16_to_cpu(left_el->l_next_free_rec) - 1; 1431dcd0538fSMark Fasheh move_rec = left_el->l_recs[i]; 1432dcd0538fSMark Fasheh right_el->l_recs[0] = move_rec; 1433dcd0538fSMark Fasheh 1434dcd0538fSMark Fasheh /* 1435dcd0538fSMark Fasheh * Clear out the record we just copied and shift everything 1436dcd0538fSMark Fasheh * over, leaving an empty extent in the left leaf. 1437dcd0538fSMark Fasheh * 1438dcd0538fSMark Fasheh * We temporarily subtract from next_free_rec so that the 1439dcd0538fSMark Fasheh * shift will lose the tail record (which is now defunct). 1440dcd0538fSMark Fasheh */ 1441dcd0538fSMark Fasheh le16_add_cpu(&left_el->l_next_free_rec, -1); 1442dcd0538fSMark Fasheh ocfs2_shift_records_right(left_el); 1443dcd0538fSMark Fasheh memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 1444dcd0538fSMark Fasheh le16_add_cpu(&left_el->l_next_free_rec, 1); 1445dcd0538fSMark Fasheh 1446dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, left_leaf_bh); 1447dcd0538fSMark Fasheh if (ret) { 1448dcd0538fSMark Fasheh mlog_errno(ret); 1449dcd0538fSMark Fasheh goto out; 1450dcd0538fSMark Fasheh } 1451dcd0538fSMark Fasheh 1452dcd0538fSMark Fasheh ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 1453dcd0538fSMark Fasheh subtree_index); 1454dcd0538fSMark Fasheh 1455dcd0538fSMark Fasheh out: 1456dcd0538fSMark Fasheh return ret; 1457dcd0538fSMark Fasheh } 1458dcd0538fSMark Fasheh 1459dcd0538fSMark Fasheh /* 1460dcd0538fSMark Fasheh * Given a full path, determine what cpos value would return us a path 1461dcd0538fSMark Fasheh * containing the leaf immediately to the left of the current one. 1462dcd0538fSMark Fasheh * 1463dcd0538fSMark Fasheh * Will return zero if the path passed in is already the leftmost path. 1464dcd0538fSMark Fasheh */ 1465dcd0538fSMark Fasheh static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 1466dcd0538fSMark Fasheh struct ocfs2_path *path, u32 *cpos) 1467dcd0538fSMark Fasheh { 1468dcd0538fSMark Fasheh int i, j, ret = 0; 1469dcd0538fSMark Fasheh u64 blkno; 1470dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 1471dcd0538fSMark Fasheh 1472e48edee2SMark Fasheh BUG_ON(path->p_tree_depth == 0); 1473e48edee2SMark Fasheh 1474dcd0538fSMark Fasheh *cpos = 0; 1475dcd0538fSMark Fasheh 1476dcd0538fSMark Fasheh blkno = path_leaf_bh(path)->b_blocknr; 1477dcd0538fSMark Fasheh 1478dcd0538fSMark Fasheh /* Start at the tree node just above the leaf and work our way up. */ 1479dcd0538fSMark Fasheh i = path->p_tree_depth - 1; 1480dcd0538fSMark Fasheh while (i >= 0) { 1481dcd0538fSMark Fasheh el = path->p_node[i].el; 1482dcd0538fSMark Fasheh 1483dcd0538fSMark Fasheh /* 1484dcd0538fSMark Fasheh * Find the extent record just before the one in our 1485dcd0538fSMark Fasheh * path. 1486dcd0538fSMark Fasheh */ 1487dcd0538fSMark Fasheh for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { 1488dcd0538fSMark Fasheh if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { 1489dcd0538fSMark Fasheh if (j == 0) { 1490dcd0538fSMark Fasheh if (i == 0) { 1491dcd0538fSMark Fasheh /* 1492dcd0538fSMark Fasheh * We've determined that the 1493dcd0538fSMark Fasheh * path specified is already 1494dcd0538fSMark Fasheh * the leftmost one - return a 1495dcd0538fSMark Fasheh * cpos of zero. 1496dcd0538fSMark Fasheh */ 1497dcd0538fSMark Fasheh goto out; 1498dcd0538fSMark Fasheh } 1499dcd0538fSMark Fasheh /* 1500dcd0538fSMark Fasheh * The leftmost record points to our 1501dcd0538fSMark Fasheh * leaf - we need to travel up the 1502dcd0538fSMark Fasheh * tree one level. 1503dcd0538fSMark Fasheh */ 1504dcd0538fSMark Fasheh goto next_node; 1505dcd0538fSMark Fasheh } 1506dcd0538fSMark Fasheh 1507dcd0538fSMark Fasheh *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); 1508e48edee2SMark Fasheh *cpos = *cpos + ocfs2_rec_clusters(el, 1509e48edee2SMark Fasheh &el->l_recs[j - 1]); 1510e48edee2SMark Fasheh *cpos = *cpos - 1; 1511dcd0538fSMark Fasheh goto out; 1512dcd0538fSMark Fasheh } 1513dcd0538fSMark Fasheh } 1514dcd0538fSMark Fasheh 1515dcd0538fSMark Fasheh /* 1516dcd0538fSMark Fasheh * If we got here, we never found a valid node where 1517dcd0538fSMark Fasheh * the tree indicated one should be. 1518dcd0538fSMark Fasheh */ 1519dcd0538fSMark Fasheh ocfs2_error(sb, 1520dcd0538fSMark Fasheh "Invalid extent tree at extent block %llu\n", 1521dcd0538fSMark Fasheh (unsigned long long)blkno); 1522dcd0538fSMark Fasheh ret = -EROFS; 1523dcd0538fSMark Fasheh goto out; 1524dcd0538fSMark Fasheh 1525dcd0538fSMark Fasheh next_node: 1526dcd0538fSMark Fasheh blkno = path->p_node[i].bh->b_blocknr; 1527dcd0538fSMark Fasheh i--; 1528dcd0538fSMark Fasheh } 1529dcd0538fSMark Fasheh 1530dcd0538fSMark Fasheh out: 1531dcd0538fSMark Fasheh return ret; 1532dcd0538fSMark Fasheh } 1533dcd0538fSMark Fasheh 1534dcd0538fSMark Fasheh static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1535dcd0538fSMark Fasheh struct ocfs2_path *path) 1536dcd0538fSMark Fasheh { 1537dcd0538fSMark Fasheh int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; 1538dcd0538fSMark Fasheh 1539dcd0538fSMark Fasheh if (handle->h_buffer_credits < credits) 1540dcd0538fSMark Fasheh return ocfs2_extend_trans(handle, credits); 1541dcd0538fSMark Fasheh 1542dcd0538fSMark Fasheh return 0; 1543dcd0538fSMark Fasheh } 1544dcd0538fSMark Fasheh 1545dcd0538fSMark Fasheh /* 1546dcd0538fSMark Fasheh * Trap the case where we're inserting into the theoretical range past 1547dcd0538fSMark Fasheh * the _actual_ left leaf range. Otherwise, we'll rotate a record 1548dcd0538fSMark Fasheh * whose cpos is less than ours into the right leaf. 1549dcd0538fSMark Fasheh * 1550dcd0538fSMark Fasheh * It's only necessary to look at the rightmost record of the left 1551dcd0538fSMark Fasheh * leaf because the logic that calls us should ensure that the 1552dcd0538fSMark Fasheh * theoretical ranges in the path components above the leaves are 1553dcd0538fSMark Fasheh * correct. 1554dcd0538fSMark Fasheh */ 1555dcd0538fSMark Fasheh static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, 1556dcd0538fSMark Fasheh u32 insert_cpos) 1557dcd0538fSMark Fasheh { 1558dcd0538fSMark Fasheh struct ocfs2_extent_list *left_el; 1559dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 1560dcd0538fSMark Fasheh int next_free; 1561dcd0538fSMark Fasheh 1562dcd0538fSMark Fasheh left_el = path_leaf_el(left_path); 1563dcd0538fSMark Fasheh next_free = le16_to_cpu(left_el->l_next_free_rec); 1564dcd0538fSMark Fasheh rec = &left_el->l_recs[next_free - 1]; 1565dcd0538fSMark Fasheh 1566dcd0538fSMark Fasheh if (insert_cpos > le32_to_cpu(rec->e_cpos)) 1567dcd0538fSMark Fasheh return 1; 1568dcd0538fSMark Fasheh return 0; 1569dcd0538fSMark Fasheh } 1570dcd0538fSMark Fasheh 1571dcd0538fSMark Fasheh /* 1572dcd0538fSMark Fasheh * Rotate all the records in a btree right one record, starting at insert_cpos. 1573dcd0538fSMark Fasheh * 1574dcd0538fSMark Fasheh * The path to the rightmost leaf should be passed in. 1575dcd0538fSMark Fasheh * 1576dcd0538fSMark Fasheh * The array is assumed to be large enough to hold an entire path (tree depth). 1577dcd0538fSMark Fasheh * 1578dcd0538fSMark Fasheh * Upon succesful return from this function: 1579dcd0538fSMark Fasheh * 1580dcd0538fSMark Fasheh * - The 'right_path' array will contain a path to the leaf block 1581dcd0538fSMark Fasheh * whose range contains e_cpos. 1582dcd0538fSMark Fasheh * - That leaf block will have a single empty extent in list index 0. 1583dcd0538fSMark Fasheh * - In the case that the rotation requires a post-insert update, 1584dcd0538fSMark Fasheh * *ret_left_path will contain a valid path which can be passed to 1585dcd0538fSMark Fasheh * ocfs2_insert_path(). 1586dcd0538fSMark Fasheh */ 1587dcd0538fSMark Fasheh static int ocfs2_rotate_tree_right(struct inode *inode, 1588dcd0538fSMark Fasheh handle_t *handle, 1589dcd0538fSMark Fasheh u32 insert_cpos, 1590dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1591dcd0538fSMark Fasheh struct ocfs2_path **ret_left_path) 1592dcd0538fSMark Fasheh { 1593dcd0538fSMark Fasheh int ret, start; 1594dcd0538fSMark Fasheh u32 cpos; 1595dcd0538fSMark Fasheh struct ocfs2_path *left_path = NULL; 1596dcd0538fSMark Fasheh 1597dcd0538fSMark Fasheh *ret_left_path = NULL; 1598dcd0538fSMark Fasheh 1599dcd0538fSMark Fasheh left_path = ocfs2_new_path(path_root_bh(right_path), 1600dcd0538fSMark Fasheh path_root_el(right_path)); 1601dcd0538fSMark Fasheh if (!left_path) { 1602dcd0538fSMark Fasheh ret = -ENOMEM; 1603dcd0538fSMark Fasheh mlog_errno(ret); 1604dcd0538fSMark Fasheh goto out; 1605dcd0538fSMark Fasheh } 1606dcd0538fSMark Fasheh 1607dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); 1608dcd0538fSMark Fasheh if (ret) { 1609dcd0538fSMark Fasheh mlog_errno(ret); 1610dcd0538fSMark Fasheh goto out; 1611dcd0538fSMark Fasheh } 1612dcd0538fSMark Fasheh 1613dcd0538fSMark Fasheh mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); 1614dcd0538fSMark Fasheh 1615dcd0538fSMark Fasheh /* 1616dcd0538fSMark Fasheh * What we want to do here is: 1617dcd0538fSMark Fasheh * 1618dcd0538fSMark Fasheh * 1) Start with the rightmost path. 1619dcd0538fSMark Fasheh * 1620dcd0538fSMark Fasheh * 2) Determine a path to the leaf block directly to the left 1621dcd0538fSMark Fasheh * of that leaf. 1622dcd0538fSMark Fasheh * 1623dcd0538fSMark Fasheh * 3) Determine the 'subtree root' - the lowest level tree node 1624dcd0538fSMark Fasheh * which contains a path to both leaves. 1625dcd0538fSMark Fasheh * 1626dcd0538fSMark Fasheh * 4) Rotate the subtree. 1627dcd0538fSMark Fasheh * 1628dcd0538fSMark Fasheh * 5) Find the next subtree by considering the left path to be 1629dcd0538fSMark Fasheh * the new right path. 1630dcd0538fSMark Fasheh * 1631dcd0538fSMark Fasheh * The check at the top of this while loop also accepts 1632dcd0538fSMark Fasheh * insert_cpos == cpos because cpos is only a _theoretical_ 1633dcd0538fSMark Fasheh * value to get us the left path - insert_cpos might very well 1634dcd0538fSMark Fasheh * be filling that hole. 1635dcd0538fSMark Fasheh * 1636dcd0538fSMark Fasheh * Stop at a cpos of '0' because we either started at the 1637dcd0538fSMark Fasheh * leftmost branch (i.e., a tree with one branch and a 1638dcd0538fSMark Fasheh * rotation inside of it), or we've gone as far as we can in 1639dcd0538fSMark Fasheh * rotating subtrees. 1640dcd0538fSMark Fasheh */ 1641dcd0538fSMark Fasheh while (cpos && insert_cpos <= cpos) { 1642dcd0538fSMark Fasheh mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", 1643dcd0538fSMark Fasheh insert_cpos, cpos); 1644dcd0538fSMark Fasheh 1645dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, left_path, cpos); 1646dcd0538fSMark Fasheh if (ret) { 1647dcd0538fSMark Fasheh mlog_errno(ret); 1648dcd0538fSMark Fasheh goto out; 1649dcd0538fSMark Fasheh } 1650dcd0538fSMark Fasheh 1651dcd0538fSMark Fasheh mlog_bug_on_msg(path_leaf_bh(left_path) == 1652dcd0538fSMark Fasheh path_leaf_bh(right_path), 1653dcd0538fSMark Fasheh "Inode %lu: error during insert of %u " 1654dcd0538fSMark Fasheh "(left path cpos %u) results in two identical " 1655dcd0538fSMark Fasheh "paths ending at %llu\n", 1656dcd0538fSMark Fasheh inode->i_ino, insert_cpos, cpos, 1657dcd0538fSMark Fasheh (unsigned long long) 1658dcd0538fSMark Fasheh path_leaf_bh(left_path)->b_blocknr); 1659dcd0538fSMark Fasheh 1660dcd0538fSMark Fasheh if (ocfs2_rotate_requires_path_adjustment(left_path, 1661dcd0538fSMark Fasheh insert_cpos)) { 1662dcd0538fSMark Fasheh mlog(0, "Path adjustment required\n"); 1663dcd0538fSMark Fasheh 1664dcd0538fSMark Fasheh /* 1665dcd0538fSMark Fasheh * We've rotated the tree as much as we 1666dcd0538fSMark Fasheh * should. The rest is up to 1667dcd0538fSMark Fasheh * ocfs2_insert_path() to complete, after the 1668dcd0538fSMark Fasheh * record insertion. We indicate this 1669dcd0538fSMark Fasheh * situation by returning the left path. 1670dcd0538fSMark Fasheh * 1671dcd0538fSMark Fasheh * The reason we don't adjust the records here 1672dcd0538fSMark Fasheh * before the record insert is that an error 1673dcd0538fSMark Fasheh * later might break the rule where a parent 1674dcd0538fSMark Fasheh * record e_cpos will reflect the actual 1675dcd0538fSMark Fasheh * e_cpos of the 1st nonempty record of the 1676dcd0538fSMark Fasheh * child list. 1677dcd0538fSMark Fasheh */ 1678dcd0538fSMark Fasheh *ret_left_path = left_path; 1679dcd0538fSMark Fasheh goto out_ret_path; 1680dcd0538fSMark Fasheh } 1681dcd0538fSMark Fasheh 1682dcd0538fSMark Fasheh start = ocfs2_find_subtree_root(inode, left_path, right_path); 1683dcd0538fSMark Fasheh 1684dcd0538fSMark Fasheh mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 1685dcd0538fSMark Fasheh start, 1686dcd0538fSMark Fasheh (unsigned long long) right_path->p_node[start].bh->b_blocknr, 1687dcd0538fSMark Fasheh right_path->p_tree_depth); 1688dcd0538fSMark Fasheh 1689dcd0538fSMark Fasheh ret = ocfs2_extend_rotate_transaction(handle, start, 1690dcd0538fSMark Fasheh right_path); 1691dcd0538fSMark Fasheh if (ret) { 1692dcd0538fSMark Fasheh mlog_errno(ret); 1693dcd0538fSMark Fasheh goto out; 1694dcd0538fSMark Fasheh } 1695dcd0538fSMark Fasheh 1696dcd0538fSMark Fasheh ret = ocfs2_rotate_subtree_right(inode, handle, left_path, 1697dcd0538fSMark Fasheh right_path, start); 1698dcd0538fSMark Fasheh if (ret) { 1699dcd0538fSMark Fasheh mlog_errno(ret); 1700dcd0538fSMark Fasheh goto out; 1701dcd0538fSMark Fasheh } 1702dcd0538fSMark Fasheh 1703dcd0538fSMark Fasheh /* 1704dcd0538fSMark Fasheh * There is no need to re-read the next right path 1705dcd0538fSMark Fasheh * as we know that it'll be our current left 1706dcd0538fSMark Fasheh * path. Optimize by copying values instead. 1707dcd0538fSMark Fasheh */ 1708dcd0538fSMark Fasheh ocfs2_mv_path(right_path, left_path); 1709dcd0538fSMark Fasheh 1710dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 1711dcd0538fSMark Fasheh &cpos); 1712dcd0538fSMark Fasheh if (ret) { 1713dcd0538fSMark Fasheh mlog_errno(ret); 1714dcd0538fSMark Fasheh goto out; 1715dcd0538fSMark Fasheh } 1716dcd0538fSMark Fasheh } 1717dcd0538fSMark Fasheh 1718dcd0538fSMark Fasheh out: 1719dcd0538fSMark Fasheh ocfs2_free_path(left_path); 1720dcd0538fSMark Fasheh 1721dcd0538fSMark Fasheh out_ret_path: 1722dcd0538fSMark Fasheh return ret; 1723dcd0538fSMark Fasheh } 1724dcd0538fSMark Fasheh 1725dcd0538fSMark Fasheh /* 1726dcd0538fSMark Fasheh * Do the final bits of extent record insertion at the target leaf 1727dcd0538fSMark Fasheh * list. If this leaf is part of an allocation tree, it is assumed 1728dcd0538fSMark Fasheh * that the tree above has been prepared. 1729dcd0538fSMark Fasheh */ 1730dcd0538fSMark Fasheh static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, 1731dcd0538fSMark Fasheh struct ocfs2_extent_list *el, 1732dcd0538fSMark Fasheh struct ocfs2_insert_type *insert, 1733dcd0538fSMark Fasheh struct inode *inode) 1734dcd0538fSMark Fasheh { 1735dcd0538fSMark Fasheh int i = insert->ins_contig_index; 1736dcd0538fSMark Fasheh unsigned int range; 1737dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 1738dcd0538fSMark Fasheh 1739e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 1740dcd0538fSMark Fasheh 1741dcd0538fSMark Fasheh /* 1742dcd0538fSMark Fasheh * Contiguous insert - either left or right. 1743dcd0538fSMark Fasheh */ 1744dcd0538fSMark Fasheh if (insert->ins_contig != CONTIG_NONE) { 1745dcd0538fSMark Fasheh rec = &el->l_recs[i]; 1746dcd0538fSMark Fasheh if (insert->ins_contig == CONTIG_LEFT) { 1747dcd0538fSMark Fasheh rec->e_blkno = insert_rec->e_blkno; 1748dcd0538fSMark Fasheh rec->e_cpos = insert_rec->e_cpos; 1749dcd0538fSMark Fasheh } 1750e48edee2SMark Fasheh le16_add_cpu(&rec->e_leaf_clusters, 1751e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 1752dcd0538fSMark Fasheh return; 1753dcd0538fSMark Fasheh } 1754dcd0538fSMark Fasheh 1755dcd0538fSMark Fasheh /* 1756dcd0538fSMark Fasheh * Handle insert into an empty leaf. 1757dcd0538fSMark Fasheh */ 1758dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0 || 1759dcd0538fSMark Fasheh ((le16_to_cpu(el->l_next_free_rec) == 1) && 1760dcd0538fSMark Fasheh ocfs2_is_empty_extent(&el->l_recs[0]))) { 1761dcd0538fSMark Fasheh el->l_recs[0] = *insert_rec; 1762dcd0538fSMark Fasheh el->l_next_free_rec = cpu_to_le16(1); 1763dcd0538fSMark Fasheh return; 1764dcd0538fSMark Fasheh } 1765dcd0538fSMark Fasheh 1766dcd0538fSMark Fasheh /* 1767dcd0538fSMark Fasheh * Appending insert. 1768dcd0538fSMark Fasheh */ 1769dcd0538fSMark Fasheh if (insert->ins_appending == APPEND_TAIL) { 1770dcd0538fSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 1771dcd0538fSMark Fasheh rec = &el->l_recs[i]; 1772e48edee2SMark Fasheh range = le32_to_cpu(rec->e_cpos) 1773e48edee2SMark Fasheh + le16_to_cpu(rec->e_leaf_clusters); 1774dcd0538fSMark Fasheh BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); 1775dcd0538fSMark Fasheh 1776dcd0538fSMark Fasheh mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= 1777dcd0538fSMark Fasheh le16_to_cpu(el->l_count), 1778dcd0538fSMark Fasheh "inode %lu, depth %u, count %u, next free %u, " 1779dcd0538fSMark Fasheh "rec.cpos %u, rec.clusters %u, " 1780dcd0538fSMark Fasheh "insert.cpos %u, insert.clusters %u\n", 1781dcd0538fSMark Fasheh inode->i_ino, 1782dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth), 1783dcd0538fSMark Fasheh le16_to_cpu(el->l_count), 1784dcd0538fSMark Fasheh le16_to_cpu(el->l_next_free_rec), 1785dcd0538fSMark Fasheh le32_to_cpu(el->l_recs[i].e_cpos), 1786e48edee2SMark Fasheh le16_to_cpu(el->l_recs[i].e_leaf_clusters), 1787dcd0538fSMark Fasheh le32_to_cpu(insert_rec->e_cpos), 1788e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 1789dcd0538fSMark Fasheh i++; 1790dcd0538fSMark Fasheh el->l_recs[i] = *insert_rec; 1791dcd0538fSMark Fasheh le16_add_cpu(&el->l_next_free_rec, 1); 1792dcd0538fSMark Fasheh return; 1793dcd0538fSMark Fasheh } 1794dcd0538fSMark Fasheh 1795dcd0538fSMark Fasheh /* 1796dcd0538fSMark Fasheh * Ok, we have to rotate. 1797dcd0538fSMark Fasheh * 1798dcd0538fSMark Fasheh * At this point, it is safe to assume that inserting into an 1799dcd0538fSMark Fasheh * empty leaf and appending to a leaf have both been handled 1800dcd0538fSMark Fasheh * above. 1801dcd0538fSMark Fasheh * 1802dcd0538fSMark Fasheh * This leaf needs to have space, either by the empty 1st 1803dcd0538fSMark Fasheh * extent record, or by virtue of an l_next_rec < l_count. 1804dcd0538fSMark Fasheh */ 1805dcd0538fSMark Fasheh ocfs2_rotate_leaf(el, insert_rec); 1806dcd0538fSMark Fasheh } 1807dcd0538fSMark Fasheh 1808dcd0538fSMark Fasheh static inline void ocfs2_update_dinode_clusters(struct inode *inode, 1809dcd0538fSMark Fasheh struct ocfs2_dinode *di, 1810dcd0538fSMark Fasheh u32 clusters) 1811dcd0538fSMark Fasheh { 1812dcd0538fSMark Fasheh le32_add_cpu(&di->i_clusters, clusters); 1813dcd0538fSMark Fasheh spin_lock(&OCFS2_I(inode)->ip_lock); 1814dcd0538fSMark Fasheh OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); 1815dcd0538fSMark Fasheh spin_unlock(&OCFS2_I(inode)->ip_lock); 1816dcd0538fSMark Fasheh } 1817dcd0538fSMark Fasheh 1818dcd0538fSMark Fasheh static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 1819dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 1820dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1821dcd0538fSMark Fasheh struct ocfs2_path **ret_left_path) 1822dcd0538fSMark Fasheh { 1823dcd0538fSMark Fasheh int ret, i, next_free; 1824dcd0538fSMark Fasheh struct buffer_head *bh; 1825dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 1826dcd0538fSMark Fasheh struct ocfs2_path *left_path = NULL; 1827dcd0538fSMark Fasheh 1828dcd0538fSMark Fasheh *ret_left_path = NULL; 1829dcd0538fSMark Fasheh 1830dcd0538fSMark Fasheh /* 1831e48edee2SMark Fasheh * This shouldn't happen for non-trees. The extent rec cluster 1832e48edee2SMark Fasheh * count manipulation below only works for interior nodes. 1833e48edee2SMark Fasheh */ 1834e48edee2SMark Fasheh BUG_ON(right_path->p_tree_depth == 0); 1835e48edee2SMark Fasheh 1836e48edee2SMark Fasheh /* 1837dcd0538fSMark Fasheh * If our appending insert is at the leftmost edge of a leaf, 1838dcd0538fSMark Fasheh * then we might need to update the rightmost records of the 1839dcd0538fSMark Fasheh * neighboring path. 1840dcd0538fSMark Fasheh */ 1841dcd0538fSMark Fasheh el = path_leaf_el(right_path); 1842dcd0538fSMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 1843dcd0538fSMark Fasheh if (next_free == 0 || 1844dcd0538fSMark Fasheh (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { 1845dcd0538fSMark Fasheh u32 left_cpos; 1846dcd0538fSMark Fasheh 1847dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 1848dcd0538fSMark Fasheh &left_cpos); 1849dcd0538fSMark Fasheh if (ret) { 1850dcd0538fSMark Fasheh mlog_errno(ret); 1851dcd0538fSMark Fasheh goto out; 1852dcd0538fSMark Fasheh } 1853dcd0538fSMark Fasheh 1854dcd0538fSMark Fasheh mlog(0, "Append may need a left path update. cpos: %u, " 1855dcd0538fSMark Fasheh "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), 1856dcd0538fSMark Fasheh left_cpos); 1857dcd0538fSMark Fasheh 1858dcd0538fSMark Fasheh /* 1859dcd0538fSMark Fasheh * No need to worry if the append is already in the 1860dcd0538fSMark Fasheh * leftmost leaf. 1861dcd0538fSMark Fasheh */ 1862dcd0538fSMark Fasheh if (left_cpos) { 1863dcd0538fSMark Fasheh left_path = ocfs2_new_path(path_root_bh(right_path), 1864dcd0538fSMark Fasheh path_root_el(right_path)); 1865dcd0538fSMark Fasheh if (!left_path) { 1866dcd0538fSMark Fasheh ret = -ENOMEM; 1867dcd0538fSMark Fasheh mlog_errno(ret); 1868dcd0538fSMark Fasheh goto out; 1869dcd0538fSMark Fasheh } 1870dcd0538fSMark Fasheh 1871dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, left_path, left_cpos); 1872dcd0538fSMark Fasheh if (ret) { 1873dcd0538fSMark Fasheh mlog_errno(ret); 1874dcd0538fSMark Fasheh goto out; 1875dcd0538fSMark Fasheh } 1876dcd0538fSMark Fasheh 1877dcd0538fSMark Fasheh /* 1878dcd0538fSMark Fasheh * ocfs2_insert_path() will pass the left_path to the 1879dcd0538fSMark Fasheh * journal for us. 1880dcd0538fSMark Fasheh */ 1881dcd0538fSMark Fasheh } 1882dcd0538fSMark Fasheh } 1883dcd0538fSMark Fasheh 1884dcd0538fSMark Fasheh ret = ocfs2_journal_access_path(inode, handle, right_path); 1885dcd0538fSMark Fasheh if (ret) { 1886dcd0538fSMark Fasheh mlog_errno(ret); 1887dcd0538fSMark Fasheh goto out; 1888dcd0538fSMark Fasheh } 1889dcd0538fSMark Fasheh 1890dcd0538fSMark Fasheh el = path_root_el(right_path); 1891dcd0538fSMark Fasheh bh = path_root_bh(right_path); 1892dcd0538fSMark Fasheh i = 0; 1893dcd0538fSMark Fasheh while (1) { 1894e48edee2SMark Fasheh struct ocfs2_extent_rec *rec; 1895e48edee2SMark Fasheh 1896dcd0538fSMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 1897dcd0538fSMark Fasheh if (next_free == 0) { 1898dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1899dcd0538fSMark Fasheh "Dinode %llu has a bad extent list", 1900dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno); 1901dcd0538fSMark Fasheh ret = -EIO; 1902dcd0538fSMark Fasheh goto out; 1903dcd0538fSMark Fasheh } 1904dcd0538fSMark Fasheh 1905e48edee2SMark Fasheh rec = &el->l_recs[next_free - 1]; 1906e48edee2SMark Fasheh 1907e48edee2SMark Fasheh rec->e_int_clusters = insert_rec->e_cpos; 1908e48edee2SMark Fasheh le32_add_cpu(&rec->e_int_clusters, 1909e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 1910e48edee2SMark Fasheh le32_add_cpu(&rec->e_int_clusters, 1911e48edee2SMark Fasheh -le32_to_cpu(rec->e_cpos)); 1912dcd0538fSMark Fasheh 1913dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, bh); 1914dcd0538fSMark Fasheh if (ret) 1915dcd0538fSMark Fasheh mlog_errno(ret); 1916dcd0538fSMark Fasheh 1917e48edee2SMark Fasheh /* Don't touch the leaf node */ 1918dcd0538fSMark Fasheh if (++i >= right_path->p_tree_depth) 1919dcd0538fSMark Fasheh break; 1920dcd0538fSMark Fasheh 1921dcd0538fSMark Fasheh bh = right_path->p_node[i].bh; 1922dcd0538fSMark Fasheh el = right_path->p_node[i].el; 1923dcd0538fSMark Fasheh } 1924dcd0538fSMark Fasheh 1925dcd0538fSMark Fasheh *ret_left_path = left_path; 1926dcd0538fSMark Fasheh ret = 0; 1927dcd0538fSMark Fasheh out: 1928dcd0538fSMark Fasheh if (ret != 0) 1929dcd0538fSMark Fasheh ocfs2_free_path(left_path); 1930dcd0538fSMark Fasheh 1931dcd0538fSMark Fasheh return ret; 1932dcd0538fSMark Fasheh } 1933dcd0538fSMark Fasheh 1934dcd0538fSMark Fasheh /* 1935dcd0538fSMark Fasheh * This function only does inserts on an allocation b-tree. For dinode 1936dcd0538fSMark Fasheh * lists, ocfs2_insert_at_leaf() is called directly. 1937dcd0538fSMark Fasheh * 1938dcd0538fSMark Fasheh * right_path is the path we want to do the actual insert 1939dcd0538fSMark Fasheh * in. left_path should only be passed in if we need to update that 1940dcd0538fSMark Fasheh * portion of the tree after an edge insert. 1941dcd0538fSMark Fasheh */ 1942dcd0538fSMark Fasheh static int ocfs2_insert_path(struct inode *inode, 1943dcd0538fSMark Fasheh handle_t *handle, 1944dcd0538fSMark Fasheh struct ocfs2_path *left_path, 1945dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1946dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 1947dcd0538fSMark Fasheh struct ocfs2_insert_type *insert) 1948dcd0538fSMark Fasheh { 1949dcd0538fSMark Fasheh int ret, subtree_index; 1950dcd0538fSMark Fasheh struct buffer_head *leaf_bh = path_leaf_bh(right_path); 1951dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 1952dcd0538fSMark Fasheh 1953dcd0538fSMark Fasheh /* 1954dcd0538fSMark Fasheh * Pass both paths to the journal. The majority of inserts 1955dcd0538fSMark Fasheh * will be touching all components anyway. 1956dcd0538fSMark Fasheh */ 1957dcd0538fSMark Fasheh ret = ocfs2_journal_access_path(inode, handle, right_path); 1958dcd0538fSMark Fasheh if (ret < 0) { 1959dcd0538fSMark Fasheh mlog_errno(ret); 1960dcd0538fSMark Fasheh goto out; 1961dcd0538fSMark Fasheh } 1962dcd0538fSMark Fasheh 1963dcd0538fSMark Fasheh if (left_path) { 1964dcd0538fSMark Fasheh int credits = handle->h_buffer_credits; 1965dcd0538fSMark Fasheh 1966dcd0538fSMark Fasheh /* 1967dcd0538fSMark Fasheh * There's a chance that left_path got passed back to 1968dcd0538fSMark Fasheh * us without being accounted for in the 1969dcd0538fSMark Fasheh * journal. Extend our transaction here to be sure we 1970dcd0538fSMark Fasheh * can change those blocks. 1971dcd0538fSMark Fasheh */ 1972dcd0538fSMark Fasheh credits += left_path->p_tree_depth; 1973dcd0538fSMark Fasheh 1974dcd0538fSMark Fasheh ret = ocfs2_extend_trans(handle, credits); 1975dcd0538fSMark Fasheh if (ret < 0) { 1976dcd0538fSMark Fasheh mlog_errno(ret); 1977dcd0538fSMark Fasheh goto out; 1978dcd0538fSMark Fasheh } 1979dcd0538fSMark Fasheh 1980dcd0538fSMark Fasheh ret = ocfs2_journal_access_path(inode, handle, left_path); 1981dcd0538fSMark Fasheh if (ret < 0) { 1982dcd0538fSMark Fasheh mlog_errno(ret); 1983dcd0538fSMark Fasheh goto out; 1984dcd0538fSMark Fasheh } 1985dcd0538fSMark Fasheh } 1986dcd0538fSMark Fasheh 1987dcd0538fSMark Fasheh el = path_leaf_el(right_path); 1988dcd0538fSMark Fasheh 1989dcd0538fSMark Fasheh ocfs2_insert_at_leaf(insert_rec, el, insert, inode); 1990dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, leaf_bh); 1991dcd0538fSMark Fasheh if (ret) 1992dcd0538fSMark Fasheh mlog_errno(ret); 1993dcd0538fSMark Fasheh 1994dcd0538fSMark Fasheh if (left_path) { 1995dcd0538fSMark Fasheh /* 1996dcd0538fSMark Fasheh * The rotate code has indicated that we need to fix 1997dcd0538fSMark Fasheh * up portions of the tree after the insert. 1998dcd0538fSMark Fasheh * 1999dcd0538fSMark Fasheh * XXX: Should we extend the transaction here? 2000dcd0538fSMark Fasheh */ 2001dcd0538fSMark Fasheh subtree_index = ocfs2_find_subtree_root(inode, left_path, 2002dcd0538fSMark Fasheh right_path); 2003dcd0538fSMark Fasheh ocfs2_complete_edge_insert(inode, handle, left_path, 2004dcd0538fSMark Fasheh right_path, subtree_index); 2005dcd0538fSMark Fasheh } 2006dcd0538fSMark Fasheh 2007dcd0538fSMark Fasheh ret = 0; 2008dcd0538fSMark Fasheh out: 2009dcd0538fSMark Fasheh return ret; 2010dcd0538fSMark Fasheh } 2011dcd0538fSMark Fasheh 2012dcd0538fSMark Fasheh static int ocfs2_do_insert_extent(struct inode *inode, 2013dcd0538fSMark Fasheh handle_t *handle, 2014dcd0538fSMark Fasheh struct buffer_head *di_bh, 2015dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 2016dcd0538fSMark Fasheh struct ocfs2_insert_type *type) 2017dcd0538fSMark Fasheh { 2018dcd0538fSMark Fasheh int ret, rotate = 0; 2019dcd0538fSMark Fasheh u32 cpos; 2020dcd0538fSMark Fasheh struct ocfs2_path *right_path = NULL; 2021dcd0538fSMark Fasheh struct ocfs2_path *left_path = NULL; 2022dcd0538fSMark Fasheh struct ocfs2_dinode *di; 2023dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 2024dcd0538fSMark Fasheh 2025dcd0538fSMark Fasheh di = (struct ocfs2_dinode *) di_bh->b_data; 2026dcd0538fSMark Fasheh el = &di->id2.i_list; 2027dcd0538fSMark Fasheh 2028dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, di_bh, 2029dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 2030dcd0538fSMark Fasheh if (ret) { 2031dcd0538fSMark Fasheh mlog_errno(ret); 2032dcd0538fSMark Fasheh goto out; 2033dcd0538fSMark Fasheh } 2034dcd0538fSMark Fasheh 2035dcd0538fSMark Fasheh if (le16_to_cpu(el->l_tree_depth) == 0) { 2036dcd0538fSMark Fasheh ocfs2_insert_at_leaf(insert_rec, el, type, inode); 2037dcd0538fSMark Fasheh goto out_update_clusters; 2038dcd0538fSMark Fasheh } 2039dcd0538fSMark Fasheh 2040dcd0538fSMark Fasheh right_path = ocfs2_new_inode_path(di_bh); 2041dcd0538fSMark Fasheh if (!right_path) { 2042dcd0538fSMark Fasheh ret = -ENOMEM; 2043dcd0538fSMark Fasheh mlog_errno(ret); 2044dcd0538fSMark Fasheh goto out; 2045dcd0538fSMark Fasheh } 2046dcd0538fSMark Fasheh 2047dcd0538fSMark Fasheh /* 2048dcd0538fSMark Fasheh * Determine the path to start with. Rotations need the 2049dcd0538fSMark Fasheh * rightmost path, everything else can go directly to the 2050dcd0538fSMark Fasheh * target leaf. 2051dcd0538fSMark Fasheh */ 2052dcd0538fSMark Fasheh cpos = le32_to_cpu(insert_rec->e_cpos); 2053dcd0538fSMark Fasheh if (type->ins_appending == APPEND_NONE && 2054dcd0538fSMark Fasheh type->ins_contig == CONTIG_NONE) { 2055dcd0538fSMark Fasheh rotate = 1; 2056dcd0538fSMark Fasheh cpos = UINT_MAX; 2057dcd0538fSMark Fasheh } 2058dcd0538fSMark Fasheh 2059dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, right_path, cpos); 2060dcd0538fSMark Fasheh if (ret) { 2061dcd0538fSMark Fasheh mlog_errno(ret); 2062dcd0538fSMark Fasheh goto out; 2063dcd0538fSMark Fasheh } 2064dcd0538fSMark Fasheh 2065dcd0538fSMark Fasheh /* 2066dcd0538fSMark Fasheh * Rotations and appends need special treatment - they modify 2067dcd0538fSMark Fasheh * parts of the tree's above them. 2068dcd0538fSMark Fasheh * 2069dcd0538fSMark Fasheh * Both might pass back a path immediate to the left of the 2070dcd0538fSMark Fasheh * one being inserted to. This will be cause 2071dcd0538fSMark Fasheh * ocfs2_insert_path() to modify the rightmost records of 2072dcd0538fSMark Fasheh * left_path to account for an edge insert. 2073dcd0538fSMark Fasheh * 2074dcd0538fSMark Fasheh * XXX: When modifying this code, keep in mind that an insert 2075dcd0538fSMark Fasheh * can wind up skipping both of these two special cases... 2076dcd0538fSMark Fasheh */ 2077dcd0538fSMark Fasheh if (rotate) { 2078dcd0538fSMark Fasheh ret = ocfs2_rotate_tree_right(inode, handle, 2079dcd0538fSMark Fasheh le32_to_cpu(insert_rec->e_cpos), 2080dcd0538fSMark Fasheh right_path, &left_path); 2081dcd0538fSMark Fasheh if (ret) { 2082dcd0538fSMark Fasheh mlog_errno(ret); 2083dcd0538fSMark Fasheh goto out; 2084dcd0538fSMark Fasheh } 2085dcd0538fSMark Fasheh } else if (type->ins_appending == APPEND_TAIL 2086dcd0538fSMark Fasheh && type->ins_contig != CONTIG_LEFT) { 2087dcd0538fSMark Fasheh ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, 2088dcd0538fSMark Fasheh right_path, &left_path); 2089dcd0538fSMark Fasheh if (ret) { 2090dcd0538fSMark Fasheh mlog_errno(ret); 2091dcd0538fSMark Fasheh goto out; 2092dcd0538fSMark Fasheh } 2093dcd0538fSMark Fasheh } 2094dcd0538fSMark Fasheh 2095dcd0538fSMark Fasheh ret = ocfs2_insert_path(inode, handle, left_path, right_path, 2096dcd0538fSMark Fasheh insert_rec, type); 2097dcd0538fSMark Fasheh if (ret) { 2098dcd0538fSMark Fasheh mlog_errno(ret); 2099dcd0538fSMark Fasheh goto out; 2100dcd0538fSMark Fasheh } 2101dcd0538fSMark Fasheh 2102dcd0538fSMark Fasheh out_update_clusters: 2103dcd0538fSMark Fasheh ocfs2_update_dinode_clusters(inode, di, 2104e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 2105dcd0538fSMark Fasheh 2106dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, di_bh); 2107dcd0538fSMark Fasheh if (ret) 2108dcd0538fSMark Fasheh mlog_errno(ret); 2109dcd0538fSMark Fasheh 2110dcd0538fSMark Fasheh out: 2111dcd0538fSMark Fasheh ocfs2_free_path(left_path); 2112dcd0538fSMark Fasheh ocfs2_free_path(right_path); 2113dcd0538fSMark Fasheh 2114dcd0538fSMark Fasheh return ret; 2115dcd0538fSMark Fasheh } 2116dcd0538fSMark Fasheh 2117dcd0538fSMark Fasheh static void ocfs2_figure_contig_type(struct inode *inode, 2118dcd0538fSMark Fasheh struct ocfs2_insert_type *insert, 2119dcd0538fSMark Fasheh struct ocfs2_extent_list *el, 2120dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 2121dcd0538fSMark Fasheh { 2122dcd0538fSMark Fasheh int i; 2123dcd0538fSMark Fasheh enum ocfs2_contig_type contig_type = CONTIG_NONE; 2124dcd0538fSMark Fasheh 2125e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2126e48edee2SMark Fasheh 2127dcd0538fSMark Fasheh for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 2128dcd0538fSMark Fasheh contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], 2129dcd0538fSMark Fasheh insert_rec); 2130dcd0538fSMark Fasheh if (contig_type != CONTIG_NONE) { 2131dcd0538fSMark Fasheh insert->ins_contig_index = i; 2132dcd0538fSMark Fasheh break; 2133dcd0538fSMark Fasheh } 2134dcd0538fSMark Fasheh } 2135dcd0538fSMark Fasheh insert->ins_contig = contig_type; 2136dcd0538fSMark Fasheh } 2137dcd0538fSMark Fasheh 2138dcd0538fSMark Fasheh /* 2139dcd0538fSMark Fasheh * This should only be called against the righmost leaf extent list. 2140dcd0538fSMark Fasheh * 2141dcd0538fSMark Fasheh * ocfs2_figure_appending_type() will figure out whether we'll have to 2142dcd0538fSMark Fasheh * insert at the tail of the rightmost leaf. 2143dcd0538fSMark Fasheh * 2144dcd0538fSMark Fasheh * This should also work against the dinode list for tree's with 0 2145dcd0538fSMark Fasheh * depth. If we consider the dinode list to be the rightmost leaf node 2146dcd0538fSMark Fasheh * then the logic here makes sense. 2147dcd0538fSMark Fasheh */ 2148dcd0538fSMark Fasheh static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, 2149dcd0538fSMark Fasheh struct ocfs2_extent_list *el, 2150dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 2151dcd0538fSMark Fasheh { 2152dcd0538fSMark Fasheh int i; 2153dcd0538fSMark Fasheh u32 cpos = le32_to_cpu(insert_rec->e_cpos); 2154dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 2155dcd0538fSMark Fasheh 2156dcd0538fSMark Fasheh insert->ins_appending = APPEND_NONE; 2157dcd0538fSMark Fasheh 2158e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2159dcd0538fSMark Fasheh 2160dcd0538fSMark Fasheh if (!el->l_next_free_rec) 2161dcd0538fSMark Fasheh goto set_tail_append; 2162dcd0538fSMark Fasheh 2163dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) { 2164dcd0538fSMark Fasheh /* Were all records empty? */ 2165dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 1) 2166dcd0538fSMark Fasheh goto set_tail_append; 2167dcd0538fSMark Fasheh } 2168dcd0538fSMark Fasheh 2169dcd0538fSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 2170dcd0538fSMark Fasheh rec = &el->l_recs[i]; 2171dcd0538fSMark Fasheh 2172e48edee2SMark Fasheh if (cpos >= 2173e48edee2SMark Fasheh (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) 2174dcd0538fSMark Fasheh goto set_tail_append; 2175dcd0538fSMark Fasheh 2176dcd0538fSMark Fasheh return; 2177dcd0538fSMark Fasheh 2178dcd0538fSMark Fasheh set_tail_append: 2179dcd0538fSMark Fasheh insert->ins_appending = APPEND_TAIL; 2180dcd0538fSMark Fasheh } 2181dcd0538fSMark Fasheh 2182dcd0538fSMark Fasheh /* 2183dcd0538fSMark Fasheh * Helper function called at the begining of an insert. 2184dcd0538fSMark Fasheh * 2185dcd0538fSMark Fasheh * This computes a few things that are commonly used in the process of 2186dcd0538fSMark Fasheh * inserting into the btree: 2187dcd0538fSMark Fasheh * - Whether the new extent is contiguous with an existing one. 2188dcd0538fSMark Fasheh * - The current tree depth. 2189dcd0538fSMark Fasheh * - Whether the insert is an appending one. 2190dcd0538fSMark Fasheh * - The total # of free records in the tree. 2191dcd0538fSMark Fasheh * 2192dcd0538fSMark Fasheh * All of the information is stored on the ocfs2_insert_type 2193dcd0538fSMark Fasheh * structure. 2194dcd0538fSMark Fasheh */ 2195dcd0538fSMark Fasheh static int ocfs2_figure_insert_type(struct inode *inode, 2196dcd0538fSMark Fasheh struct buffer_head *di_bh, 2197dcd0538fSMark Fasheh struct buffer_head **last_eb_bh, 2198dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 2199dcd0538fSMark Fasheh struct ocfs2_insert_type *insert) 2200dcd0538fSMark Fasheh { 2201dcd0538fSMark Fasheh int ret; 2202dcd0538fSMark Fasheh struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2203dcd0538fSMark Fasheh struct ocfs2_extent_block *eb; 2204dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 2205dcd0538fSMark Fasheh struct ocfs2_path *path = NULL; 2206dcd0538fSMark Fasheh struct buffer_head *bh = NULL; 2207dcd0538fSMark Fasheh 2208dcd0538fSMark Fasheh el = &di->id2.i_list; 2209dcd0538fSMark Fasheh insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 2210dcd0538fSMark Fasheh 2211dcd0538fSMark Fasheh if (el->l_tree_depth) { 2212dcd0538fSMark Fasheh /* 2213dcd0538fSMark Fasheh * If we have tree depth, we read in the 2214dcd0538fSMark Fasheh * rightmost extent block ahead of time as 2215dcd0538fSMark Fasheh * ocfs2_figure_insert_type() and ocfs2_add_branch() 2216dcd0538fSMark Fasheh * may want it later. 2217dcd0538fSMark Fasheh */ 2218dcd0538fSMark Fasheh ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 2219dcd0538fSMark Fasheh le64_to_cpu(di->i_last_eb_blk), &bh, 2220dcd0538fSMark Fasheh OCFS2_BH_CACHED, inode); 2221dcd0538fSMark Fasheh if (ret) { 2222dcd0538fSMark Fasheh mlog_exit(ret); 2223dcd0538fSMark Fasheh goto out; 2224dcd0538fSMark Fasheh } 2225dcd0538fSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 2226dcd0538fSMark Fasheh el = &eb->h_list; 2227dcd0538fSMark Fasheh } 2228dcd0538fSMark Fasheh 2229dcd0538fSMark Fasheh /* 2230dcd0538fSMark Fasheh * Unless we have a contiguous insert, we'll need to know if 2231dcd0538fSMark Fasheh * there is room left in our allocation tree for another 2232dcd0538fSMark Fasheh * extent record. 2233dcd0538fSMark Fasheh * 2234dcd0538fSMark Fasheh * XXX: This test is simplistic, we can search for empty 2235dcd0538fSMark Fasheh * extent records too. 2236dcd0538fSMark Fasheh */ 2237dcd0538fSMark Fasheh insert->ins_free_records = le16_to_cpu(el->l_count) - 2238dcd0538fSMark Fasheh le16_to_cpu(el->l_next_free_rec); 2239dcd0538fSMark Fasheh 2240dcd0538fSMark Fasheh if (!insert->ins_tree_depth) { 2241dcd0538fSMark Fasheh ocfs2_figure_contig_type(inode, insert, el, insert_rec); 2242dcd0538fSMark Fasheh ocfs2_figure_appending_type(insert, el, insert_rec); 2243dcd0538fSMark Fasheh return 0; 2244dcd0538fSMark Fasheh } 2245dcd0538fSMark Fasheh 2246dcd0538fSMark Fasheh path = ocfs2_new_inode_path(di_bh); 2247dcd0538fSMark Fasheh if (!path) { 2248dcd0538fSMark Fasheh ret = -ENOMEM; 2249dcd0538fSMark Fasheh mlog_errno(ret); 2250dcd0538fSMark Fasheh goto out; 2251dcd0538fSMark Fasheh } 2252dcd0538fSMark Fasheh 2253dcd0538fSMark Fasheh /* 2254dcd0538fSMark Fasheh * In the case that we're inserting past what the tree 2255dcd0538fSMark Fasheh * currently accounts for, ocfs2_find_path() will return for 2256dcd0538fSMark Fasheh * us the rightmost tree path. This is accounted for below in 2257dcd0538fSMark Fasheh * the appending code. 2258dcd0538fSMark Fasheh */ 2259dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); 2260dcd0538fSMark Fasheh if (ret) { 2261dcd0538fSMark Fasheh mlog_errno(ret); 2262dcd0538fSMark Fasheh goto out; 2263dcd0538fSMark Fasheh } 2264dcd0538fSMark Fasheh 2265dcd0538fSMark Fasheh el = path_leaf_el(path); 2266dcd0538fSMark Fasheh 2267dcd0538fSMark Fasheh /* 2268dcd0538fSMark Fasheh * Now that we have the path, there's two things we want to determine: 2269dcd0538fSMark Fasheh * 1) Contiguousness (also set contig_index if this is so) 2270dcd0538fSMark Fasheh * 2271dcd0538fSMark Fasheh * 2) Are we doing an append? We can trivially break this up 2272dcd0538fSMark Fasheh * into two types of appends: simple record append, or a 2273dcd0538fSMark Fasheh * rotate inside the tail leaf. 2274dcd0538fSMark Fasheh */ 2275dcd0538fSMark Fasheh ocfs2_figure_contig_type(inode, insert, el, insert_rec); 2276dcd0538fSMark Fasheh 2277dcd0538fSMark Fasheh /* 2278dcd0538fSMark Fasheh * The insert code isn't quite ready to deal with all cases of 2279dcd0538fSMark Fasheh * left contiguousness. Specifically, if it's an insert into 2280dcd0538fSMark Fasheh * the 1st record in a leaf, it will require the adjustment of 2281e48edee2SMark Fasheh * cluster count on the last record of the path directly to it's 2282dcd0538fSMark Fasheh * left. For now, just catch that case and fool the layers 2283dcd0538fSMark Fasheh * above us. This works just fine for tree_depth == 0, which 2284dcd0538fSMark Fasheh * is why we allow that above. 2285dcd0538fSMark Fasheh */ 2286dcd0538fSMark Fasheh if (insert->ins_contig == CONTIG_LEFT && 2287dcd0538fSMark Fasheh insert->ins_contig_index == 0) 2288dcd0538fSMark Fasheh insert->ins_contig = CONTIG_NONE; 2289dcd0538fSMark Fasheh 2290dcd0538fSMark Fasheh /* 2291dcd0538fSMark Fasheh * Ok, so we can simply compare against last_eb to figure out 2292dcd0538fSMark Fasheh * whether the path doesn't exist. This will only happen in 2293dcd0538fSMark Fasheh * the case that we're doing a tail append, so maybe we can 2294dcd0538fSMark Fasheh * take advantage of that information somehow. 2295dcd0538fSMark Fasheh */ 2296dcd0538fSMark Fasheh if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { 2297dcd0538fSMark Fasheh /* 2298dcd0538fSMark Fasheh * Ok, ocfs2_find_path() returned us the rightmost 2299dcd0538fSMark Fasheh * tree path. This might be an appending insert. There are 2300dcd0538fSMark Fasheh * two cases: 2301dcd0538fSMark Fasheh * 1) We're doing a true append at the tail: 2302dcd0538fSMark Fasheh * -This might even be off the end of the leaf 2303dcd0538fSMark Fasheh * 2) We're "appending" by rotating in the tail 2304dcd0538fSMark Fasheh */ 2305dcd0538fSMark Fasheh ocfs2_figure_appending_type(insert, el, insert_rec); 2306dcd0538fSMark Fasheh } 2307dcd0538fSMark Fasheh 2308dcd0538fSMark Fasheh out: 2309dcd0538fSMark Fasheh ocfs2_free_path(path); 2310dcd0538fSMark Fasheh 2311dcd0538fSMark Fasheh if (ret == 0) 2312dcd0538fSMark Fasheh *last_eb_bh = bh; 2313dcd0538fSMark Fasheh else 2314dcd0538fSMark Fasheh brelse(bh); 2315dcd0538fSMark Fasheh return ret; 2316dcd0538fSMark Fasheh } 2317dcd0538fSMark Fasheh 2318dcd0538fSMark Fasheh /* 2319dcd0538fSMark Fasheh * Insert an extent into an inode btree. 2320dcd0538fSMark Fasheh * 2321dcd0538fSMark Fasheh * The caller needs to update fe->i_clusters 2322dcd0538fSMark Fasheh */ 2323ccd979bdSMark Fasheh int ocfs2_insert_extent(struct ocfs2_super *osb, 23241fabe148SMark Fasheh handle_t *handle, 2325ccd979bdSMark Fasheh struct inode *inode, 2326ccd979bdSMark Fasheh struct buffer_head *fe_bh, 2327dcd0538fSMark Fasheh u32 cpos, 2328ccd979bdSMark Fasheh u64 start_blk, 2329ccd979bdSMark Fasheh u32 new_clusters, 2330ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac) 2331ccd979bdSMark Fasheh { 2332dcd0538fSMark Fasheh int status, shift; 2333ccd979bdSMark Fasheh struct buffer_head *last_eb_bh = NULL; 2334ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 2335dcd0538fSMark Fasheh struct ocfs2_insert_type insert = {0, }; 2336dcd0538fSMark Fasheh struct ocfs2_extent_rec rec; 2337ccd979bdSMark Fasheh 2338dcd0538fSMark Fasheh mlog(0, "add %u clusters at position %u to inode %llu\n", 2339dcd0538fSMark Fasheh new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 2340ccd979bdSMark Fasheh 2341dcd0538fSMark Fasheh mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 2342dcd0538fSMark Fasheh (OCFS2_I(inode)->ip_clusters != cpos), 2343dcd0538fSMark Fasheh "Device %s, asking for sparse allocation: inode %llu, " 2344dcd0538fSMark Fasheh "cpos %u, clusters %u\n", 2345dcd0538fSMark Fasheh osb->dev_str, 2346dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, 2347dcd0538fSMark Fasheh OCFS2_I(inode)->ip_clusters); 2348ccd979bdSMark Fasheh 2349e48edee2SMark Fasheh memset(&rec, 0, sizeof(rec)); 2350dcd0538fSMark Fasheh rec.e_cpos = cpu_to_le32(cpos); 2351dcd0538fSMark Fasheh rec.e_blkno = cpu_to_le64(start_blk); 2352e48edee2SMark Fasheh rec.e_leaf_clusters = cpu_to_le16(new_clusters); 2353ccd979bdSMark Fasheh 2354dcd0538fSMark Fasheh status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 2355dcd0538fSMark Fasheh &insert); 2356ccd979bdSMark Fasheh if (status < 0) { 2357dcd0538fSMark Fasheh mlog_errno(status); 2358ccd979bdSMark Fasheh goto bail; 2359ccd979bdSMark Fasheh } 2360ccd979bdSMark Fasheh 2361dcd0538fSMark Fasheh mlog(0, "Insert.appending: %u, Insert.Contig: %u, " 2362dcd0538fSMark Fasheh "Insert.contig_index: %d, Insert.free_records: %d, " 2363dcd0538fSMark Fasheh "Insert.tree_depth: %d\n", 2364dcd0538fSMark Fasheh insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 2365dcd0538fSMark Fasheh insert.ins_free_records, insert.ins_tree_depth); 2366dcd0538fSMark Fasheh 2367dcd0538fSMark Fasheh /* 2368dcd0538fSMark Fasheh * Avoid growing the tree unless we're out of records and the 2369dcd0538fSMark Fasheh * insert type requres one. 2370dcd0538fSMark Fasheh */ 2371dcd0538fSMark Fasheh if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) 2372ccd979bdSMark Fasheh goto out_add; 2373ccd979bdSMark Fasheh 2374ccd979bdSMark Fasheh shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); 2375ccd979bdSMark Fasheh if (shift < 0) { 2376ccd979bdSMark Fasheh status = shift; 2377ccd979bdSMark Fasheh mlog_errno(status); 2378ccd979bdSMark Fasheh goto bail; 2379ccd979bdSMark Fasheh } 2380ccd979bdSMark Fasheh 2381ccd979bdSMark Fasheh /* We traveled all the way to the bottom of the allocation tree 2382ccd979bdSMark Fasheh * and didn't find room for any more extents - we need to add 2383ccd979bdSMark Fasheh * another tree level */ 2384ccd979bdSMark Fasheh if (shift) { 2385ccd979bdSMark Fasheh BUG_ON(bh); 2386dcd0538fSMark Fasheh mlog(0, "need to shift tree depth " 2387dcd0538fSMark Fasheh "(current = %d)\n", insert.ins_tree_depth); 2388ccd979bdSMark Fasheh 2389ccd979bdSMark Fasheh /* ocfs2_shift_tree_depth will return us a buffer with 2390ccd979bdSMark Fasheh * the new extent block (so we can pass that to 2391ccd979bdSMark Fasheh * ocfs2_add_branch). */ 2392ccd979bdSMark Fasheh status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh, 2393ccd979bdSMark Fasheh meta_ac, &bh); 2394ccd979bdSMark Fasheh if (status < 0) { 2395ccd979bdSMark Fasheh mlog_errno(status); 2396ccd979bdSMark Fasheh goto bail; 2397ccd979bdSMark Fasheh } 2398dcd0538fSMark Fasheh insert.ins_tree_depth++; 2399ccd979bdSMark Fasheh /* Special case: we have room now if we shifted from 2400ccd979bdSMark Fasheh * tree_depth 0 */ 2401dcd0538fSMark Fasheh if (insert.ins_tree_depth == 1) 2402ccd979bdSMark Fasheh goto out_add; 2403ccd979bdSMark Fasheh } 2404ccd979bdSMark Fasheh 2405ccd979bdSMark Fasheh /* call ocfs2_add_branch to add the final part of the tree with 2406ccd979bdSMark Fasheh * the new data. */ 2407dcd0538fSMark Fasheh mlog(0, "add branch. bh = %p\n", bh); 2408ccd979bdSMark Fasheh status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, 2409ccd979bdSMark Fasheh meta_ac); 2410ccd979bdSMark Fasheh if (status < 0) { 2411ccd979bdSMark Fasheh mlog_errno(status); 2412ccd979bdSMark Fasheh goto bail; 2413ccd979bdSMark Fasheh } 2414ccd979bdSMark Fasheh 2415ccd979bdSMark Fasheh out_add: 2416dcd0538fSMark Fasheh /* Finally, we can add clusters. This might rotate the tree for us. */ 2417dcd0538fSMark Fasheh status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); 2418ccd979bdSMark Fasheh if (status < 0) 2419ccd979bdSMark Fasheh mlog_errno(status); 2420ccd979bdSMark Fasheh 2421ccd979bdSMark Fasheh bail: 2422ccd979bdSMark Fasheh if (bh) 2423ccd979bdSMark Fasheh brelse(bh); 2424ccd979bdSMark Fasheh 2425ccd979bdSMark Fasheh if (last_eb_bh) 2426ccd979bdSMark Fasheh brelse(last_eb_bh); 2427ccd979bdSMark Fasheh 2428ccd979bdSMark Fasheh mlog_exit(status); 2429ccd979bdSMark Fasheh return status; 2430ccd979bdSMark Fasheh } 2431ccd979bdSMark Fasheh 2432ccd979bdSMark Fasheh static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 2433ccd979bdSMark Fasheh { 2434ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2435ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2436ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2437ccd979bdSMark Fasheh 2438ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2439ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2440ccd979bdSMark Fasheh 2441ccd979bdSMark Fasheh mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), 2442ccd979bdSMark Fasheh "slot %d, invalid truncate log parameters: used = " 2443ccd979bdSMark Fasheh "%u, count = %u\n", osb->slot_num, 2444ccd979bdSMark Fasheh le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); 2445ccd979bdSMark Fasheh return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); 2446ccd979bdSMark Fasheh } 2447ccd979bdSMark Fasheh 2448ccd979bdSMark Fasheh static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, 2449ccd979bdSMark Fasheh unsigned int new_start) 2450ccd979bdSMark Fasheh { 2451ccd979bdSMark Fasheh unsigned int tail_index; 2452ccd979bdSMark Fasheh unsigned int current_tail; 2453ccd979bdSMark Fasheh 2454ccd979bdSMark Fasheh /* No records, nothing to coalesce */ 2455ccd979bdSMark Fasheh if (!le16_to_cpu(tl->tl_used)) 2456ccd979bdSMark Fasheh return 0; 2457ccd979bdSMark Fasheh 2458ccd979bdSMark Fasheh tail_index = le16_to_cpu(tl->tl_used) - 1; 2459ccd979bdSMark Fasheh current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); 2460ccd979bdSMark Fasheh current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); 2461ccd979bdSMark Fasheh 2462ccd979bdSMark Fasheh return current_tail == new_start; 2463ccd979bdSMark Fasheh } 2464ccd979bdSMark Fasheh 2465ccd979bdSMark Fasheh static int ocfs2_truncate_log_append(struct ocfs2_super *osb, 24661fabe148SMark Fasheh handle_t *handle, 2467ccd979bdSMark Fasheh u64 start_blk, 2468ccd979bdSMark Fasheh unsigned int num_clusters) 2469ccd979bdSMark Fasheh { 2470ccd979bdSMark Fasheh int status, index; 2471ccd979bdSMark Fasheh unsigned int start_cluster, tl_count; 2472ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2473ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2474ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2475ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2476ccd979bdSMark Fasheh 2477b0697053SMark Fasheh mlog_entry("start_blk = %llu, num_clusters = %u\n", 2478b0697053SMark Fasheh (unsigned long long)start_blk, num_clusters); 2479ccd979bdSMark Fasheh 24801b1dcc1bSJes Sorensen BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 2481ccd979bdSMark Fasheh 2482ccd979bdSMark Fasheh start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 2483ccd979bdSMark Fasheh 2484ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2485ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2486ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(di)) { 2487ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); 2488ccd979bdSMark Fasheh status = -EIO; 2489ccd979bdSMark Fasheh goto bail; 2490ccd979bdSMark Fasheh } 2491ccd979bdSMark Fasheh 2492ccd979bdSMark Fasheh tl_count = le16_to_cpu(tl->tl_count); 2493ccd979bdSMark Fasheh mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || 2494ccd979bdSMark Fasheh tl_count == 0, 2495b0697053SMark Fasheh "Truncate record count on #%llu invalid " 2496b0697053SMark Fasheh "wanted %u, actual %u\n", 2497b0697053SMark Fasheh (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, 2498ccd979bdSMark Fasheh ocfs2_truncate_recs_per_inode(osb->sb), 2499ccd979bdSMark Fasheh le16_to_cpu(tl->tl_count)); 2500ccd979bdSMark Fasheh 2501ccd979bdSMark Fasheh /* Caller should have known to flush before calling us. */ 2502ccd979bdSMark Fasheh index = le16_to_cpu(tl->tl_used); 2503ccd979bdSMark Fasheh if (index >= tl_count) { 2504ccd979bdSMark Fasheh status = -ENOSPC; 2505ccd979bdSMark Fasheh mlog_errno(status); 2506ccd979bdSMark Fasheh goto bail; 2507ccd979bdSMark Fasheh } 2508ccd979bdSMark Fasheh 2509ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, tl_inode, tl_bh, 2510ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 2511ccd979bdSMark Fasheh if (status < 0) { 2512ccd979bdSMark Fasheh mlog_errno(status); 2513ccd979bdSMark Fasheh goto bail; 2514ccd979bdSMark Fasheh } 2515ccd979bdSMark Fasheh 2516ccd979bdSMark Fasheh mlog(0, "Log truncate of %u clusters starting at cluster %u to " 2517b0697053SMark Fasheh "%llu (index = %d)\n", num_clusters, start_cluster, 2518b0697053SMark Fasheh (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index); 2519ccd979bdSMark Fasheh 2520ccd979bdSMark Fasheh if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { 2521ccd979bdSMark Fasheh /* 2522ccd979bdSMark Fasheh * Move index back to the record we are coalescing with. 2523ccd979bdSMark Fasheh * ocfs2_truncate_log_can_coalesce() guarantees nonzero 2524ccd979bdSMark Fasheh */ 2525ccd979bdSMark Fasheh index--; 2526ccd979bdSMark Fasheh 2527ccd979bdSMark Fasheh num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); 2528ccd979bdSMark Fasheh mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", 2529ccd979bdSMark Fasheh index, le32_to_cpu(tl->tl_recs[index].t_start), 2530ccd979bdSMark Fasheh num_clusters); 2531ccd979bdSMark Fasheh } else { 2532ccd979bdSMark Fasheh tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); 2533ccd979bdSMark Fasheh tl->tl_used = cpu_to_le16(index + 1); 2534ccd979bdSMark Fasheh } 2535ccd979bdSMark Fasheh tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 2536ccd979bdSMark Fasheh 2537ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, tl_bh); 2538ccd979bdSMark Fasheh if (status < 0) { 2539ccd979bdSMark Fasheh mlog_errno(status); 2540ccd979bdSMark Fasheh goto bail; 2541ccd979bdSMark Fasheh } 2542ccd979bdSMark Fasheh 2543ccd979bdSMark Fasheh bail: 2544ccd979bdSMark Fasheh mlog_exit(status); 2545ccd979bdSMark Fasheh return status; 2546ccd979bdSMark Fasheh } 2547ccd979bdSMark Fasheh 2548ccd979bdSMark Fasheh static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, 25491fabe148SMark Fasheh handle_t *handle, 2550ccd979bdSMark Fasheh struct inode *data_alloc_inode, 2551ccd979bdSMark Fasheh struct buffer_head *data_alloc_bh) 2552ccd979bdSMark Fasheh { 2553ccd979bdSMark Fasheh int status = 0; 2554ccd979bdSMark Fasheh int i; 2555ccd979bdSMark Fasheh unsigned int num_clusters; 2556ccd979bdSMark Fasheh u64 start_blk; 2557ccd979bdSMark Fasheh struct ocfs2_truncate_rec rec; 2558ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2559ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2560ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2561ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2562ccd979bdSMark Fasheh 2563ccd979bdSMark Fasheh mlog_entry_void(); 2564ccd979bdSMark Fasheh 2565ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2566ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2567ccd979bdSMark Fasheh i = le16_to_cpu(tl->tl_used) - 1; 2568ccd979bdSMark Fasheh while (i >= 0) { 2569ccd979bdSMark Fasheh /* Caller has given us at least enough credits to 2570ccd979bdSMark Fasheh * update the truncate log dinode */ 2571ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, tl_inode, tl_bh, 2572ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 2573ccd979bdSMark Fasheh if (status < 0) { 2574ccd979bdSMark Fasheh mlog_errno(status); 2575ccd979bdSMark Fasheh goto bail; 2576ccd979bdSMark Fasheh } 2577ccd979bdSMark Fasheh 2578ccd979bdSMark Fasheh tl->tl_used = cpu_to_le16(i); 2579ccd979bdSMark Fasheh 2580ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, tl_bh); 2581ccd979bdSMark Fasheh if (status < 0) { 2582ccd979bdSMark Fasheh mlog_errno(status); 2583ccd979bdSMark Fasheh goto bail; 2584ccd979bdSMark Fasheh } 2585ccd979bdSMark Fasheh 2586ccd979bdSMark Fasheh /* TODO: Perhaps we can calculate the bulk of the 2587ccd979bdSMark Fasheh * credits up front rather than extending like 2588ccd979bdSMark Fasheh * this. */ 2589ccd979bdSMark Fasheh status = ocfs2_extend_trans(handle, 2590ccd979bdSMark Fasheh OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); 2591ccd979bdSMark Fasheh if (status < 0) { 2592ccd979bdSMark Fasheh mlog_errno(status); 2593ccd979bdSMark Fasheh goto bail; 2594ccd979bdSMark Fasheh } 2595ccd979bdSMark Fasheh 2596ccd979bdSMark Fasheh rec = tl->tl_recs[i]; 2597ccd979bdSMark Fasheh start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 2598ccd979bdSMark Fasheh le32_to_cpu(rec.t_start)); 2599ccd979bdSMark Fasheh num_clusters = le32_to_cpu(rec.t_clusters); 2600ccd979bdSMark Fasheh 2601ccd979bdSMark Fasheh /* if start_blk is not set, we ignore the record as 2602ccd979bdSMark Fasheh * invalid. */ 2603ccd979bdSMark Fasheh if (start_blk) { 2604ccd979bdSMark Fasheh mlog(0, "free record %d, start = %u, clusters = %u\n", 2605ccd979bdSMark Fasheh i, le32_to_cpu(rec.t_start), num_clusters); 2606ccd979bdSMark Fasheh 2607ccd979bdSMark Fasheh status = ocfs2_free_clusters(handle, data_alloc_inode, 2608ccd979bdSMark Fasheh data_alloc_bh, start_blk, 2609ccd979bdSMark Fasheh num_clusters); 2610ccd979bdSMark Fasheh if (status < 0) { 2611ccd979bdSMark Fasheh mlog_errno(status); 2612ccd979bdSMark Fasheh goto bail; 2613ccd979bdSMark Fasheh } 2614ccd979bdSMark Fasheh } 2615ccd979bdSMark Fasheh i--; 2616ccd979bdSMark Fasheh } 2617ccd979bdSMark Fasheh 2618ccd979bdSMark Fasheh bail: 2619ccd979bdSMark Fasheh mlog_exit(status); 2620ccd979bdSMark Fasheh return status; 2621ccd979bdSMark Fasheh } 2622ccd979bdSMark Fasheh 26231b1dcc1bSJes Sorensen /* Expects you to already be holding tl_inode->i_mutex */ 2624ccd979bdSMark Fasheh static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 2625ccd979bdSMark Fasheh { 2626ccd979bdSMark Fasheh int status; 2627ccd979bdSMark Fasheh unsigned int num_to_flush; 26281fabe148SMark Fasheh handle_t *handle; 2629ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2630ccd979bdSMark Fasheh struct inode *data_alloc_inode = NULL; 2631ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2632ccd979bdSMark Fasheh struct buffer_head *data_alloc_bh = NULL; 2633ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2634ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2635ccd979bdSMark Fasheh 2636ccd979bdSMark Fasheh mlog_entry_void(); 2637ccd979bdSMark Fasheh 26381b1dcc1bSJes Sorensen BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 2639ccd979bdSMark Fasheh 2640ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2641ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2642ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(di)) { 2643ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); 2644ccd979bdSMark Fasheh status = -EIO; 2645e08dc8b9SMark Fasheh goto out; 2646ccd979bdSMark Fasheh } 2647ccd979bdSMark Fasheh 2648ccd979bdSMark Fasheh num_to_flush = le16_to_cpu(tl->tl_used); 2649b0697053SMark Fasheh mlog(0, "Flush %u records from truncate log #%llu\n", 2650b0697053SMark Fasheh num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 2651ccd979bdSMark Fasheh if (!num_to_flush) { 2652ccd979bdSMark Fasheh status = 0; 2653e08dc8b9SMark Fasheh goto out; 2654ccd979bdSMark Fasheh } 2655ccd979bdSMark Fasheh 2656ccd979bdSMark Fasheh data_alloc_inode = ocfs2_get_system_file_inode(osb, 2657ccd979bdSMark Fasheh GLOBAL_BITMAP_SYSTEM_INODE, 2658ccd979bdSMark Fasheh OCFS2_INVALID_SLOT); 2659ccd979bdSMark Fasheh if (!data_alloc_inode) { 2660ccd979bdSMark Fasheh status = -EINVAL; 2661ccd979bdSMark Fasheh mlog(ML_ERROR, "Could not get bitmap inode!\n"); 2662e08dc8b9SMark Fasheh goto out; 2663ccd979bdSMark Fasheh } 2664ccd979bdSMark Fasheh 2665e08dc8b9SMark Fasheh mutex_lock(&data_alloc_inode->i_mutex); 2666e08dc8b9SMark Fasheh 26674bcec184SMark Fasheh status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1); 2668ccd979bdSMark Fasheh if (status < 0) { 2669ccd979bdSMark Fasheh mlog_errno(status); 2670e08dc8b9SMark Fasheh goto out_mutex; 2671ccd979bdSMark Fasheh } 2672ccd979bdSMark Fasheh 267365eff9ccSMark Fasheh handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 2674ccd979bdSMark Fasheh if (IS_ERR(handle)) { 2675ccd979bdSMark Fasheh status = PTR_ERR(handle); 2676ccd979bdSMark Fasheh mlog_errno(status); 2677e08dc8b9SMark Fasheh goto out_unlock; 2678ccd979bdSMark Fasheh } 2679ccd979bdSMark Fasheh 2680ccd979bdSMark Fasheh status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, 2681ccd979bdSMark Fasheh data_alloc_bh); 2682e08dc8b9SMark Fasheh if (status < 0) 2683ccd979bdSMark Fasheh mlog_errno(status); 2684ccd979bdSMark Fasheh 268502dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 2686ccd979bdSMark Fasheh 2687e08dc8b9SMark Fasheh out_unlock: 2688e08dc8b9SMark Fasheh brelse(data_alloc_bh); 2689e08dc8b9SMark Fasheh ocfs2_meta_unlock(data_alloc_inode, 1); 2690e08dc8b9SMark Fasheh 2691e08dc8b9SMark Fasheh out_mutex: 2692e08dc8b9SMark Fasheh mutex_unlock(&data_alloc_inode->i_mutex); 2693ccd979bdSMark Fasheh iput(data_alloc_inode); 2694ccd979bdSMark Fasheh 2695e08dc8b9SMark Fasheh out: 2696ccd979bdSMark Fasheh mlog_exit(status); 2697ccd979bdSMark Fasheh return status; 2698ccd979bdSMark Fasheh } 2699ccd979bdSMark Fasheh 2700ccd979bdSMark Fasheh int ocfs2_flush_truncate_log(struct ocfs2_super *osb) 2701ccd979bdSMark Fasheh { 2702ccd979bdSMark Fasheh int status; 2703ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2704ccd979bdSMark Fasheh 27051b1dcc1bSJes Sorensen mutex_lock(&tl_inode->i_mutex); 2706ccd979bdSMark Fasheh status = __ocfs2_flush_truncate_log(osb); 27071b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 2708ccd979bdSMark Fasheh 2709ccd979bdSMark Fasheh return status; 2710ccd979bdSMark Fasheh } 2711ccd979bdSMark Fasheh 2712c4028958SDavid Howells static void ocfs2_truncate_log_worker(struct work_struct *work) 2713ccd979bdSMark Fasheh { 2714ccd979bdSMark Fasheh int status; 2715c4028958SDavid Howells struct ocfs2_super *osb = 2716c4028958SDavid Howells container_of(work, struct ocfs2_super, 2717c4028958SDavid Howells osb_truncate_log_wq.work); 2718ccd979bdSMark Fasheh 2719ccd979bdSMark Fasheh mlog_entry_void(); 2720ccd979bdSMark Fasheh 2721ccd979bdSMark Fasheh status = ocfs2_flush_truncate_log(osb); 2722ccd979bdSMark Fasheh if (status < 0) 2723ccd979bdSMark Fasheh mlog_errno(status); 2724ccd979bdSMark Fasheh 2725ccd979bdSMark Fasheh mlog_exit(status); 2726ccd979bdSMark Fasheh } 2727ccd979bdSMark Fasheh 2728ccd979bdSMark Fasheh #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) 2729ccd979bdSMark Fasheh void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, 2730ccd979bdSMark Fasheh int cancel) 2731ccd979bdSMark Fasheh { 2732ccd979bdSMark Fasheh if (osb->osb_tl_inode) { 2733ccd979bdSMark Fasheh /* We want to push off log flushes while truncates are 2734ccd979bdSMark Fasheh * still running. */ 2735ccd979bdSMark Fasheh if (cancel) 2736ccd979bdSMark Fasheh cancel_delayed_work(&osb->osb_truncate_log_wq); 2737ccd979bdSMark Fasheh 2738ccd979bdSMark Fasheh queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 2739ccd979bdSMark Fasheh OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 2740ccd979bdSMark Fasheh } 2741ccd979bdSMark Fasheh } 2742ccd979bdSMark Fasheh 2743ccd979bdSMark Fasheh static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, 2744ccd979bdSMark Fasheh int slot_num, 2745ccd979bdSMark Fasheh struct inode **tl_inode, 2746ccd979bdSMark Fasheh struct buffer_head **tl_bh) 2747ccd979bdSMark Fasheh { 2748ccd979bdSMark Fasheh int status; 2749ccd979bdSMark Fasheh struct inode *inode = NULL; 2750ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 2751ccd979bdSMark Fasheh 2752ccd979bdSMark Fasheh inode = ocfs2_get_system_file_inode(osb, 2753ccd979bdSMark Fasheh TRUNCATE_LOG_SYSTEM_INODE, 2754ccd979bdSMark Fasheh slot_num); 2755ccd979bdSMark Fasheh if (!inode) { 2756ccd979bdSMark Fasheh status = -EINVAL; 2757ccd979bdSMark Fasheh mlog(ML_ERROR, "Could not get load truncate log inode!\n"); 2758ccd979bdSMark Fasheh goto bail; 2759ccd979bdSMark Fasheh } 2760ccd979bdSMark Fasheh 2761ccd979bdSMark Fasheh status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 2762ccd979bdSMark Fasheh OCFS2_BH_CACHED, inode); 2763ccd979bdSMark Fasheh if (status < 0) { 2764ccd979bdSMark Fasheh iput(inode); 2765ccd979bdSMark Fasheh mlog_errno(status); 2766ccd979bdSMark Fasheh goto bail; 2767ccd979bdSMark Fasheh } 2768ccd979bdSMark Fasheh 2769ccd979bdSMark Fasheh *tl_inode = inode; 2770ccd979bdSMark Fasheh *tl_bh = bh; 2771ccd979bdSMark Fasheh bail: 2772ccd979bdSMark Fasheh mlog_exit(status); 2773ccd979bdSMark Fasheh return status; 2774ccd979bdSMark Fasheh } 2775ccd979bdSMark Fasheh 2776ccd979bdSMark Fasheh /* called during the 1st stage of node recovery. we stamp a clean 2777ccd979bdSMark Fasheh * truncate log and pass back a copy for processing later. if the 2778ccd979bdSMark Fasheh * truncate log does not require processing, a *tl_copy is set to 2779ccd979bdSMark Fasheh * NULL. */ 2780ccd979bdSMark Fasheh int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, 2781ccd979bdSMark Fasheh int slot_num, 2782ccd979bdSMark Fasheh struct ocfs2_dinode **tl_copy) 2783ccd979bdSMark Fasheh { 2784ccd979bdSMark Fasheh int status; 2785ccd979bdSMark Fasheh struct inode *tl_inode = NULL; 2786ccd979bdSMark Fasheh struct buffer_head *tl_bh = NULL; 2787ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2788ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2789ccd979bdSMark Fasheh 2790ccd979bdSMark Fasheh *tl_copy = NULL; 2791ccd979bdSMark Fasheh 2792ccd979bdSMark Fasheh mlog(0, "recover truncate log from slot %d\n", slot_num); 2793ccd979bdSMark Fasheh 2794ccd979bdSMark Fasheh status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); 2795ccd979bdSMark Fasheh if (status < 0) { 2796ccd979bdSMark Fasheh mlog_errno(status); 2797ccd979bdSMark Fasheh goto bail; 2798ccd979bdSMark Fasheh } 2799ccd979bdSMark Fasheh 2800ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2801ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2802ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(di)) { 2803ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); 2804ccd979bdSMark Fasheh status = -EIO; 2805ccd979bdSMark Fasheh goto bail; 2806ccd979bdSMark Fasheh } 2807ccd979bdSMark Fasheh 2808ccd979bdSMark Fasheh if (le16_to_cpu(tl->tl_used)) { 2809ccd979bdSMark Fasheh mlog(0, "We'll have %u logs to recover\n", 2810ccd979bdSMark Fasheh le16_to_cpu(tl->tl_used)); 2811ccd979bdSMark Fasheh 2812ccd979bdSMark Fasheh *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); 2813ccd979bdSMark Fasheh if (!(*tl_copy)) { 2814ccd979bdSMark Fasheh status = -ENOMEM; 2815ccd979bdSMark Fasheh mlog_errno(status); 2816ccd979bdSMark Fasheh goto bail; 2817ccd979bdSMark Fasheh } 2818ccd979bdSMark Fasheh 2819ccd979bdSMark Fasheh /* Assuming the write-out below goes well, this copy 2820ccd979bdSMark Fasheh * will be passed back to recovery for processing. */ 2821ccd979bdSMark Fasheh memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); 2822ccd979bdSMark Fasheh 2823ccd979bdSMark Fasheh /* All we need to do to clear the truncate log is set 2824ccd979bdSMark Fasheh * tl_used. */ 2825ccd979bdSMark Fasheh tl->tl_used = 0; 2826ccd979bdSMark Fasheh 2827ccd979bdSMark Fasheh status = ocfs2_write_block(osb, tl_bh, tl_inode); 2828ccd979bdSMark Fasheh if (status < 0) { 2829ccd979bdSMark Fasheh mlog_errno(status); 2830ccd979bdSMark Fasheh goto bail; 2831ccd979bdSMark Fasheh } 2832ccd979bdSMark Fasheh } 2833ccd979bdSMark Fasheh 2834ccd979bdSMark Fasheh bail: 2835ccd979bdSMark Fasheh if (tl_inode) 2836ccd979bdSMark Fasheh iput(tl_inode); 2837ccd979bdSMark Fasheh if (tl_bh) 2838ccd979bdSMark Fasheh brelse(tl_bh); 2839ccd979bdSMark Fasheh 2840ccd979bdSMark Fasheh if (status < 0 && (*tl_copy)) { 2841ccd979bdSMark Fasheh kfree(*tl_copy); 2842ccd979bdSMark Fasheh *tl_copy = NULL; 2843ccd979bdSMark Fasheh } 2844ccd979bdSMark Fasheh 2845ccd979bdSMark Fasheh mlog_exit(status); 2846ccd979bdSMark Fasheh return status; 2847ccd979bdSMark Fasheh } 2848ccd979bdSMark Fasheh 2849ccd979bdSMark Fasheh int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 2850ccd979bdSMark Fasheh struct ocfs2_dinode *tl_copy) 2851ccd979bdSMark Fasheh { 2852ccd979bdSMark Fasheh int status = 0; 2853ccd979bdSMark Fasheh int i; 2854ccd979bdSMark Fasheh unsigned int clusters, num_recs, start_cluster; 2855ccd979bdSMark Fasheh u64 start_blk; 28561fabe148SMark Fasheh handle_t *handle; 2857ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2858ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2859ccd979bdSMark Fasheh 2860ccd979bdSMark Fasheh mlog_entry_void(); 2861ccd979bdSMark Fasheh 2862ccd979bdSMark Fasheh if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { 2863ccd979bdSMark Fasheh mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); 2864ccd979bdSMark Fasheh return -EINVAL; 2865ccd979bdSMark Fasheh } 2866ccd979bdSMark Fasheh 2867ccd979bdSMark Fasheh tl = &tl_copy->id2.i_dealloc; 2868ccd979bdSMark Fasheh num_recs = le16_to_cpu(tl->tl_used); 2869b0697053SMark Fasheh mlog(0, "cleanup %u records from %llu\n", num_recs, 2870b0697053SMark Fasheh (unsigned long long)tl_copy->i_blkno); 2871ccd979bdSMark Fasheh 28721b1dcc1bSJes Sorensen mutex_lock(&tl_inode->i_mutex); 2873ccd979bdSMark Fasheh for(i = 0; i < num_recs; i++) { 2874ccd979bdSMark Fasheh if (ocfs2_truncate_log_needs_flush(osb)) { 2875ccd979bdSMark Fasheh status = __ocfs2_flush_truncate_log(osb); 2876ccd979bdSMark Fasheh if (status < 0) { 2877ccd979bdSMark Fasheh mlog_errno(status); 2878ccd979bdSMark Fasheh goto bail_up; 2879ccd979bdSMark Fasheh } 2880ccd979bdSMark Fasheh } 2881ccd979bdSMark Fasheh 288265eff9ccSMark Fasheh handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 2883ccd979bdSMark Fasheh if (IS_ERR(handle)) { 2884ccd979bdSMark Fasheh status = PTR_ERR(handle); 2885ccd979bdSMark Fasheh mlog_errno(status); 2886ccd979bdSMark Fasheh goto bail_up; 2887ccd979bdSMark Fasheh } 2888ccd979bdSMark Fasheh 2889ccd979bdSMark Fasheh clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); 2890ccd979bdSMark Fasheh start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); 2891ccd979bdSMark Fasheh start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); 2892ccd979bdSMark Fasheh 2893ccd979bdSMark Fasheh status = ocfs2_truncate_log_append(osb, handle, 2894ccd979bdSMark Fasheh start_blk, clusters); 289502dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 2896ccd979bdSMark Fasheh if (status < 0) { 2897ccd979bdSMark Fasheh mlog_errno(status); 2898ccd979bdSMark Fasheh goto bail_up; 2899ccd979bdSMark Fasheh } 2900ccd979bdSMark Fasheh } 2901ccd979bdSMark Fasheh 2902ccd979bdSMark Fasheh bail_up: 29031b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 2904ccd979bdSMark Fasheh 2905ccd979bdSMark Fasheh mlog_exit(status); 2906ccd979bdSMark Fasheh return status; 2907ccd979bdSMark Fasheh } 2908ccd979bdSMark Fasheh 2909ccd979bdSMark Fasheh void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) 2910ccd979bdSMark Fasheh { 2911ccd979bdSMark Fasheh int status; 2912ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2913ccd979bdSMark Fasheh 2914ccd979bdSMark Fasheh mlog_entry_void(); 2915ccd979bdSMark Fasheh 2916ccd979bdSMark Fasheh if (tl_inode) { 2917ccd979bdSMark Fasheh cancel_delayed_work(&osb->osb_truncate_log_wq); 2918ccd979bdSMark Fasheh flush_workqueue(ocfs2_wq); 2919ccd979bdSMark Fasheh 2920ccd979bdSMark Fasheh status = ocfs2_flush_truncate_log(osb); 2921ccd979bdSMark Fasheh if (status < 0) 2922ccd979bdSMark Fasheh mlog_errno(status); 2923ccd979bdSMark Fasheh 2924ccd979bdSMark Fasheh brelse(osb->osb_tl_bh); 2925ccd979bdSMark Fasheh iput(osb->osb_tl_inode); 2926ccd979bdSMark Fasheh } 2927ccd979bdSMark Fasheh 2928ccd979bdSMark Fasheh mlog_exit_void(); 2929ccd979bdSMark Fasheh } 2930ccd979bdSMark Fasheh 2931ccd979bdSMark Fasheh int ocfs2_truncate_log_init(struct ocfs2_super *osb) 2932ccd979bdSMark Fasheh { 2933ccd979bdSMark Fasheh int status; 2934ccd979bdSMark Fasheh struct inode *tl_inode = NULL; 2935ccd979bdSMark Fasheh struct buffer_head *tl_bh = NULL; 2936ccd979bdSMark Fasheh 2937ccd979bdSMark Fasheh mlog_entry_void(); 2938ccd979bdSMark Fasheh 2939ccd979bdSMark Fasheh status = ocfs2_get_truncate_log_info(osb, 2940ccd979bdSMark Fasheh osb->slot_num, 2941ccd979bdSMark Fasheh &tl_inode, 2942ccd979bdSMark Fasheh &tl_bh); 2943ccd979bdSMark Fasheh if (status < 0) 2944ccd979bdSMark Fasheh mlog_errno(status); 2945ccd979bdSMark Fasheh 2946ccd979bdSMark Fasheh /* ocfs2_truncate_log_shutdown keys on the existence of 2947ccd979bdSMark Fasheh * osb->osb_tl_inode so we don't set any of the osb variables 2948ccd979bdSMark Fasheh * until we're sure all is well. */ 2949c4028958SDavid Howells INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, 2950c4028958SDavid Howells ocfs2_truncate_log_worker); 2951ccd979bdSMark Fasheh osb->osb_tl_bh = tl_bh; 2952ccd979bdSMark Fasheh osb->osb_tl_inode = tl_inode; 2953ccd979bdSMark Fasheh 2954ccd979bdSMark Fasheh mlog_exit(status); 2955ccd979bdSMark Fasheh return status; 2956ccd979bdSMark Fasheh } 2957ccd979bdSMark Fasheh 2958ccd979bdSMark Fasheh /* This function will figure out whether the currently last extent 2959ccd979bdSMark Fasheh * block will be deleted, and if it will, what the new last extent 2960ccd979bdSMark Fasheh * block will be so we can update his h_next_leaf_blk field, as well 2961ccd979bdSMark Fasheh * as the dinodes i_last_eb_blk */ 2962dcd0538fSMark Fasheh static int ocfs2_find_new_last_ext_blk(struct inode *inode, 29633a0782d0SMark Fasheh unsigned int clusters_to_del, 2964dcd0538fSMark Fasheh struct ocfs2_path *path, 2965ccd979bdSMark Fasheh struct buffer_head **new_last_eb) 2966ccd979bdSMark Fasheh { 29673a0782d0SMark Fasheh int next_free, ret = 0; 2968dcd0538fSMark Fasheh u32 cpos; 29693a0782d0SMark Fasheh struct ocfs2_extent_rec *rec; 2970ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 2971ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 2972ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 2973ccd979bdSMark Fasheh 2974ccd979bdSMark Fasheh *new_last_eb = NULL; 2975ccd979bdSMark Fasheh 2976ccd979bdSMark Fasheh /* we have no tree, so of course, no last_eb. */ 2977dcd0538fSMark Fasheh if (!path->p_tree_depth) 2978dcd0538fSMark Fasheh goto out; 2979ccd979bdSMark Fasheh 2980ccd979bdSMark Fasheh /* trunc to zero special case - this makes tree_depth = 0 2981ccd979bdSMark Fasheh * regardless of what it is. */ 29823a0782d0SMark Fasheh if (OCFS2_I(inode)->ip_clusters == clusters_to_del) 2983dcd0538fSMark Fasheh goto out; 2984ccd979bdSMark Fasheh 2985dcd0538fSMark Fasheh el = path_leaf_el(path); 2986ccd979bdSMark Fasheh BUG_ON(!el->l_next_free_rec); 2987ccd979bdSMark Fasheh 29883a0782d0SMark Fasheh /* 29893a0782d0SMark Fasheh * Make sure that this extent list will actually be empty 29903a0782d0SMark Fasheh * after we clear away the data. We can shortcut out if 29913a0782d0SMark Fasheh * there's more than one non-empty extent in the 29923a0782d0SMark Fasheh * list. Otherwise, a check of the remaining extent is 29933a0782d0SMark Fasheh * necessary. 29943a0782d0SMark Fasheh */ 29953a0782d0SMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 29963a0782d0SMark Fasheh rec = NULL; 2997dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) { 29983a0782d0SMark Fasheh if (next_free > 2) 2999dcd0538fSMark Fasheh goto out; 30003a0782d0SMark Fasheh 30013a0782d0SMark Fasheh /* We may have a valid extent in index 1, check it. */ 30023a0782d0SMark Fasheh if (next_free == 2) 30033a0782d0SMark Fasheh rec = &el->l_recs[1]; 30043a0782d0SMark Fasheh 30053a0782d0SMark Fasheh /* 30063a0782d0SMark Fasheh * Fall through - no more nonempty extents, so we want 30073a0782d0SMark Fasheh * to delete this leaf. 30083a0782d0SMark Fasheh */ 30093a0782d0SMark Fasheh } else { 30103a0782d0SMark Fasheh if (next_free > 1) 3011dcd0538fSMark Fasheh goto out; 3012ccd979bdSMark Fasheh 30133a0782d0SMark Fasheh rec = &el->l_recs[0]; 30143a0782d0SMark Fasheh } 30153a0782d0SMark Fasheh 30163a0782d0SMark Fasheh if (rec) { 30173a0782d0SMark Fasheh /* 30183a0782d0SMark Fasheh * Check it we'll only be trimming off the end of this 30193a0782d0SMark Fasheh * cluster. 30203a0782d0SMark Fasheh */ 3021e48edee2SMark Fasheh if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) 30223a0782d0SMark Fasheh goto out; 30233a0782d0SMark Fasheh } 30243a0782d0SMark Fasheh 3025dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); 3026dcd0538fSMark Fasheh if (ret) { 3027dcd0538fSMark Fasheh mlog_errno(ret); 3028dcd0538fSMark Fasheh goto out; 3029ccd979bdSMark Fasheh } 3030ccd979bdSMark Fasheh 3031dcd0538fSMark Fasheh ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); 3032dcd0538fSMark Fasheh if (ret) { 3033dcd0538fSMark Fasheh mlog_errno(ret); 3034dcd0538fSMark Fasheh goto out; 3035ccd979bdSMark Fasheh } 3036dcd0538fSMark Fasheh 3037ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 3038ccd979bdSMark Fasheh el = &eb->h_list; 3039ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3040ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3041dcd0538fSMark Fasheh ret = -EROFS; 3042dcd0538fSMark Fasheh goto out; 3043ccd979bdSMark Fasheh } 3044ccd979bdSMark Fasheh 3045ccd979bdSMark Fasheh *new_last_eb = bh; 3046ccd979bdSMark Fasheh get_bh(*new_last_eb); 3047dcd0538fSMark Fasheh mlog(0, "returning block %llu, (cpos: %u)\n", 3048dcd0538fSMark Fasheh (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); 3049dcd0538fSMark Fasheh out: 3050ccd979bdSMark Fasheh brelse(bh); 3051ccd979bdSMark Fasheh 3052dcd0538fSMark Fasheh return ret; 3053ccd979bdSMark Fasheh } 3054ccd979bdSMark Fasheh 30553a0782d0SMark Fasheh /* 30563a0782d0SMark Fasheh * Trim some clusters off the rightmost edge of a tree. Only called 30573a0782d0SMark Fasheh * during truncate. 30583a0782d0SMark Fasheh * 30593a0782d0SMark Fasheh * The caller needs to: 30603a0782d0SMark Fasheh * - start journaling of each path component. 30613a0782d0SMark Fasheh * - compute and fully set up any new last ext block 30623a0782d0SMark Fasheh */ 30633a0782d0SMark Fasheh static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, 30643a0782d0SMark Fasheh handle_t *handle, struct ocfs2_truncate_context *tc, 30653a0782d0SMark Fasheh u32 clusters_to_del, u64 *delete_start) 30663a0782d0SMark Fasheh { 30673a0782d0SMark Fasheh int ret, i, index = path->p_tree_depth; 30683a0782d0SMark Fasheh u32 new_edge = 0; 30693a0782d0SMark Fasheh u64 deleted_eb = 0; 30703a0782d0SMark Fasheh struct buffer_head *bh; 30713a0782d0SMark Fasheh struct ocfs2_extent_list *el; 30723a0782d0SMark Fasheh struct ocfs2_extent_rec *rec; 30733a0782d0SMark Fasheh 30743a0782d0SMark Fasheh *delete_start = 0; 30753a0782d0SMark Fasheh 30763a0782d0SMark Fasheh while (index >= 0) { 30773a0782d0SMark Fasheh bh = path->p_node[index].bh; 30783a0782d0SMark Fasheh el = path->p_node[index].el; 30793a0782d0SMark Fasheh 30803a0782d0SMark Fasheh mlog(0, "traveling tree (index = %d, block = %llu)\n", 30813a0782d0SMark Fasheh index, (unsigned long long)bh->b_blocknr); 30823a0782d0SMark Fasheh 30833a0782d0SMark Fasheh BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 30843a0782d0SMark Fasheh 30853a0782d0SMark Fasheh if (index != 30863a0782d0SMark Fasheh (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { 30873a0782d0SMark Fasheh ocfs2_error(inode->i_sb, 30883a0782d0SMark Fasheh "Inode %lu has invalid ext. block %llu", 30893a0782d0SMark Fasheh inode->i_ino, 30903a0782d0SMark Fasheh (unsigned long long)bh->b_blocknr); 30913a0782d0SMark Fasheh ret = -EROFS; 30923a0782d0SMark Fasheh goto out; 30933a0782d0SMark Fasheh } 30943a0782d0SMark Fasheh 30953a0782d0SMark Fasheh find_tail_record: 30963a0782d0SMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 30973a0782d0SMark Fasheh rec = &el->l_recs[i]; 30983a0782d0SMark Fasheh 30993a0782d0SMark Fasheh mlog(0, "Extent list before: record %d: (%u, %u, %llu), " 31003a0782d0SMark Fasheh "next = %u\n", i, le32_to_cpu(rec->e_cpos), 3101e48edee2SMark Fasheh ocfs2_rec_clusters(el, rec), 31023a0782d0SMark Fasheh (unsigned long long)le64_to_cpu(rec->e_blkno), 31033a0782d0SMark Fasheh le16_to_cpu(el->l_next_free_rec)); 31043a0782d0SMark Fasheh 3105e48edee2SMark Fasheh BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); 31063a0782d0SMark Fasheh 31073a0782d0SMark Fasheh if (le16_to_cpu(el->l_tree_depth) == 0) { 31083a0782d0SMark Fasheh /* 31093a0782d0SMark Fasheh * If the leaf block contains a single empty 31103a0782d0SMark Fasheh * extent and no records, we can just remove 31113a0782d0SMark Fasheh * the block. 31123a0782d0SMark Fasheh */ 31133a0782d0SMark Fasheh if (i == 0 && ocfs2_is_empty_extent(rec)) { 31143a0782d0SMark Fasheh memset(rec, 0, 31153a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 31163a0782d0SMark Fasheh el->l_next_free_rec = cpu_to_le16(0); 31173a0782d0SMark Fasheh 31183a0782d0SMark Fasheh goto delete; 31193a0782d0SMark Fasheh } 31203a0782d0SMark Fasheh 31213a0782d0SMark Fasheh /* 31223a0782d0SMark Fasheh * Remove any empty extents by shifting things 31233a0782d0SMark Fasheh * left. That should make life much easier on 31243a0782d0SMark Fasheh * the code below. This condition is rare 31253a0782d0SMark Fasheh * enough that we shouldn't see a performance 31263a0782d0SMark Fasheh * hit. 31273a0782d0SMark Fasheh */ 31283a0782d0SMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) { 31293a0782d0SMark Fasheh le16_add_cpu(&el->l_next_free_rec, -1); 31303a0782d0SMark Fasheh 31313a0782d0SMark Fasheh for(i = 0; 31323a0782d0SMark Fasheh i < le16_to_cpu(el->l_next_free_rec); i++) 31333a0782d0SMark Fasheh el->l_recs[i] = el->l_recs[i + 1]; 31343a0782d0SMark Fasheh 31353a0782d0SMark Fasheh memset(&el->l_recs[i], 0, 31363a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 31373a0782d0SMark Fasheh 31383a0782d0SMark Fasheh /* 31393a0782d0SMark Fasheh * We've modified our extent list. The 31403a0782d0SMark Fasheh * simplest way to handle this change 31413a0782d0SMark Fasheh * is to being the search from the 31423a0782d0SMark Fasheh * start again. 31433a0782d0SMark Fasheh */ 31443a0782d0SMark Fasheh goto find_tail_record; 31453a0782d0SMark Fasheh } 31463a0782d0SMark Fasheh 3147e48edee2SMark Fasheh le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); 31483a0782d0SMark Fasheh 31493a0782d0SMark Fasheh /* 31503a0782d0SMark Fasheh * We'll use "new_edge" on our way back up the 31513a0782d0SMark Fasheh * tree to know what our rightmost cpos is. 31523a0782d0SMark Fasheh */ 3153e48edee2SMark Fasheh new_edge = le16_to_cpu(rec->e_leaf_clusters); 31543a0782d0SMark Fasheh new_edge += le32_to_cpu(rec->e_cpos); 31553a0782d0SMark Fasheh 31563a0782d0SMark Fasheh /* 31573a0782d0SMark Fasheh * The caller will use this to delete data blocks. 31583a0782d0SMark Fasheh */ 31593a0782d0SMark Fasheh *delete_start = le64_to_cpu(rec->e_blkno) 31603a0782d0SMark Fasheh + ocfs2_clusters_to_blocks(inode->i_sb, 3161e48edee2SMark Fasheh le16_to_cpu(rec->e_leaf_clusters)); 31623a0782d0SMark Fasheh 31633a0782d0SMark Fasheh /* 31643a0782d0SMark Fasheh * If it's now empty, remove this record. 31653a0782d0SMark Fasheh */ 3166e48edee2SMark Fasheh if (le16_to_cpu(rec->e_leaf_clusters) == 0) { 31673a0782d0SMark Fasheh memset(rec, 0, 31683a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 31693a0782d0SMark Fasheh le16_add_cpu(&el->l_next_free_rec, -1); 31703a0782d0SMark Fasheh } 31713a0782d0SMark Fasheh } else { 31723a0782d0SMark Fasheh if (le64_to_cpu(rec->e_blkno) == deleted_eb) { 31733a0782d0SMark Fasheh memset(rec, 0, 31743a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 31753a0782d0SMark Fasheh le16_add_cpu(&el->l_next_free_rec, -1); 31763a0782d0SMark Fasheh 31773a0782d0SMark Fasheh goto delete; 31783a0782d0SMark Fasheh } 31793a0782d0SMark Fasheh 31803a0782d0SMark Fasheh /* Can this actually happen? */ 31813a0782d0SMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) 31823a0782d0SMark Fasheh goto delete; 31833a0782d0SMark Fasheh 31843a0782d0SMark Fasheh /* 31853a0782d0SMark Fasheh * We never actually deleted any clusters 31863a0782d0SMark Fasheh * because our leaf was empty. There's no 31873a0782d0SMark Fasheh * reason to adjust the rightmost edge then. 31883a0782d0SMark Fasheh */ 31893a0782d0SMark Fasheh if (new_edge == 0) 31903a0782d0SMark Fasheh goto delete; 31913a0782d0SMark Fasheh 3192e48edee2SMark Fasheh rec->e_int_clusters = cpu_to_le32(new_edge); 3193e48edee2SMark Fasheh le32_add_cpu(&rec->e_int_clusters, 31943a0782d0SMark Fasheh -le32_to_cpu(rec->e_cpos)); 31953a0782d0SMark Fasheh 31963a0782d0SMark Fasheh /* 31973a0782d0SMark Fasheh * A deleted child record should have been 31983a0782d0SMark Fasheh * caught above. 31993a0782d0SMark Fasheh */ 3200e48edee2SMark Fasheh BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); 32013a0782d0SMark Fasheh } 32023a0782d0SMark Fasheh 32033a0782d0SMark Fasheh delete: 32043a0782d0SMark Fasheh ret = ocfs2_journal_dirty(handle, bh); 32053a0782d0SMark Fasheh if (ret) { 32063a0782d0SMark Fasheh mlog_errno(ret); 32073a0782d0SMark Fasheh goto out; 32083a0782d0SMark Fasheh } 32093a0782d0SMark Fasheh 32103a0782d0SMark Fasheh mlog(0, "extent list container %llu, after: record %d: " 32113a0782d0SMark Fasheh "(%u, %u, %llu), next = %u.\n", 32123a0782d0SMark Fasheh (unsigned long long)bh->b_blocknr, i, 3213e48edee2SMark Fasheh le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), 32143a0782d0SMark Fasheh (unsigned long long)le64_to_cpu(rec->e_blkno), 32153a0782d0SMark Fasheh le16_to_cpu(el->l_next_free_rec)); 32163a0782d0SMark Fasheh 32173a0782d0SMark Fasheh /* 32183a0782d0SMark Fasheh * We must be careful to only attempt delete of an 32193a0782d0SMark Fasheh * extent block (and not the root inode block). 32203a0782d0SMark Fasheh */ 32213a0782d0SMark Fasheh if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { 32223a0782d0SMark Fasheh struct ocfs2_extent_block *eb = 32233a0782d0SMark Fasheh (struct ocfs2_extent_block *)bh->b_data; 32243a0782d0SMark Fasheh 32253a0782d0SMark Fasheh /* 32263a0782d0SMark Fasheh * Save this for use when processing the 32273a0782d0SMark Fasheh * parent block. 32283a0782d0SMark Fasheh */ 32293a0782d0SMark Fasheh deleted_eb = le64_to_cpu(eb->h_blkno); 32303a0782d0SMark Fasheh 32313a0782d0SMark Fasheh mlog(0, "deleting this extent block.\n"); 32323a0782d0SMark Fasheh 32333a0782d0SMark Fasheh ocfs2_remove_from_cache(inode, bh); 32343a0782d0SMark Fasheh 3235e48edee2SMark Fasheh BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); 32363a0782d0SMark Fasheh BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 32373a0782d0SMark Fasheh BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); 32383a0782d0SMark Fasheh 32393a0782d0SMark Fasheh if (le16_to_cpu(eb->h_suballoc_slot) == 0) { 32403a0782d0SMark Fasheh /* 32413a0782d0SMark Fasheh * This code only understands how to 32423a0782d0SMark Fasheh * lock the suballocator in slot 0, 32433a0782d0SMark Fasheh * which is fine because allocation is 32443a0782d0SMark Fasheh * only ever done out of that 32453a0782d0SMark Fasheh * suballocator too. A future version 32463a0782d0SMark Fasheh * might change that however, so avoid 32473a0782d0SMark Fasheh * a free if we don't know how to 32483a0782d0SMark Fasheh * handle it. This way an fs incompat 32493a0782d0SMark Fasheh * bit will not be necessary. 32503a0782d0SMark Fasheh */ 32513a0782d0SMark Fasheh ret = ocfs2_free_extent_block(handle, 32523a0782d0SMark Fasheh tc->tc_ext_alloc_inode, 32533a0782d0SMark Fasheh tc->tc_ext_alloc_bh, 32543a0782d0SMark Fasheh eb); 32553a0782d0SMark Fasheh 32563a0782d0SMark Fasheh /* An error here is not fatal. */ 32573a0782d0SMark Fasheh if (ret < 0) 32583a0782d0SMark Fasheh mlog_errno(ret); 32593a0782d0SMark Fasheh } 32603a0782d0SMark Fasheh } else { 32613a0782d0SMark Fasheh deleted_eb = 0; 32623a0782d0SMark Fasheh } 32633a0782d0SMark Fasheh 32643a0782d0SMark Fasheh index--; 32653a0782d0SMark Fasheh } 32663a0782d0SMark Fasheh 32673a0782d0SMark Fasheh ret = 0; 32683a0782d0SMark Fasheh out: 32693a0782d0SMark Fasheh return ret; 32703a0782d0SMark Fasheh } 32713a0782d0SMark Fasheh 3272ccd979bdSMark Fasheh static int ocfs2_do_truncate(struct ocfs2_super *osb, 3273ccd979bdSMark Fasheh unsigned int clusters_to_del, 3274ccd979bdSMark Fasheh struct inode *inode, 3275ccd979bdSMark Fasheh struct buffer_head *fe_bh, 32761fabe148SMark Fasheh handle_t *handle, 3277dcd0538fSMark Fasheh struct ocfs2_truncate_context *tc, 3278dcd0538fSMark Fasheh struct ocfs2_path *path) 3279ccd979bdSMark Fasheh { 32803a0782d0SMark Fasheh int status; 3281ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 3282ccd979bdSMark Fasheh struct ocfs2_extent_block *last_eb = NULL; 3283ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 3284ccd979bdSMark Fasheh struct buffer_head *last_eb_bh = NULL; 3285ccd979bdSMark Fasheh u64 delete_blk = 0; 3286ccd979bdSMark Fasheh 3287ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 3288ccd979bdSMark Fasheh 32893a0782d0SMark Fasheh status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, 3290dcd0538fSMark Fasheh path, &last_eb_bh); 3291ccd979bdSMark Fasheh if (status < 0) { 3292ccd979bdSMark Fasheh mlog_errno(status); 3293ccd979bdSMark Fasheh goto bail; 3294ccd979bdSMark Fasheh } 3295ccd979bdSMark Fasheh 3296dcd0538fSMark Fasheh /* 3297dcd0538fSMark Fasheh * Each component will be touched, so we might as well journal 3298dcd0538fSMark Fasheh * here to avoid having to handle errors later. 3299dcd0538fSMark Fasheh */ 33003a0782d0SMark Fasheh status = ocfs2_journal_access_path(inode, handle, path); 3301ccd979bdSMark Fasheh if (status < 0) { 3302ccd979bdSMark Fasheh mlog_errno(status); 3303ccd979bdSMark Fasheh goto bail; 3304ccd979bdSMark Fasheh } 3305dcd0538fSMark Fasheh 3306dcd0538fSMark Fasheh if (last_eb_bh) { 3307dcd0538fSMark Fasheh status = ocfs2_journal_access(handle, inode, last_eb_bh, 3308dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 3309dcd0538fSMark Fasheh if (status < 0) { 3310dcd0538fSMark Fasheh mlog_errno(status); 3311dcd0538fSMark Fasheh goto bail; 3312dcd0538fSMark Fasheh } 3313dcd0538fSMark Fasheh 3314dcd0538fSMark Fasheh last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 3315dcd0538fSMark Fasheh } 3316dcd0538fSMark Fasheh 3317ccd979bdSMark Fasheh el = &(fe->id2.i_list); 3318ccd979bdSMark Fasheh 3319dcd0538fSMark Fasheh /* 3320dcd0538fSMark Fasheh * Lower levels depend on this never happening, but it's best 3321dcd0538fSMark Fasheh * to check it up here before changing the tree. 3322dcd0538fSMark Fasheh */ 3323e48edee2SMark Fasheh if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { 3324dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 3325dcd0538fSMark Fasheh "Inode %lu has an empty extent record, depth %u\n", 3326dcd0538fSMark Fasheh inode->i_ino, le16_to_cpu(el->l_tree_depth)); 33273a0782d0SMark Fasheh status = -EROFS; 3328dcd0538fSMark Fasheh goto bail; 3329dcd0538fSMark Fasheh } 3330dcd0538fSMark Fasheh 3331ccd979bdSMark Fasheh spin_lock(&OCFS2_I(inode)->ip_lock); 3332ccd979bdSMark Fasheh OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 3333ccd979bdSMark Fasheh clusters_to_del; 3334ccd979bdSMark Fasheh spin_unlock(&OCFS2_I(inode)->ip_lock); 3335ccd979bdSMark Fasheh le32_add_cpu(&fe->i_clusters, -clusters_to_del); 3336ccd979bdSMark Fasheh 33373a0782d0SMark Fasheh status = ocfs2_trim_tree(inode, path, handle, tc, 33383a0782d0SMark Fasheh clusters_to_del, &delete_blk); 33393a0782d0SMark Fasheh if (status) { 33403a0782d0SMark Fasheh mlog_errno(status); 33413a0782d0SMark Fasheh goto bail; 3342ccd979bdSMark Fasheh } 3343ccd979bdSMark Fasheh 3344dcd0538fSMark Fasheh if (le32_to_cpu(fe->i_clusters) == 0) { 3345ccd979bdSMark Fasheh /* trunc to zero is a special case. */ 3346ccd979bdSMark Fasheh el->l_tree_depth = 0; 3347ccd979bdSMark Fasheh fe->i_last_eb_blk = 0; 3348ccd979bdSMark Fasheh } else if (last_eb) 3349ccd979bdSMark Fasheh fe->i_last_eb_blk = last_eb->h_blkno; 3350ccd979bdSMark Fasheh 3351ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, fe_bh); 3352ccd979bdSMark Fasheh if (status < 0) { 3353ccd979bdSMark Fasheh mlog_errno(status); 3354ccd979bdSMark Fasheh goto bail; 3355ccd979bdSMark Fasheh } 3356ccd979bdSMark Fasheh 3357ccd979bdSMark Fasheh if (last_eb) { 3358ccd979bdSMark Fasheh /* If there will be a new last extent block, then by 3359ccd979bdSMark Fasheh * definition, there cannot be any leaves to the right of 3360ccd979bdSMark Fasheh * him. */ 3361ccd979bdSMark Fasheh last_eb->h_next_leaf_blk = 0; 3362ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, last_eb_bh); 3363ccd979bdSMark Fasheh if (status < 0) { 3364ccd979bdSMark Fasheh mlog_errno(status); 3365ccd979bdSMark Fasheh goto bail; 3366ccd979bdSMark Fasheh } 3367ccd979bdSMark Fasheh } 3368ccd979bdSMark Fasheh 33693a0782d0SMark Fasheh if (delete_blk) { 3370ccd979bdSMark Fasheh status = ocfs2_truncate_log_append(osb, handle, delete_blk, 3371ccd979bdSMark Fasheh clusters_to_del); 3372ccd979bdSMark Fasheh if (status < 0) { 3373ccd979bdSMark Fasheh mlog_errno(status); 3374ccd979bdSMark Fasheh goto bail; 3375ccd979bdSMark Fasheh } 33763a0782d0SMark Fasheh } 3377ccd979bdSMark Fasheh status = 0; 3378ccd979bdSMark Fasheh bail: 3379dcd0538fSMark Fasheh 3380ccd979bdSMark Fasheh mlog_exit(status); 3381ccd979bdSMark Fasheh return status; 3382ccd979bdSMark Fasheh } 3383ccd979bdSMark Fasheh 338460b11392SMark Fasheh static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) 338560b11392SMark Fasheh { 338660b11392SMark Fasheh set_buffer_uptodate(bh); 338760b11392SMark Fasheh mark_buffer_dirty(bh); 338860b11392SMark Fasheh return 0; 338960b11392SMark Fasheh } 339060b11392SMark Fasheh 339160b11392SMark Fasheh static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) 339260b11392SMark Fasheh { 339360b11392SMark Fasheh set_buffer_uptodate(bh); 339460b11392SMark Fasheh mark_buffer_dirty(bh); 339560b11392SMark Fasheh return ocfs2_journal_dirty_data(handle, bh); 339660b11392SMark Fasheh } 339760b11392SMark Fasheh 339860b11392SMark Fasheh static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, 339960b11392SMark Fasheh struct page **pages, int numpages, 340060b11392SMark Fasheh u64 phys, handle_t *handle) 340160b11392SMark Fasheh { 340260b11392SMark Fasheh int i, ret, partial = 0; 340360b11392SMark Fasheh void *kaddr; 340460b11392SMark Fasheh struct page *page; 340560b11392SMark Fasheh unsigned int from, to = PAGE_CACHE_SIZE; 340660b11392SMark Fasheh struct super_block *sb = inode->i_sb; 340760b11392SMark Fasheh 340860b11392SMark Fasheh BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 340960b11392SMark Fasheh 341060b11392SMark Fasheh if (numpages == 0) 341160b11392SMark Fasheh goto out; 341260b11392SMark Fasheh 341360b11392SMark Fasheh from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ 341460b11392SMark Fasheh if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) { 341560b11392SMark Fasheh /* 341660b11392SMark Fasheh * Since 'from' has been capped to a value below page 341760b11392SMark Fasheh * size, this calculation won't be able to overflow 341860b11392SMark Fasheh * 'to' 341960b11392SMark Fasheh */ 342060b11392SMark Fasheh to = ocfs2_align_bytes_to_clusters(sb, from); 342160b11392SMark Fasheh 342260b11392SMark Fasheh /* 342360b11392SMark Fasheh * The truncate tail in this case should never contain 342460b11392SMark Fasheh * more than one page at maximum. The loop below also 342560b11392SMark Fasheh * assumes this. 342660b11392SMark Fasheh */ 342760b11392SMark Fasheh BUG_ON(numpages != 1); 342860b11392SMark Fasheh } 342960b11392SMark Fasheh 343060b11392SMark Fasheh for(i = 0; i < numpages; i++) { 343160b11392SMark Fasheh page = pages[i]; 343260b11392SMark Fasheh 343360b11392SMark Fasheh BUG_ON(from > PAGE_CACHE_SIZE); 343460b11392SMark Fasheh BUG_ON(to > PAGE_CACHE_SIZE); 343560b11392SMark Fasheh 343660b11392SMark Fasheh ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0); 343760b11392SMark Fasheh if (ret) 343860b11392SMark Fasheh mlog_errno(ret); 343960b11392SMark Fasheh 344060b11392SMark Fasheh kaddr = kmap_atomic(page, KM_USER0); 344160b11392SMark Fasheh memset(kaddr + from, 0, to - from); 344260b11392SMark Fasheh kunmap_atomic(kaddr, KM_USER0); 344360b11392SMark Fasheh 344460b11392SMark Fasheh /* 344560b11392SMark Fasheh * Need to set the buffers we zero'd into uptodate 344660b11392SMark Fasheh * here if they aren't - ocfs2_map_page_blocks() 344760b11392SMark Fasheh * might've skipped some 344860b11392SMark Fasheh */ 344960b11392SMark Fasheh if (ocfs2_should_order_data(inode)) { 345060b11392SMark Fasheh ret = walk_page_buffers(handle, 345160b11392SMark Fasheh page_buffers(page), 345260b11392SMark Fasheh from, to, &partial, 345360b11392SMark Fasheh ocfs2_ordered_zero_func); 345460b11392SMark Fasheh if (ret < 0) 345560b11392SMark Fasheh mlog_errno(ret); 345660b11392SMark Fasheh } else { 345760b11392SMark Fasheh ret = walk_page_buffers(handle, page_buffers(page), 345860b11392SMark Fasheh from, to, &partial, 345960b11392SMark Fasheh ocfs2_writeback_zero_func); 346060b11392SMark Fasheh if (ret < 0) 346160b11392SMark Fasheh mlog_errno(ret); 346260b11392SMark Fasheh } 346360b11392SMark Fasheh 346460b11392SMark Fasheh if (!partial) 346560b11392SMark Fasheh SetPageUptodate(page); 346660b11392SMark Fasheh 346760b11392SMark Fasheh flush_dcache_page(page); 346860b11392SMark Fasheh 346960b11392SMark Fasheh /* 347060b11392SMark Fasheh * Every page after the 1st one should be completely zero'd. 347160b11392SMark Fasheh */ 347260b11392SMark Fasheh from = 0; 347360b11392SMark Fasheh } 347460b11392SMark Fasheh out: 347560b11392SMark Fasheh if (pages) { 347660b11392SMark Fasheh for (i = 0; i < numpages; i++) { 347760b11392SMark Fasheh page = pages[i]; 347860b11392SMark Fasheh unlock_page(page); 347960b11392SMark Fasheh mark_page_accessed(page); 348060b11392SMark Fasheh page_cache_release(page); 348160b11392SMark Fasheh } 348260b11392SMark Fasheh } 348360b11392SMark Fasheh } 348460b11392SMark Fasheh 348560b11392SMark Fasheh static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, 348660b11392SMark Fasheh int *num, u64 *phys) 348760b11392SMark Fasheh { 348860b11392SMark Fasheh int i, numpages = 0, ret = 0; 348960b11392SMark Fasheh unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize; 3490*49cb8d2dSMark Fasheh unsigned int ext_flags; 349160b11392SMark Fasheh struct super_block *sb = inode->i_sb; 349260b11392SMark Fasheh struct address_space *mapping = inode->i_mapping; 349360b11392SMark Fasheh unsigned long index; 349460b11392SMark Fasheh u64 next_cluster_bytes; 349560b11392SMark Fasheh 349660b11392SMark Fasheh BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 349760b11392SMark Fasheh 349860b11392SMark Fasheh /* Cluster boundary, so we don't need to grab any pages. */ 349960b11392SMark Fasheh if ((isize & (csize - 1)) == 0) 350060b11392SMark Fasheh goto out; 350160b11392SMark Fasheh 350260b11392SMark Fasheh ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, 3503*49cb8d2dSMark Fasheh phys, NULL, &ext_flags); 350460b11392SMark Fasheh if (ret) { 350560b11392SMark Fasheh mlog_errno(ret); 350660b11392SMark Fasheh goto out; 350760b11392SMark Fasheh } 350860b11392SMark Fasheh 350960b11392SMark Fasheh /* Tail is a hole. */ 351060b11392SMark Fasheh if (*phys == 0) 351160b11392SMark Fasheh goto out; 351260b11392SMark Fasheh 3513*49cb8d2dSMark Fasheh /* Tail is marked as unwritten, we can count on write to zero 3514*49cb8d2dSMark Fasheh * in that case. */ 3515*49cb8d2dSMark Fasheh if (ext_flags & OCFS2_EXT_UNWRITTEN) 3516*49cb8d2dSMark Fasheh goto out; 3517*49cb8d2dSMark Fasheh 351860b11392SMark Fasheh next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); 351960b11392SMark Fasheh index = isize >> PAGE_CACHE_SHIFT; 352060b11392SMark Fasheh do { 352160b11392SMark Fasheh pages[numpages] = grab_cache_page(mapping, index); 352260b11392SMark Fasheh if (!pages[numpages]) { 352360b11392SMark Fasheh ret = -ENOMEM; 352460b11392SMark Fasheh mlog_errno(ret); 352560b11392SMark Fasheh goto out; 352660b11392SMark Fasheh } 352760b11392SMark Fasheh 352860b11392SMark Fasheh numpages++; 352960b11392SMark Fasheh index++; 353060b11392SMark Fasheh } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); 353160b11392SMark Fasheh 353260b11392SMark Fasheh out: 353360b11392SMark Fasheh if (ret != 0) { 353460b11392SMark Fasheh if (pages) { 353560b11392SMark Fasheh for (i = 0; i < numpages; i++) { 353660b11392SMark Fasheh if (pages[i]) { 353760b11392SMark Fasheh unlock_page(pages[i]); 353860b11392SMark Fasheh page_cache_release(pages[i]); 353960b11392SMark Fasheh } 354060b11392SMark Fasheh } 354160b11392SMark Fasheh } 354260b11392SMark Fasheh numpages = 0; 354360b11392SMark Fasheh } 354460b11392SMark Fasheh 354560b11392SMark Fasheh *num = numpages; 354660b11392SMark Fasheh 354760b11392SMark Fasheh return ret; 354860b11392SMark Fasheh } 354960b11392SMark Fasheh 355060b11392SMark Fasheh /* 355160b11392SMark Fasheh * Zero the area past i_size but still within an allocated 355260b11392SMark Fasheh * cluster. This avoids exposing nonzero data on subsequent file 355360b11392SMark Fasheh * extends. 355460b11392SMark Fasheh * 355560b11392SMark Fasheh * We need to call this before i_size is updated on the inode because 355660b11392SMark Fasheh * otherwise block_write_full_page() will skip writeout of pages past 355760b11392SMark Fasheh * i_size. The new_i_size parameter is passed for this reason. 355860b11392SMark Fasheh */ 355960b11392SMark Fasheh int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 356060b11392SMark Fasheh u64 new_i_size) 356160b11392SMark Fasheh { 356260b11392SMark Fasheh int ret, numpages; 3563fa41045fSMark Fasheh loff_t endbyte; 356460b11392SMark Fasheh struct page **pages = NULL; 356560b11392SMark Fasheh u64 phys; 356660b11392SMark Fasheh 356760b11392SMark Fasheh /* 356860b11392SMark Fasheh * File systems which don't support sparse files zero on every 356960b11392SMark Fasheh * extend. 357060b11392SMark Fasheh */ 357160b11392SMark Fasheh if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 357260b11392SMark Fasheh return 0; 357360b11392SMark Fasheh 357460b11392SMark Fasheh pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb), 357560b11392SMark Fasheh sizeof(struct page *), GFP_NOFS); 357660b11392SMark Fasheh if (pages == NULL) { 357760b11392SMark Fasheh ret = -ENOMEM; 357860b11392SMark Fasheh mlog_errno(ret); 357960b11392SMark Fasheh goto out; 358060b11392SMark Fasheh } 358160b11392SMark Fasheh 358260b11392SMark Fasheh ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); 358360b11392SMark Fasheh if (ret) { 358460b11392SMark Fasheh mlog_errno(ret); 358560b11392SMark Fasheh goto out; 358660b11392SMark Fasheh } 358760b11392SMark Fasheh 358860b11392SMark Fasheh if (numpages == 0) 358960b11392SMark Fasheh goto out; 359060b11392SMark Fasheh 359160b11392SMark Fasheh ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, 359260b11392SMark Fasheh handle); 359360b11392SMark Fasheh 359460b11392SMark Fasheh /* 359560b11392SMark Fasheh * Initiate writeout of the pages we zero'd here. We don't 359660b11392SMark Fasheh * wait on them - the truncate_inode_pages() call later will 359760b11392SMark Fasheh * do that for us. 359860b11392SMark Fasheh */ 3599fa41045fSMark Fasheh endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 3600fa41045fSMark Fasheh ret = do_sync_mapping_range(inode->i_mapping, new_i_size, 3601fa41045fSMark Fasheh endbyte - 1, SYNC_FILE_RANGE_WRITE); 360260b11392SMark Fasheh if (ret) 360360b11392SMark Fasheh mlog_errno(ret); 360460b11392SMark Fasheh 360560b11392SMark Fasheh out: 360660b11392SMark Fasheh if (pages) 360760b11392SMark Fasheh kfree(pages); 360860b11392SMark Fasheh 360960b11392SMark Fasheh return ret; 361060b11392SMark Fasheh } 361160b11392SMark Fasheh 3612ccd979bdSMark Fasheh /* 3613ccd979bdSMark Fasheh * It is expected, that by the time you call this function, 3614ccd979bdSMark Fasheh * inode->i_size and fe->i_size have been adjusted. 3615ccd979bdSMark Fasheh * 3616ccd979bdSMark Fasheh * WARNING: This will kfree the truncate context 3617ccd979bdSMark Fasheh */ 3618ccd979bdSMark Fasheh int ocfs2_commit_truncate(struct ocfs2_super *osb, 3619ccd979bdSMark Fasheh struct inode *inode, 3620ccd979bdSMark Fasheh struct buffer_head *fe_bh, 3621ccd979bdSMark Fasheh struct ocfs2_truncate_context *tc) 3622ccd979bdSMark Fasheh { 3623ccd979bdSMark Fasheh int status, i, credits, tl_sem = 0; 3624dcd0538fSMark Fasheh u32 clusters_to_del, new_highest_cpos, range; 3625ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 36261fabe148SMark Fasheh handle_t *handle = NULL; 3627ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 3628dcd0538fSMark Fasheh struct ocfs2_path *path = NULL; 3629ccd979bdSMark Fasheh 3630ccd979bdSMark Fasheh mlog_entry_void(); 3631ccd979bdSMark Fasheh 3632ccd979bdSMark Fasheh down_write(&OCFS2_I(inode)->ip_alloc_sem); 3633ccd979bdSMark Fasheh 3634dcd0538fSMark Fasheh new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 3635ccd979bdSMark Fasheh i_size_read(inode)); 3636ccd979bdSMark Fasheh 3637dcd0538fSMark Fasheh path = ocfs2_new_inode_path(fe_bh); 3638dcd0538fSMark Fasheh if (!path) { 3639dcd0538fSMark Fasheh status = -ENOMEM; 3640ccd979bdSMark Fasheh mlog_errno(status); 3641ccd979bdSMark Fasheh goto bail; 3642ccd979bdSMark Fasheh } 3643dcd0538fSMark Fasheh start: 3644dcd0538fSMark Fasheh /* 36453a0782d0SMark Fasheh * Check that we still have allocation to delete. 36463a0782d0SMark Fasheh */ 36473a0782d0SMark Fasheh if (OCFS2_I(inode)->ip_clusters == 0) { 36483a0782d0SMark Fasheh status = 0; 36493a0782d0SMark Fasheh goto bail; 36503a0782d0SMark Fasheh } 36513a0782d0SMark Fasheh 36523a0782d0SMark Fasheh /* 3653dcd0538fSMark Fasheh * Truncate always works against the rightmost tree branch. 3654dcd0538fSMark Fasheh */ 3655dcd0538fSMark Fasheh status = ocfs2_find_path(inode, path, UINT_MAX); 3656dcd0538fSMark Fasheh if (status) { 3657dcd0538fSMark Fasheh mlog_errno(status); 3658ccd979bdSMark Fasheh goto bail; 3659ccd979bdSMark Fasheh } 3660ccd979bdSMark Fasheh 3661dcd0538fSMark Fasheh mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", 3662dcd0538fSMark Fasheh OCFS2_I(inode)->ip_clusters, path->p_tree_depth); 3663dcd0538fSMark Fasheh 3664dcd0538fSMark Fasheh /* 3665dcd0538fSMark Fasheh * By now, el will point to the extent list on the bottom most 3666dcd0538fSMark Fasheh * portion of this tree. Only the tail record is considered in 3667dcd0538fSMark Fasheh * each pass. 3668dcd0538fSMark Fasheh * 3669dcd0538fSMark Fasheh * We handle the following cases, in order: 3670dcd0538fSMark Fasheh * - empty extent: delete the remaining branch 3671dcd0538fSMark Fasheh * - remove the entire record 3672dcd0538fSMark Fasheh * - remove a partial record 3673dcd0538fSMark Fasheh * - no record needs to be removed (truncate has completed) 3674dcd0538fSMark Fasheh */ 3675dcd0538fSMark Fasheh el = path_leaf_el(path); 36763a0782d0SMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) { 36773a0782d0SMark Fasheh ocfs2_error(inode->i_sb, 36783a0782d0SMark Fasheh "Inode %llu has empty extent block at %llu\n", 36793a0782d0SMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, 36803a0782d0SMark Fasheh (unsigned long long)path_leaf_bh(path)->b_blocknr); 36813a0782d0SMark Fasheh status = -EROFS; 36823a0782d0SMark Fasheh goto bail; 36833a0782d0SMark Fasheh } 36843a0782d0SMark Fasheh 3685ccd979bdSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 3686dcd0538fSMark Fasheh range = le32_to_cpu(el->l_recs[i].e_cpos) + 3687e48edee2SMark Fasheh ocfs2_rec_clusters(el, &el->l_recs[i]); 3688dcd0538fSMark Fasheh if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 3689dcd0538fSMark Fasheh clusters_to_del = 0; 3690dcd0538fSMark Fasheh } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 3691e48edee2SMark Fasheh clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 3692dcd0538fSMark Fasheh } else if (range > new_highest_cpos) { 3693e48edee2SMark Fasheh clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 3694ccd979bdSMark Fasheh le32_to_cpu(el->l_recs[i].e_cpos)) - 3695dcd0538fSMark Fasheh new_highest_cpos; 3696dcd0538fSMark Fasheh } else { 3697dcd0538fSMark Fasheh status = 0; 3698dcd0538fSMark Fasheh goto bail; 3699dcd0538fSMark Fasheh } 3700ccd979bdSMark Fasheh 3701dcd0538fSMark Fasheh mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 3702dcd0538fSMark Fasheh clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); 3703dcd0538fSMark Fasheh 3704dcd0538fSMark Fasheh BUG_ON(clusters_to_del == 0); 3705ccd979bdSMark Fasheh 37061b1dcc1bSJes Sorensen mutex_lock(&tl_inode->i_mutex); 3707ccd979bdSMark Fasheh tl_sem = 1; 3708ccd979bdSMark Fasheh /* ocfs2_truncate_log_needs_flush guarantees us at least one 3709ccd979bdSMark Fasheh * record is free for use. If there isn't any, we flush to get 3710ccd979bdSMark Fasheh * an empty truncate log. */ 3711ccd979bdSMark Fasheh if (ocfs2_truncate_log_needs_flush(osb)) { 3712ccd979bdSMark Fasheh status = __ocfs2_flush_truncate_log(osb); 3713ccd979bdSMark Fasheh if (status < 0) { 3714ccd979bdSMark Fasheh mlog_errno(status); 3715ccd979bdSMark Fasheh goto bail; 3716ccd979bdSMark Fasheh } 3717ccd979bdSMark Fasheh } 3718ccd979bdSMark Fasheh 3719ccd979bdSMark Fasheh credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 3720dcd0538fSMark Fasheh (struct ocfs2_dinode *)fe_bh->b_data, 3721dcd0538fSMark Fasheh el); 372265eff9ccSMark Fasheh handle = ocfs2_start_trans(osb, credits); 3723ccd979bdSMark Fasheh if (IS_ERR(handle)) { 3724ccd979bdSMark Fasheh status = PTR_ERR(handle); 3725ccd979bdSMark Fasheh handle = NULL; 3726ccd979bdSMark Fasheh mlog_errno(status); 3727ccd979bdSMark Fasheh goto bail; 3728ccd979bdSMark Fasheh } 3729ccd979bdSMark Fasheh 3730dcd0538fSMark Fasheh status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, 3731dcd0538fSMark Fasheh tc, path); 3732ccd979bdSMark Fasheh if (status < 0) { 3733ccd979bdSMark Fasheh mlog_errno(status); 3734ccd979bdSMark Fasheh goto bail; 3735ccd979bdSMark Fasheh } 3736ccd979bdSMark Fasheh 37371b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 3738ccd979bdSMark Fasheh tl_sem = 0; 3739ccd979bdSMark Fasheh 374002dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 3741ccd979bdSMark Fasheh handle = NULL; 3742ccd979bdSMark Fasheh 3743dcd0538fSMark Fasheh ocfs2_reinit_path(path, 1); 3744dcd0538fSMark Fasheh 3745dcd0538fSMark Fasheh /* 37463a0782d0SMark Fasheh * The check above will catch the case where we've truncated 37473a0782d0SMark Fasheh * away all allocation. 3748dcd0538fSMark Fasheh */ 3749ccd979bdSMark Fasheh goto start; 37503a0782d0SMark Fasheh 3751ccd979bdSMark Fasheh bail: 3752ccd979bdSMark Fasheh up_write(&OCFS2_I(inode)->ip_alloc_sem); 3753ccd979bdSMark Fasheh 3754ccd979bdSMark Fasheh ocfs2_schedule_truncate_log_flush(osb, 1); 3755ccd979bdSMark Fasheh 3756ccd979bdSMark Fasheh if (tl_sem) 37571b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 3758ccd979bdSMark Fasheh 3759ccd979bdSMark Fasheh if (handle) 376002dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 3761ccd979bdSMark Fasheh 3762dcd0538fSMark Fasheh ocfs2_free_path(path); 3763ccd979bdSMark Fasheh 3764ccd979bdSMark Fasheh /* This will drop the ext_alloc cluster lock for us */ 3765ccd979bdSMark Fasheh ocfs2_free_truncate_context(tc); 3766ccd979bdSMark Fasheh 3767ccd979bdSMark Fasheh mlog_exit(status); 3768ccd979bdSMark Fasheh return status; 3769ccd979bdSMark Fasheh } 3770ccd979bdSMark Fasheh 3771ccd979bdSMark Fasheh /* 3772ccd979bdSMark Fasheh * Expects the inode to already be locked. This will figure out which 3773ccd979bdSMark Fasheh * inodes need to be locked and will put them on the returned truncate 3774ccd979bdSMark Fasheh * context. 3775ccd979bdSMark Fasheh */ 3776ccd979bdSMark Fasheh int ocfs2_prepare_truncate(struct ocfs2_super *osb, 3777ccd979bdSMark Fasheh struct inode *inode, 3778ccd979bdSMark Fasheh struct buffer_head *fe_bh, 3779ccd979bdSMark Fasheh struct ocfs2_truncate_context **tc) 3780ccd979bdSMark Fasheh { 3781dcd0538fSMark Fasheh int status, metadata_delete, i; 3782ccd979bdSMark Fasheh unsigned int new_i_clusters; 3783ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 3784ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 3785ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 3786ccd979bdSMark Fasheh struct buffer_head *last_eb_bh = NULL; 3787ccd979bdSMark Fasheh struct inode *ext_alloc_inode = NULL; 3788ccd979bdSMark Fasheh struct buffer_head *ext_alloc_bh = NULL; 3789ccd979bdSMark Fasheh 3790ccd979bdSMark Fasheh mlog_entry_void(); 3791ccd979bdSMark Fasheh 3792ccd979bdSMark Fasheh *tc = NULL; 3793ccd979bdSMark Fasheh 3794ccd979bdSMark Fasheh new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, 3795ccd979bdSMark Fasheh i_size_read(inode)); 3796ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 3797ccd979bdSMark Fasheh 3798ccd979bdSMark Fasheh mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" 3799b0697053SMark Fasheh "%llu\n", fe->i_clusters, new_i_clusters, 3800b0697053SMark Fasheh (unsigned long long)fe->i_size); 3801ccd979bdSMark Fasheh 3802cd861280SRobert P. J. Day *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); 3803ccd979bdSMark Fasheh if (!(*tc)) { 3804ccd979bdSMark Fasheh status = -ENOMEM; 3805ccd979bdSMark Fasheh mlog_errno(status); 3806ccd979bdSMark Fasheh goto bail; 3807ccd979bdSMark Fasheh } 3808ccd979bdSMark Fasheh 3809ccd979bdSMark Fasheh metadata_delete = 0; 3810ccd979bdSMark Fasheh if (fe->id2.i_list.l_tree_depth) { 3811ccd979bdSMark Fasheh /* If we have a tree, then the truncate may result in 3812ccd979bdSMark Fasheh * metadata deletes. Figure this out from the 3813ccd979bdSMark Fasheh * rightmost leaf block.*/ 3814ccd979bdSMark Fasheh status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 3815ccd979bdSMark Fasheh &last_eb_bh, OCFS2_BH_CACHED, inode); 3816ccd979bdSMark Fasheh if (status < 0) { 3817ccd979bdSMark Fasheh mlog_errno(status); 3818ccd979bdSMark Fasheh goto bail; 3819ccd979bdSMark Fasheh } 3820ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 3821ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3822ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3823ccd979bdSMark Fasheh 3824ccd979bdSMark Fasheh brelse(last_eb_bh); 3825ccd979bdSMark Fasheh status = -EIO; 3826ccd979bdSMark Fasheh goto bail; 3827ccd979bdSMark Fasheh } 3828ccd979bdSMark Fasheh el = &(eb->h_list); 3829dcd0538fSMark Fasheh 3830dcd0538fSMark Fasheh i = 0; 3831dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) 3832dcd0538fSMark Fasheh i = 1; 3833dcd0538fSMark Fasheh /* 3834dcd0538fSMark Fasheh * XXX: Should we check that next_free_rec contains 3835dcd0538fSMark Fasheh * the extent? 3836dcd0538fSMark Fasheh */ 3837dcd0538fSMark Fasheh if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_i_clusters) 3838ccd979bdSMark Fasheh metadata_delete = 1; 3839ccd979bdSMark Fasheh } 3840ccd979bdSMark Fasheh 3841ccd979bdSMark Fasheh (*tc)->tc_last_eb_bh = last_eb_bh; 3842ccd979bdSMark Fasheh 3843ccd979bdSMark Fasheh if (metadata_delete) { 3844ccd979bdSMark Fasheh mlog(0, "Will have to delete metadata for this trunc. " 3845ccd979bdSMark Fasheh "locking allocator.\n"); 3846ccd979bdSMark Fasheh ext_alloc_inode = ocfs2_get_system_file_inode(osb, EXTENT_ALLOC_SYSTEM_INODE, 0); 3847ccd979bdSMark Fasheh if (!ext_alloc_inode) { 3848ccd979bdSMark Fasheh status = -ENOMEM; 3849ccd979bdSMark Fasheh mlog_errno(status); 3850ccd979bdSMark Fasheh goto bail; 3851ccd979bdSMark Fasheh } 3852ccd979bdSMark Fasheh 38531b1dcc1bSJes Sorensen mutex_lock(&ext_alloc_inode->i_mutex); 3854ccd979bdSMark Fasheh (*tc)->tc_ext_alloc_inode = ext_alloc_inode; 3855ccd979bdSMark Fasheh 38564bcec184SMark Fasheh status = ocfs2_meta_lock(ext_alloc_inode, &ext_alloc_bh, 1); 3857ccd979bdSMark Fasheh if (status < 0) { 3858ccd979bdSMark Fasheh mlog_errno(status); 3859ccd979bdSMark Fasheh goto bail; 3860ccd979bdSMark Fasheh } 3861ccd979bdSMark Fasheh (*tc)->tc_ext_alloc_bh = ext_alloc_bh; 3862ccd979bdSMark Fasheh (*tc)->tc_ext_alloc_locked = 1; 3863ccd979bdSMark Fasheh } 3864ccd979bdSMark Fasheh 3865ccd979bdSMark Fasheh status = 0; 3866ccd979bdSMark Fasheh bail: 3867ccd979bdSMark Fasheh if (status < 0) { 3868ccd979bdSMark Fasheh if (*tc) 3869ccd979bdSMark Fasheh ocfs2_free_truncate_context(*tc); 3870ccd979bdSMark Fasheh *tc = NULL; 3871ccd979bdSMark Fasheh } 3872ccd979bdSMark Fasheh mlog_exit_void(); 3873ccd979bdSMark Fasheh return status; 3874ccd979bdSMark Fasheh } 3875ccd979bdSMark Fasheh 3876ccd979bdSMark Fasheh static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 3877ccd979bdSMark Fasheh { 3878ccd979bdSMark Fasheh if (tc->tc_ext_alloc_inode) { 3879ccd979bdSMark Fasheh if (tc->tc_ext_alloc_locked) 3880ccd979bdSMark Fasheh ocfs2_meta_unlock(tc->tc_ext_alloc_inode, 1); 3881ccd979bdSMark Fasheh 38821b1dcc1bSJes Sorensen mutex_unlock(&tc->tc_ext_alloc_inode->i_mutex); 3883ccd979bdSMark Fasheh iput(tc->tc_ext_alloc_inode); 3884ccd979bdSMark Fasheh } 3885ccd979bdSMark Fasheh 3886ccd979bdSMark Fasheh if (tc->tc_ext_alloc_bh) 3887ccd979bdSMark Fasheh brelse(tc->tc_ext_alloc_bh); 3888ccd979bdSMark Fasheh 3889ccd979bdSMark Fasheh if (tc->tc_last_eb_bh) 3890ccd979bdSMark Fasheh brelse(tc->tc_last_eb_bh); 3891ccd979bdSMark Fasheh 3892ccd979bdSMark Fasheh kfree(tc); 3893ccd979bdSMark Fasheh } 3894