1ccd979bdSMark Fasheh /* -*- mode: c; c-basic-offset: 8; -*- 2ccd979bdSMark Fasheh * vim: noexpandtab sw=8 ts=8 sts=0: 3ccd979bdSMark Fasheh * 4ccd979bdSMark Fasheh * alloc.c 5ccd979bdSMark Fasheh * 6ccd979bdSMark Fasheh * Extent allocs and frees 7ccd979bdSMark Fasheh * 8ccd979bdSMark Fasheh * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9ccd979bdSMark Fasheh * 10ccd979bdSMark Fasheh * This program is free software; you can redistribute it and/or 11ccd979bdSMark Fasheh * modify it under the terms of the GNU General Public 12ccd979bdSMark Fasheh * License as published by the Free Software Foundation; either 13ccd979bdSMark Fasheh * version 2 of the License, or (at your option) any later version. 14ccd979bdSMark Fasheh * 15ccd979bdSMark Fasheh * This program is distributed in the hope that it will be useful, 16ccd979bdSMark Fasheh * but WITHOUT ANY WARRANTY; without even the implied warranty of 17ccd979bdSMark Fasheh * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18ccd979bdSMark Fasheh * General Public License for more details. 19ccd979bdSMark Fasheh * 20ccd979bdSMark Fasheh * You should have received a copy of the GNU General Public 21ccd979bdSMark Fasheh * License along with this program; if not, write to the 22ccd979bdSMark Fasheh * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23ccd979bdSMark Fasheh * Boston, MA 021110-1307, USA. 24ccd979bdSMark Fasheh */ 25ccd979bdSMark Fasheh 26ccd979bdSMark Fasheh #include <linux/fs.h> 27ccd979bdSMark Fasheh #include <linux/types.h> 28ccd979bdSMark Fasheh #include <linux/slab.h> 29ccd979bdSMark Fasheh #include <linux/highmem.h> 3060b11392SMark Fasheh #include <linux/swap.h> 31ccd979bdSMark Fasheh 32ccd979bdSMark Fasheh #define MLOG_MASK_PREFIX ML_DISK_ALLOC 33ccd979bdSMark Fasheh #include <cluster/masklog.h> 34ccd979bdSMark Fasheh 35ccd979bdSMark Fasheh #include "ocfs2.h" 36ccd979bdSMark Fasheh 37ccd979bdSMark Fasheh #include "alloc.h" 3860b11392SMark Fasheh #include "aops.h" 39ccd979bdSMark Fasheh #include "dlmglue.h" 40ccd979bdSMark Fasheh #include "extent_map.h" 41ccd979bdSMark Fasheh #include "inode.h" 42ccd979bdSMark Fasheh #include "journal.h" 43ccd979bdSMark Fasheh #include "localalloc.h" 44ccd979bdSMark Fasheh #include "suballoc.h" 45ccd979bdSMark Fasheh #include "sysfile.h" 46ccd979bdSMark Fasheh #include "file.h" 47ccd979bdSMark Fasheh #include "super.h" 48ccd979bdSMark Fasheh #include "uptodate.h" 49ccd979bdSMark Fasheh 50ccd979bdSMark Fasheh #include "buffer_head_io.h" 51ccd979bdSMark Fasheh 52ccd979bdSMark Fasheh static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 53*59a5e416SMark Fasheh static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 54*59a5e416SMark Fasheh struct ocfs2_extent_block *eb); 55ccd979bdSMark Fasheh 56dcd0538fSMark Fasheh /* 57dcd0538fSMark Fasheh * Structures which describe a path through a btree, and functions to 58dcd0538fSMark Fasheh * manipulate them. 59dcd0538fSMark Fasheh * 60dcd0538fSMark Fasheh * The idea here is to be as generic as possible with the tree 61dcd0538fSMark Fasheh * manipulation code. 62dcd0538fSMark Fasheh */ 63dcd0538fSMark Fasheh struct ocfs2_path_item { 64dcd0538fSMark Fasheh struct buffer_head *bh; 65dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 66dcd0538fSMark Fasheh }; 67dcd0538fSMark Fasheh 68dcd0538fSMark Fasheh #define OCFS2_MAX_PATH_DEPTH 5 69dcd0538fSMark Fasheh 70dcd0538fSMark Fasheh struct ocfs2_path { 71dcd0538fSMark Fasheh int p_tree_depth; 72dcd0538fSMark Fasheh struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; 73dcd0538fSMark Fasheh }; 74dcd0538fSMark Fasheh 75dcd0538fSMark Fasheh #define path_root_bh(_path) ((_path)->p_node[0].bh) 76dcd0538fSMark Fasheh #define path_root_el(_path) ((_path)->p_node[0].el) 77dcd0538fSMark Fasheh #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) 78dcd0538fSMark Fasheh #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) 79dcd0538fSMark Fasheh #define path_num_items(_path) ((_path)->p_tree_depth + 1) 80dcd0538fSMark Fasheh 81dcd0538fSMark Fasheh /* 82dcd0538fSMark Fasheh * Reset the actual path elements so that we can re-use the structure 83dcd0538fSMark Fasheh * to build another path. Generally, this involves freeing the buffer 84dcd0538fSMark Fasheh * heads. 85dcd0538fSMark Fasheh */ 86dcd0538fSMark Fasheh static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) 87dcd0538fSMark Fasheh { 88dcd0538fSMark Fasheh int i, start = 0, depth = 0; 89dcd0538fSMark Fasheh struct ocfs2_path_item *node; 90dcd0538fSMark Fasheh 91dcd0538fSMark Fasheh if (keep_root) 92dcd0538fSMark Fasheh start = 1; 93dcd0538fSMark Fasheh 94dcd0538fSMark Fasheh for(i = start; i < path_num_items(path); i++) { 95dcd0538fSMark Fasheh node = &path->p_node[i]; 96dcd0538fSMark Fasheh 97dcd0538fSMark Fasheh brelse(node->bh); 98dcd0538fSMark Fasheh node->bh = NULL; 99dcd0538fSMark Fasheh node->el = NULL; 100dcd0538fSMark Fasheh } 101dcd0538fSMark Fasheh 102dcd0538fSMark Fasheh /* 103dcd0538fSMark Fasheh * Tree depth may change during truncate, or insert. If we're 104dcd0538fSMark Fasheh * keeping the root extent list, then make sure that our path 105dcd0538fSMark Fasheh * structure reflects the proper depth. 106dcd0538fSMark Fasheh */ 107dcd0538fSMark Fasheh if (keep_root) 108dcd0538fSMark Fasheh depth = le16_to_cpu(path_root_el(path)->l_tree_depth); 109dcd0538fSMark Fasheh 110dcd0538fSMark Fasheh path->p_tree_depth = depth; 111dcd0538fSMark Fasheh } 112dcd0538fSMark Fasheh 113dcd0538fSMark Fasheh static void ocfs2_free_path(struct ocfs2_path *path) 114dcd0538fSMark Fasheh { 115dcd0538fSMark Fasheh if (path) { 116dcd0538fSMark Fasheh ocfs2_reinit_path(path, 0); 117dcd0538fSMark Fasheh kfree(path); 118dcd0538fSMark Fasheh } 119dcd0538fSMark Fasheh } 120dcd0538fSMark Fasheh 121dcd0538fSMark Fasheh /* 122dcd0538fSMark Fasheh * Make the *dest path the same as src and re-initialize src path to 123dcd0538fSMark Fasheh * have a root only. 124dcd0538fSMark Fasheh */ 125dcd0538fSMark Fasheh static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) 126dcd0538fSMark Fasheh { 127dcd0538fSMark Fasheh int i; 128dcd0538fSMark Fasheh 129dcd0538fSMark Fasheh BUG_ON(path_root_bh(dest) != path_root_bh(src)); 130dcd0538fSMark Fasheh 131dcd0538fSMark Fasheh for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 132dcd0538fSMark Fasheh brelse(dest->p_node[i].bh); 133dcd0538fSMark Fasheh 134dcd0538fSMark Fasheh dest->p_node[i].bh = src->p_node[i].bh; 135dcd0538fSMark Fasheh dest->p_node[i].el = src->p_node[i].el; 136dcd0538fSMark Fasheh 137dcd0538fSMark Fasheh src->p_node[i].bh = NULL; 138dcd0538fSMark Fasheh src->p_node[i].el = NULL; 139dcd0538fSMark Fasheh } 140dcd0538fSMark Fasheh } 141dcd0538fSMark Fasheh 142dcd0538fSMark Fasheh /* 143dcd0538fSMark Fasheh * Insert an extent block at given index. 144dcd0538fSMark Fasheh * 145dcd0538fSMark Fasheh * This will not take an additional reference on eb_bh. 146dcd0538fSMark Fasheh */ 147dcd0538fSMark Fasheh static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, 148dcd0538fSMark Fasheh struct buffer_head *eb_bh) 149dcd0538fSMark Fasheh { 150dcd0538fSMark Fasheh struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; 151dcd0538fSMark Fasheh 152dcd0538fSMark Fasheh /* 153dcd0538fSMark Fasheh * Right now, no root bh is an extent block, so this helps 154dcd0538fSMark Fasheh * catch code errors with dinode trees. The assertion can be 155dcd0538fSMark Fasheh * safely removed if we ever need to insert extent block 156dcd0538fSMark Fasheh * structures at the root. 157dcd0538fSMark Fasheh */ 158dcd0538fSMark Fasheh BUG_ON(index == 0); 159dcd0538fSMark Fasheh 160dcd0538fSMark Fasheh path->p_node[index].bh = eb_bh; 161dcd0538fSMark Fasheh path->p_node[index].el = &eb->h_list; 162dcd0538fSMark Fasheh } 163dcd0538fSMark Fasheh 164dcd0538fSMark Fasheh static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, 165dcd0538fSMark Fasheh struct ocfs2_extent_list *root_el) 166dcd0538fSMark Fasheh { 167dcd0538fSMark Fasheh struct ocfs2_path *path; 168dcd0538fSMark Fasheh 169dcd0538fSMark Fasheh BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); 170dcd0538fSMark Fasheh 171dcd0538fSMark Fasheh path = kzalloc(sizeof(*path), GFP_NOFS); 172dcd0538fSMark Fasheh if (path) { 173dcd0538fSMark Fasheh path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); 174dcd0538fSMark Fasheh get_bh(root_bh); 175dcd0538fSMark Fasheh path_root_bh(path) = root_bh; 176dcd0538fSMark Fasheh path_root_el(path) = root_el; 177dcd0538fSMark Fasheh } 178dcd0538fSMark Fasheh 179dcd0538fSMark Fasheh return path; 180dcd0538fSMark Fasheh } 181dcd0538fSMark Fasheh 182dcd0538fSMark Fasheh /* 183dcd0538fSMark Fasheh * Allocate and initialize a new path based on a disk inode tree. 184dcd0538fSMark Fasheh */ 185dcd0538fSMark Fasheh static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) 186dcd0538fSMark Fasheh { 187dcd0538fSMark Fasheh struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 188dcd0538fSMark Fasheh struct ocfs2_extent_list *el = &di->id2.i_list; 189dcd0538fSMark Fasheh 190dcd0538fSMark Fasheh return ocfs2_new_path(di_bh, el); 191dcd0538fSMark Fasheh } 192dcd0538fSMark Fasheh 193dcd0538fSMark Fasheh /* 194dcd0538fSMark Fasheh * Convenience function to journal all components in a path. 195dcd0538fSMark Fasheh */ 196dcd0538fSMark Fasheh static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 197dcd0538fSMark Fasheh struct ocfs2_path *path) 198dcd0538fSMark Fasheh { 199dcd0538fSMark Fasheh int i, ret = 0; 200dcd0538fSMark Fasheh 201dcd0538fSMark Fasheh if (!path) 202dcd0538fSMark Fasheh goto out; 203dcd0538fSMark Fasheh 204dcd0538fSMark Fasheh for(i = 0; i < path_num_items(path); i++) { 205dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, 206dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 207dcd0538fSMark Fasheh if (ret < 0) { 208dcd0538fSMark Fasheh mlog_errno(ret); 209dcd0538fSMark Fasheh goto out; 210dcd0538fSMark Fasheh } 211dcd0538fSMark Fasheh } 212dcd0538fSMark Fasheh 213dcd0538fSMark Fasheh out: 214dcd0538fSMark Fasheh return ret; 215dcd0538fSMark Fasheh } 216dcd0538fSMark Fasheh 217dcd0538fSMark Fasheh enum ocfs2_contig_type { 218dcd0538fSMark Fasheh CONTIG_NONE = 0, 219dcd0538fSMark Fasheh CONTIG_LEFT, 220dcd0538fSMark Fasheh CONTIG_RIGHT 221dcd0538fSMark Fasheh }; 222dcd0538fSMark Fasheh 223e48edee2SMark Fasheh 224e48edee2SMark Fasheh /* 225e48edee2SMark Fasheh * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and 226e48edee2SMark Fasheh * ocfs2_extent_contig only work properly against leaf nodes! 227e48edee2SMark Fasheh */ 228dcd0538fSMark Fasheh static int ocfs2_block_extent_contig(struct super_block *sb, 229ccd979bdSMark Fasheh struct ocfs2_extent_rec *ext, 230ccd979bdSMark Fasheh u64 blkno) 231ccd979bdSMark Fasheh { 232e48edee2SMark Fasheh u64 blk_end = le64_to_cpu(ext->e_blkno); 233e48edee2SMark Fasheh 234e48edee2SMark Fasheh blk_end += ocfs2_clusters_to_blocks(sb, 235e48edee2SMark Fasheh le16_to_cpu(ext->e_leaf_clusters)); 236e48edee2SMark Fasheh 237e48edee2SMark Fasheh return blkno == blk_end; 238ccd979bdSMark Fasheh } 239ccd979bdSMark Fasheh 240dcd0538fSMark Fasheh static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, 241dcd0538fSMark Fasheh struct ocfs2_extent_rec *right) 242dcd0538fSMark Fasheh { 243e48edee2SMark Fasheh u32 left_range; 244e48edee2SMark Fasheh 245e48edee2SMark Fasheh left_range = le32_to_cpu(left->e_cpos) + 246e48edee2SMark Fasheh le16_to_cpu(left->e_leaf_clusters); 247e48edee2SMark Fasheh 248e48edee2SMark Fasheh return (left_range == le32_to_cpu(right->e_cpos)); 249dcd0538fSMark Fasheh } 250dcd0538fSMark Fasheh 251dcd0538fSMark Fasheh static enum ocfs2_contig_type 252dcd0538fSMark Fasheh ocfs2_extent_contig(struct inode *inode, 253dcd0538fSMark Fasheh struct ocfs2_extent_rec *ext, 254dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 255dcd0538fSMark Fasheh { 256dcd0538fSMark Fasheh u64 blkno = le64_to_cpu(insert_rec->e_blkno); 257dcd0538fSMark Fasheh 258dcd0538fSMark Fasheh if (ocfs2_extents_adjacent(ext, insert_rec) && 259dcd0538fSMark Fasheh ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 260dcd0538fSMark Fasheh return CONTIG_RIGHT; 261dcd0538fSMark Fasheh 262dcd0538fSMark Fasheh blkno = le64_to_cpu(ext->e_blkno); 263dcd0538fSMark Fasheh if (ocfs2_extents_adjacent(insert_rec, ext) && 264dcd0538fSMark Fasheh ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) 265dcd0538fSMark Fasheh return CONTIG_LEFT; 266dcd0538fSMark Fasheh 267dcd0538fSMark Fasheh return CONTIG_NONE; 268dcd0538fSMark Fasheh } 269dcd0538fSMark Fasheh 270dcd0538fSMark Fasheh /* 271dcd0538fSMark Fasheh * NOTE: We can have pretty much any combination of contiguousness and 272dcd0538fSMark Fasheh * appending. 273dcd0538fSMark Fasheh * 274dcd0538fSMark Fasheh * The usefulness of APPEND_TAIL is more in that it lets us know that 275dcd0538fSMark Fasheh * we'll have to update the path to that leaf. 276dcd0538fSMark Fasheh */ 277dcd0538fSMark Fasheh enum ocfs2_append_type { 278dcd0538fSMark Fasheh APPEND_NONE = 0, 279dcd0538fSMark Fasheh APPEND_TAIL, 280dcd0538fSMark Fasheh }; 281dcd0538fSMark Fasheh 282dcd0538fSMark Fasheh struct ocfs2_insert_type { 283dcd0538fSMark Fasheh enum ocfs2_append_type ins_appending; 284dcd0538fSMark Fasheh enum ocfs2_contig_type ins_contig; 285dcd0538fSMark Fasheh int ins_contig_index; 286dcd0538fSMark Fasheh int ins_free_records; 287dcd0538fSMark Fasheh int ins_tree_depth; 288dcd0538fSMark Fasheh }; 289dcd0538fSMark Fasheh 290ccd979bdSMark Fasheh /* 291ccd979bdSMark Fasheh * How many free extents have we got before we need more meta data? 292ccd979bdSMark Fasheh */ 293ccd979bdSMark Fasheh int ocfs2_num_free_extents(struct ocfs2_super *osb, 294ccd979bdSMark Fasheh struct inode *inode, 295ccd979bdSMark Fasheh struct ocfs2_dinode *fe) 296ccd979bdSMark Fasheh { 297ccd979bdSMark Fasheh int retval; 298ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 299ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 300ccd979bdSMark Fasheh struct buffer_head *eb_bh = NULL; 301ccd979bdSMark Fasheh 302ccd979bdSMark Fasheh mlog_entry_void(); 303ccd979bdSMark Fasheh 304ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(fe)) { 305ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 306ccd979bdSMark Fasheh retval = -EIO; 307ccd979bdSMark Fasheh goto bail; 308ccd979bdSMark Fasheh } 309ccd979bdSMark Fasheh 310ccd979bdSMark Fasheh if (fe->i_last_eb_blk) { 311ccd979bdSMark Fasheh retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 312ccd979bdSMark Fasheh &eb_bh, OCFS2_BH_CACHED, inode); 313ccd979bdSMark Fasheh if (retval < 0) { 314ccd979bdSMark Fasheh mlog_errno(retval); 315ccd979bdSMark Fasheh goto bail; 316ccd979bdSMark Fasheh } 317ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) eb_bh->b_data; 318ccd979bdSMark Fasheh el = &eb->h_list; 319ccd979bdSMark Fasheh } else 320ccd979bdSMark Fasheh el = &fe->id2.i_list; 321ccd979bdSMark Fasheh 322ccd979bdSMark Fasheh BUG_ON(el->l_tree_depth != 0); 323ccd979bdSMark Fasheh 324ccd979bdSMark Fasheh retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); 325ccd979bdSMark Fasheh bail: 326ccd979bdSMark Fasheh if (eb_bh) 327ccd979bdSMark Fasheh brelse(eb_bh); 328ccd979bdSMark Fasheh 329ccd979bdSMark Fasheh mlog_exit(retval); 330ccd979bdSMark Fasheh return retval; 331ccd979bdSMark Fasheh } 332ccd979bdSMark Fasheh 333ccd979bdSMark Fasheh /* expects array to already be allocated 334ccd979bdSMark Fasheh * 335ccd979bdSMark Fasheh * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and 336ccd979bdSMark Fasheh * l_count for you 337ccd979bdSMark Fasheh */ 338ccd979bdSMark Fasheh static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 3391fabe148SMark Fasheh handle_t *handle, 340ccd979bdSMark Fasheh struct inode *inode, 341ccd979bdSMark Fasheh int wanted, 342ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac, 343ccd979bdSMark Fasheh struct buffer_head *bhs[]) 344ccd979bdSMark Fasheh { 345ccd979bdSMark Fasheh int count, status, i; 346ccd979bdSMark Fasheh u16 suballoc_bit_start; 347ccd979bdSMark Fasheh u32 num_got; 348ccd979bdSMark Fasheh u64 first_blkno; 349ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 350ccd979bdSMark Fasheh 351ccd979bdSMark Fasheh mlog_entry_void(); 352ccd979bdSMark Fasheh 353ccd979bdSMark Fasheh count = 0; 354ccd979bdSMark Fasheh while (count < wanted) { 355ccd979bdSMark Fasheh status = ocfs2_claim_metadata(osb, 356ccd979bdSMark Fasheh handle, 357ccd979bdSMark Fasheh meta_ac, 358ccd979bdSMark Fasheh wanted - count, 359ccd979bdSMark Fasheh &suballoc_bit_start, 360ccd979bdSMark Fasheh &num_got, 361ccd979bdSMark Fasheh &first_blkno); 362ccd979bdSMark Fasheh if (status < 0) { 363ccd979bdSMark Fasheh mlog_errno(status); 364ccd979bdSMark Fasheh goto bail; 365ccd979bdSMark Fasheh } 366ccd979bdSMark Fasheh 367ccd979bdSMark Fasheh for(i = count; i < (num_got + count); i++) { 368ccd979bdSMark Fasheh bhs[i] = sb_getblk(osb->sb, first_blkno); 369ccd979bdSMark Fasheh if (bhs[i] == NULL) { 370ccd979bdSMark Fasheh status = -EIO; 371ccd979bdSMark Fasheh mlog_errno(status); 372ccd979bdSMark Fasheh goto bail; 373ccd979bdSMark Fasheh } 374ccd979bdSMark Fasheh ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 375ccd979bdSMark Fasheh 376ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, bhs[i], 377ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_CREATE); 378ccd979bdSMark Fasheh if (status < 0) { 379ccd979bdSMark Fasheh mlog_errno(status); 380ccd979bdSMark Fasheh goto bail; 381ccd979bdSMark Fasheh } 382ccd979bdSMark Fasheh 383ccd979bdSMark Fasheh memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); 384ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bhs[i]->b_data; 385ccd979bdSMark Fasheh /* Ok, setup the minimal stuff here. */ 386ccd979bdSMark Fasheh strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 387ccd979bdSMark Fasheh eb->h_blkno = cpu_to_le64(first_blkno); 388ccd979bdSMark Fasheh eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 389ccd979bdSMark Fasheh 390ccd979bdSMark Fasheh #ifndef OCFS2_USE_ALL_METADATA_SUBALLOCATORS 391ccd979bdSMark Fasheh /* we always use slot zero's suballocator */ 392ccd979bdSMark Fasheh eb->h_suballoc_slot = 0; 393ccd979bdSMark Fasheh #else 394ccd979bdSMark Fasheh eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 395ccd979bdSMark Fasheh #endif 396ccd979bdSMark Fasheh eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 397ccd979bdSMark Fasheh eb->h_list.l_count = 398ccd979bdSMark Fasheh cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 399ccd979bdSMark Fasheh 400ccd979bdSMark Fasheh suballoc_bit_start++; 401ccd979bdSMark Fasheh first_blkno++; 402ccd979bdSMark Fasheh 403ccd979bdSMark Fasheh /* We'll also be dirtied by the caller, so 404ccd979bdSMark Fasheh * this isn't absolutely necessary. */ 405ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, bhs[i]); 406ccd979bdSMark Fasheh if (status < 0) { 407ccd979bdSMark Fasheh mlog_errno(status); 408ccd979bdSMark Fasheh goto bail; 409ccd979bdSMark Fasheh } 410ccd979bdSMark Fasheh } 411ccd979bdSMark Fasheh 412ccd979bdSMark Fasheh count += num_got; 413ccd979bdSMark Fasheh } 414ccd979bdSMark Fasheh 415ccd979bdSMark Fasheh status = 0; 416ccd979bdSMark Fasheh bail: 417ccd979bdSMark Fasheh if (status < 0) { 418ccd979bdSMark Fasheh for(i = 0; i < wanted; i++) { 419ccd979bdSMark Fasheh if (bhs[i]) 420ccd979bdSMark Fasheh brelse(bhs[i]); 421ccd979bdSMark Fasheh bhs[i] = NULL; 422ccd979bdSMark Fasheh } 423ccd979bdSMark Fasheh } 424ccd979bdSMark Fasheh mlog_exit(status); 425ccd979bdSMark Fasheh return status; 426ccd979bdSMark Fasheh } 427ccd979bdSMark Fasheh 428ccd979bdSMark Fasheh /* 429dcd0538fSMark Fasheh * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). 430dcd0538fSMark Fasheh * 431dcd0538fSMark Fasheh * Returns the sum of the rightmost extent rec logical offset and 432dcd0538fSMark Fasheh * cluster count. 433dcd0538fSMark Fasheh * 434dcd0538fSMark Fasheh * ocfs2_add_branch() uses this to determine what logical cluster 435dcd0538fSMark Fasheh * value should be populated into the leftmost new branch records. 436dcd0538fSMark Fasheh * 437dcd0538fSMark Fasheh * ocfs2_shift_tree_depth() uses this to determine the # clusters 438dcd0538fSMark Fasheh * value for the new topmost tree record. 439dcd0538fSMark Fasheh */ 440dcd0538fSMark Fasheh static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) 441dcd0538fSMark Fasheh { 442dcd0538fSMark Fasheh int i; 443dcd0538fSMark Fasheh 444dcd0538fSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 445dcd0538fSMark Fasheh 446dcd0538fSMark Fasheh return le32_to_cpu(el->l_recs[i].e_cpos) + 447e48edee2SMark Fasheh ocfs2_rec_clusters(el, &el->l_recs[i]); 448dcd0538fSMark Fasheh } 449dcd0538fSMark Fasheh 450dcd0538fSMark Fasheh /* 451ccd979bdSMark Fasheh * Add an entire tree branch to our inode. eb_bh is the extent block 452ccd979bdSMark Fasheh * to start at, if we don't want to start the branch at the dinode 453ccd979bdSMark Fasheh * structure. 454ccd979bdSMark Fasheh * 455ccd979bdSMark Fasheh * last_eb_bh is required as we have to update it's next_leaf pointer 456ccd979bdSMark Fasheh * for the new last extent block. 457ccd979bdSMark Fasheh * 458ccd979bdSMark Fasheh * the new branch will be 'empty' in the sense that every block will 459e48edee2SMark Fasheh * contain a single record with cluster count == 0. 460ccd979bdSMark Fasheh */ 461ccd979bdSMark Fasheh static int ocfs2_add_branch(struct ocfs2_super *osb, 4621fabe148SMark Fasheh handle_t *handle, 463ccd979bdSMark Fasheh struct inode *inode, 464ccd979bdSMark Fasheh struct buffer_head *fe_bh, 465ccd979bdSMark Fasheh struct buffer_head *eb_bh, 466ccd979bdSMark Fasheh struct buffer_head *last_eb_bh, 467ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac) 468ccd979bdSMark Fasheh { 469ccd979bdSMark Fasheh int status, new_blocks, i; 470ccd979bdSMark Fasheh u64 next_blkno, new_last_eb_blk; 471ccd979bdSMark Fasheh struct buffer_head *bh; 472ccd979bdSMark Fasheh struct buffer_head **new_eb_bhs = NULL; 473ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 474ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 475ccd979bdSMark Fasheh struct ocfs2_extent_list *eb_el; 476ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 477dcd0538fSMark Fasheh u32 new_cpos; 478ccd979bdSMark Fasheh 479ccd979bdSMark Fasheh mlog_entry_void(); 480ccd979bdSMark Fasheh 481ccd979bdSMark Fasheh BUG_ON(!last_eb_bh); 482ccd979bdSMark Fasheh 483ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 484ccd979bdSMark Fasheh 485ccd979bdSMark Fasheh if (eb_bh) { 486ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) eb_bh->b_data; 487ccd979bdSMark Fasheh el = &eb->h_list; 488ccd979bdSMark Fasheh } else 489ccd979bdSMark Fasheh el = &fe->id2.i_list; 490ccd979bdSMark Fasheh 491ccd979bdSMark Fasheh /* we never add a branch to a leaf. */ 492ccd979bdSMark Fasheh BUG_ON(!el->l_tree_depth); 493ccd979bdSMark Fasheh 494ccd979bdSMark Fasheh new_blocks = le16_to_cpu(el->l_tree_depth); 495ccd979bdSMark Fasheh 496ccd979bdSMark Fasheh /* allocate the number of new eb blocks we need */ 497ccd979bdSMark Fasheh new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), 498ccd979bdSMark Fasheh GFP_KERNEL); 499ccd979bdSMark Fasheh if (!new_eb_bhs) { 500ccd979bdSMark Fasheh status = -ENOMEM; 501ccd979bdSMark Fasheh mlog_errno(status); 502ccd979bdSMark Fasheh goto bail; 503ccd979bdSMark Fasheh } 504ccd979bdSMark Fasheh 505ccd979bdSMark Fasheh status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, 506ccd979bdSMark Fasheh meta_ac, new_eb_bhs); 507ccd979bdSMark Fasheh if (status < 0) { 508ccd979bdSMark Fasheh mlog_errno(status); 509ccd979bdSMark Fasheh goto bail; 510ccd979bdSMark Fasheh } 511ccd979bdSMark Fasheh 512dcd0538fSMark Fasheh eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; 513dcd0538fSMark Fasheh new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 514dcd0538fSMark Fasheh 515ccd979bdSMark Fasheh /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 516ccd979bdSMark Fasheh * linked with the rest of the tree. 517ccd979bdSMark Fasheh * conversly, new_eb_bhs[0] is the new bottommost leaf. 518ccd979bdSMark Fasheh * 519ccd979bdSMark Fasheh * when we leave the loop, new_last_eb_blk will point to the 520ccd979bdSMark Fasheh * newest leaf, and next_blkno will point to the topmost extent 521ccd979bdSMark Fasheh * block. */ 522ccd979bdSMark Fasheh next_blkno = new_last_eb_blk = 0; 523ccd979bdSMark Fasheh for(i = 0; i < new_blocks; i++) { 524ccd979bdSMark Fasheh bh = new_eb_bhs[i]; 525ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 526ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 527ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 528ccd979bdSMark Fasheh status = -EIO; 529ccd979bdSMark Fasheh goto bail; 530ccd979bdSMark Fasheh } 531ccd979bdSMark Fasheh eb_el = &eb->h_list; 532ccd979bdSMark Fasheh 533ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, bh, 534ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_CREATE); 535ccd979bdSMark Fasheh if (status < 0) { 536ccd979bdSMark Fasheh mlog_errno(status); 537ccd979bdSMark Fasheh goto bail; 538ccd979bdSMark Fasheh } 539ccd979bdSMark Fasheh 540ccd979bdSMark Fasheh eb->h_next_leaf_blk = 0; 541ccd979bdSMark Fasheh eb_el->l_tree_depth = cpu_to_le16(i); 542ccd979bdSMark Fasheh eb_el->l_next_free_rec = cpu_to_le16(1); 543dcd0538fSMark Fasheh /* 544dcd0538fSMark Fasheh * This actually counts as an empty extent as 545dcd0538fSMark Fasheh * c_clusters == 0 546dcd0538fSMark Fasheh */ 547dcd0538fSMark Fasheh eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); 548ccd979bdSMark Fasheh eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); 549e48edee2SMark Fasheh /* 550e48edee2SMark Fasheh * eb_el isn't always an interior node, but even leaf 551e48edee2SMark Fasheh * nodes want a zero'd flags and reserved field so 552e48edee2SMark Fasheh * this gets the whole 32 bits regardless of use. 553e48edee2SMark Fasheh */ 554e48edee2SMark Fasheh eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); 555ccd979bdSMark Fasheh if (!eb_el->l_tree_depth) 556ccd979bdSMark Fasheh new_last_eb_blk = le64_to_cpu(eb->h_blkno); 557ccd979bdSMark Fasheh 558ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, bh); 559ccd979bdSMark Fasheh if (status < 0) { 560ccd979bdSMark Fasheh mlog_errno(status); 561ccd979bdSMark Fasheh goto bail; 562ccd979bdSMark Fasheh } 563ccd979bdSMark Fasheh 564ccd979bdSMark Fasheh next_blkno = le64_to_cpu(eb->h_blkno); 565ccd979bdSMark Fasheh } 566ccd979bdSMark Fasheh 567ccd979bdSMark Fasheh /* This is a bit hairy. We want to update up to three blocks 568ccd979bdSMark Fasheh * here without leaving any of them in an inconsistent state 569ccd979bdSMark Fasheh * in case of error. We don't have to worry about 570ccd979bdSMark Fasheh * journal_dirty erroring as it won't unless we've aborted the 571ccd979bdSMark Fasheh * handle (in which case we would never be here) so reserving 572ccd979bdSMark Fasheh * the write with journal_access is all we need to do. */ 573ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, last_eb_bh, 574ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 575ccd979bdSMark Fasheh if (status < 0) { 576ccd979bdSMark Fasheh mlog_errno(status); 577ccd979bdSMark Fasheh goto bail; 578ccd979bdSMark Fasheh } 579ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, fe_bh, 580ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 581ccd979bdSMark Fasheh if (status < 0) { 582ccd979bdSMark Fasheh mlog_errno(status); 583ccd979bdSMark Fasheh goto bail; 584ccd979bdSMark Fasheh } 585ccd979bdSMark Fasheh if (eb_bh) { 586ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, eb_bh, 587ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 588ccd979bdSMark Fasheh if (status < 0) { 589ccd979bdSMark Fasheh mlog_errno(status); 590ccd979bdSMark Fasheh goto bail; 591ccd979bdSMark Fasheh } 592ccd979bdSMark Fasheh } 593ccd979bdSMark Fasheh 594ccd979bdSMark Fasheh /* Link the new branch into the rest of the tree (el will 595ccd979bdSMark Fasheh * either be on the fe, or the extent block passed in. */ 596ccd979bdSMark Fasheh i = le16_to_cpu(el->l_next_free_rec); 597ccd979bdSMark Fasheh el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 598dcd0538fSMark Fasheh el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 599e48edee2SMark Fasheh el->l_recs[i].e_int_clusters = 0; 600ccd979bdSMark Fasheh le16_add_cpu(&el->l_next_free_rec, 1); 601ccd979bdSMark Fasheh 602ccd979bdSMark Fasheh /* fe needs a new last extent block pointer, as does the 603ccd979bdSMark Fasheh * next_leaf on the previously last-extent-block. */ 604ccd979bdSMark Fasheh fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 605ccd979bdSMark Fasheh 606ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 607ccd979bdSMark Fasheh eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 608ccd979bdSMark Fasheh 609ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, last_eb_bh); 610ccd979bdSMark Fasheh if (status < 0) 611ccd979bdSMark Fasheh mlog_errno(status); 612ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, fe_bh); 613ccd979bdSMark Fasheh if (status < 0) 614ccd979bdSMark Fasheh mlog_errno(status); 615ccd979bdSMark Fasheh if (eb_bh) { 616ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, eb_bh); 617ccd979bdSMark Fasheh if (status < 0) 618ccd979bdSMark Fasheh mlog_errno(status); 619ccd979bdSMark Fasheh } 620ccd979bdSMark Fasheh 621ccd979bdSMark Fasheh status = 0; 622ccd979bdSMark Fasheh bail: 623ccd979bdSMark Fasheh if (new_eb_bhs) { 624ccd979bdSMark Fasheh for (i = 0; i < new_blocks; i++) 625ccd979bdSMark Fasheh if (new_eb_bhs[i]) 626ccd979bdSMark Fasheh brelse(new_eb_bhs[i]); 627ccd979bdSMark Fasheh kfree(new_eb_bhs); 628ccd979bdSMark Fasheh } 629ccd979bdSMark Fasheh 630ccd979bdSMark Fasheh mlog_exit(status); 631ccd979bdSMark Fasheh return status; 632ccd979bdSMark Fasheh } 633ccd979bdSMark Fasheh 634ccd979bdSMark Fasheh /* 635ccd979bdSMark Fasheh * adds another level to the allocation tree. 636ccd979bdSMark Fasheh * returns back the new extent block so you can add a branch to it 637ccd979bdSMark Fasheh * after this call. 638ccd979bdSMark Fasheh */ 639ccd979bdSMark Fasheh static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 6401fabe148SMark Fasheh handle_t *handle, 641ccd979bdSMark Fasheh struct inode *inode, 642ccd979bdSMark Fasheh struct buffer_head *fe_bh, 643ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac, 644ccd979bdSMark Fasheh struct buffer_head **ret_new_eb_bh) 645ccd979bdSMark Fasheh { 646ccd979bdSMark Fasheh int status, i; 647dcd0538fSMark Fasheh u32 new_clusters; 648ccd979bdSMark Fasheh struct buffer_head *new_eb_bh = NULL; 649ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 650ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 651ccd979bdSMark Fasheh struct ocfs2_extent_list *fe_el; 652ccd979bdSMark Fasheh struct ocfs2_extent_list *eb_el; 653ccd979bdSMark Fasheh 654ccd979bdSMark Fasheh mlog_entry_void(); 655ccd979bdSMark Fasheh 656ccd979bdSMark Fasheh status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, 657ccd979bdSMark Fasheh &new_eb_bh); 658ccd979bdSMark Fasheh if (status < 0) { 659ccd979bdSMark Fasheh mlog_errno(status); 660ccd979bdSMark Fasheh goto bail; 661ccd979bdSMark Fasheh } 662ccd979bdSMark Fasheh 663ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; 664ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 665ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 666ccd979bdSMark Fasheh status = -EIO; 667ccd979bdSMark Fasheh goto bail; 668ccd979bdSMark Fasheh } 669ccd979bdSMark Fasheh 670ccd979bdSMark Fasheh eb_el = &eb->h_list; 671ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 672ccd979bdSMark Fasheh fe_el = &fe->id2.i_list; 673ccd979bdSMark Fasheh 674ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, new_eb_bh, 675ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_CREATE); 676ccd979bdSMark Fasheh if (status < 0) { 677ccd979bdSMark Fasheh mlog_errno(status); 678ccd979bdSMark Fasheh goto bail; 679ccd979bdSMark Fasheh } 680ccd979bdSMark Fasheh 681ccd979bdSMark Fasheh /* copy the fe data into the new extent block */ 682ccd979bdSMark Fasheh eb_el->l_tree_depth = fe_el->l_tree_depth; 683ccd979bdSMark Fasheh eb_el->l_next_free_rec = fe_el->l_next_free_rec; 684e48edee2SMark Fasheh for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 685e48edee2SMark Fasheh eb_el->l_recs[i] = fe_el->l_recs[i]; 686ccd979bdSMark Fasheh 687ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, new_eb_bh); 688ccd979bdSMark Fasheh if (status < 0) { 689ccd979bdSMark Fasheh mlog_errno(status); 690ccd979bdSMark Fasheh goto bail; 691ccd979bdSMark Fasheh } 692ccd979bdSMark Fasheh 693ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, fe_bh, 694ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 695ccd979bdSMark Fasheh if (status < 0) { 696ccd979bdSMark Fasheh mlog_errno(status); 697ccd979bdSMark Fasheh goto bail; 698ccd979bdSMark Fasheh } 699ccd979bdSMark Fasheh 700dcd0538fSMark Fasheh new_clusters = ocfs2_sum_rightmost_rec(eb_el); 701dcd0538fSMark Fasheh 702ccd979bdSMark Fasheh /* update fe now */ 703ccd979bdSMark Fasheh le16_add_cpu(&fe_el->l_tree_depth, 1); 704ccd979bdSMark Fasheh fe_el->l_recs[0].e_cpos = 0; 705ccd979bdSMark Fasheh fe_el->l_recs[0].e_blkno = eb->h_blkno; 706e48edee2SMark Fasheh fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); 707e48edee2SMark Fasheh for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 708e48edee2SMark Fasheh memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 709ccd979bdSMark Fasheh fe_el->l_next_free_rec = cpu_to_le16(1); 710ccd979bdSMark Fasheh 711ccd979bdSMark Fasheh /* If this is our 1st tree depth shift, then last_eb_blk 712ccd979bdSMark Fasheh * becomes the allocated extent block */ 713ccd979bdSMark Fasheh if (fe_el->l_tree_depth == cpu_to_le16(1)) 714ccd979bdSMark Fasheh fe->i_last_eb_blk = eb->h_blkno; 715ccd979bdSMark Fasheh 716ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, fe_bh); 717ccd979bdSMark Fasheh if (status < 0) { 718ccd979bdSMark Fasheh mlog_errno(status); 719ccd979bdSMark Fasheh goto bail; 720ccd979bdSMark Fasheh } 721ccd979bdSMark Fasheh 722ccd979bdSMark Fasheh *ret_new_eb_bh = new_eb_bh; 723ccd979bdSMark Fasheh new_eb_bh = NULL; 724ccd979bdSMark Fasheh status = 0; 725ccd979bdSMark Fasheh bail: 726ccd979bdSMark Fasheh if (new_eb_bh) 727ccd979bdSMark Fasheh brelse(new_eb_bh); 728ccd979bdSMark Fasheh 729ccd979bdSMark Fasheh mlog_exit(status); 730ccd979bdSMark Fasheh return status; 731ccd979bdSMark Fasheh } 732ccd979bdSMark Fasheh 733ccd979bdSMark Fasheh /* 734ccd979bdSMark Fasheh * Should only be called when there is no space left in any of the 735ccd979bdSMark Fasheh * leaf nodes. What we want to do is find the lowest tree depth 736ccd979bdSMark Fasheh * non-leaf extent block with room for new records. There are three 737ccd979bdSMark Fasheh * valid results of this search: 738ccd979bdSMark Fasheh * 739ccd979bdSMark Fasheh * 1) a lowest extent block is found, then we pass it back in 740ccd979bdSMark Fasheh * *lowest_eb_bh and return '0' 741ccd979bdSMark Fasheh * 742ccd979bdSMark Fasheh * 2) the search fails to find anything, but the dinode has room. We 743ccd979bdSMark Fasheh * pass NULL back in *lowest_eb_bh, but still return '0' 744ccd979bdSMark Fasheh * 745ccd979bdSMark Fasheh * 3) the search fails to find anything AND the dinode is full, in 746ccd979bdSMark Fasheh * which case we return > 0 747ccd979bdSMark Fasheh * 748ccd979bdSMark Fasheh * return status < 0 indicates an error. 749ccd979bdSMark Fasheh */ 750ccd979bdSMark Fasheh static int ocfs2_find_branch_target(struct ocfs2_super *osb, 751ccd979bdSMark Fasheh struct inode *inode, 752ccd979bdSMark Fasheh struct buffer_head *fe_bh, 753ccd979bdSMark Fasheh struct buffer_head **target_bh) 754ccd979bdSMark Fasheh { 755ccd979bdSMark Fasheh int status = 0, i; 756ccd979bdSMark Fasheh u64 blkno; 757ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 758ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 759ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 760ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 761ccd979bdSMark Fasheh struct buffer_head *lowest_bh = NULL; 762ccd979bdSMark Fasheh 763ccd979bdSMark Fasheh mlog_entry_void(); 764ccd979bdSMark Fasheh 765ccd979bdSMark Fasheh *target_bh = NULL; 766ccd979bdSMark Fasheh 767ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 768ccd979bdSMark Fasheh el = &fe->id2.i_list; 769ccd979bdSMark Fasheh 770ccd979bdSMark Fasheh while(le16_to_cpu(el->l_tree_depth) > 1) { 771ccd979bdSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) { 772b0697053SMark Fasheh ocfs2_error(inode->i_sb, "Dinode %llu has empty " 773ccd979bdSMark Fasheh "extent list (next_free_rec == 0)", 774b0697053SMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno); 775ccd979bdSMark Fasheh status = -EIO; 776ccd979bdSMark Fasheh goto bail; 777ccd979bdSMark Fasheh } 778ccd979bdSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 779ccd979bdSMark Fasheh blkno = le64_to_cpu(el->l_recs[i].e_blkno); 780ccd979bdSMark Fasheh if (!blkno) { 781b0697053SMark Fasheh ocfs2_error(inode->i_sb, "Dinode %llu has extent " 782ccd979bdSMark Fasheh "list where extent # %d has no physical " 783ccd979bdSMark Fasheh "block start", 784b0697053SMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, i); 785ccd979bdSMark Fasheh status = -EIO; 786ccd979bdSMark Fasheh goto bail; 787ccd979bdSMark Fasheh } 788ccd979bdSMark Fasheh 789ccd979bdSMark Fasheh if (bh) { 790ccd979bdSMark Fasheh brelse(bh); 791ccd979bdSMark Fasheh bh = NULL; 792ccd979bdSMark Fasheh } 793ccd979bdSMark Fasheh 794ccd979bdSMark Fasheh status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, 795ccd979bdSMark Fasheh inode); 796ccd979bdSMark Fasheh if (status < 0) { 797ccd979bdSMark Fasheh mlog_errno(status); 798ccd979bdSMark Fasheh goto bail; 799ccd979bdSMark Fasheh } 800ccd979bdSMark Fasheh 801ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 802ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 803ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 804ccd979bdSMark Fasheh status = -EIO; 805ccd979bdSMark Fasheh goto bail; 806ccd979bdSMark Fasheh } 807ccd979bdSMark Fasheh el = &eb->h_list; 808ccd979bdSMark Fasheh 809ccd979bdSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) < 810ccd979bdSMark Fasheh le16_to_cpu(el->l_count)) { 811ccd979bdSMark Fasheh if (lowest_bh) 812ccd979bdSMark Fasheh brelse(lowest_bh); 813ccd979bdSMark Fasheh lowest_bh = bh; 814ccd979bdSMark Fasheh get_bh(lowest_bh); 815ccd979bdSMark Fasheh } 816ccd979bdSMark Fasheh } 817ccd979bdSMark Fasheh 818ccd979bdSMark Fasheh /* If we didn't find one and the fe doesn't have any room, 819ccd979bdSMark Fasheh * then return '1' */ 820ccd979bdSMark Fasheh if (!lowest_bh 821ccd979bdSMark Fasheh && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) 822ccd979bdSMark Fasheh status = 1; 823ccd979bdSMark Fasheh 824ccd979bdSMark Fasheh *target_bh = lowest_bh; 825ccd979bdSMark Fasheh bail: 826ccd979bdSMark Fasheh if (bh) 827ccd979bdSMark Fasheh brelse(bh); 828ccd979bdSMark Fasheh 829ccd979bdSMark Fasheh mlog_exit(status); 830ccd979bdSMark Fasheh return status; 831ccd979bdSMark Fasheh } 832ccd979bdSMark Fasheh 833e48edee2SMark Fasheh /* 834e48edee2SMark Fasheh * This is only valid for leaf nodes, which are the only ones that can 835e48edee2SMark Fasheh * have empty extents anyway. 836e48edee2SMark Fasheh */ 837dcd0538fSMark Fasheh static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) 838dcd0538fSMark Fasheh { 839e48edee2SMark Fasheh return !rec->e_leaf_clusters; 840dcd0538fSMark Fasheh } 841dcd0538fSMark Fasheh 842dcd0538fSMark Fasheh /* 843dcd0538fSMark Fasheh * This function will discard the rightmost extent record. 844dcd0538fSMark Fasheh */ 845dcd0538fSMark Fasheh static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) 846dcd0538fSMark Fasheh { 847dcd0538fSMark Fasheh int next_free = le16_to_cpu(el->l_next_free_rec); 848dcd0538fSMark Fasheh int count = le16_to_cpu(el->l_count); 849dcd0538fSMark Fasheh unsigned int num_bytes; 850dcd0538fSMark Fasheh 851dcd0538fSMark Fasheh BUG_ON(!next_free); 852dcd0538fSMark Fasheh /* This will cause us to go off the end of our extent list. */ 853dcd0538fSMark Fasheh BUG_ON(next_free >= count); 854dcd0538fSMark Fasheh 855dcd0538fSMark Fasheh num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; 856dcd0538fSMark Fasheh 857dcd0538fSMark Fasheh memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); 858dcd0538fSMark Fasheh } 859dcd0538fSMark Fasheh 860dcd0538fSMark Fasheh static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, 861dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 862dcd0538fSMark Fasheh { 863dcd0538fSMark Fasheh int i, insert_index, next_free, has_empty, num_bytes; 864dcd0538fSMark Fasheh u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); 865dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 866dcd0538fSMark Fasheh 867dcd0538fSMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 868dcd0538fSMark Fasheh has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); 869dcd0538fSMark Fasheh 870dcd0538fSMark Fasheh BUG_ON(!next_free); 871dcd0538fSMark Fasheh 872dcd0538fSMark Fasheh /* The tree code before us didn't allow enough room in the leaf. */ 873dcd0538fSMark Fasheh if (el->l_next_free_rec == el->l_count && !has_empty) 874dcd0538fSMark Fasheh BUG(); 875dcd0538fSMark Fasheh 876dcd0538fSMark Fasheh /* 877dcd0538fSMark Fasheh * The easiest way to approach this is to just remove the 878dcd0538fSMark Fasheh * empty extent and temporarily decrement next_free. 879dcd0538fSMark Fasheh */ 880dcd0538fSMark Fasheh if (has_empty) { 881dcd0538fSMark Fasheh /* 882dcd0538fSMark Fasheh * If next_free was 1 (only an empty extent), this 883dcd0538fSMark Fasheh * loop won't execute, which is fine. We still want 884dcd0538fSMark Fasheh * the decrement above to happen. 885dcd0538fSMark Fasheh */ 886dcd0538fSMark Fasheh for(i = 0; i < (next_free - 1); i++) 887dcd0538fSMark Fasheh el->l_recs[i] = el->l_recs[i+1]; 888dcd0538fSMark Fasheh 889dcd0538fSMark Fasheh next_free--; 890dcd0538fSMark Fasheh } 891dcd0538fSMark Fasheh 892dcd0538fSMark Fasheh /* 893dcd0538fSMark Fasheh * Figure out what the new record index should be. 894dcd0538fSMark Fasheh */ 895dcd0538fSMark Fasheh for(i = 0; i < next_free; i++) { 896dcd0538fSMark Fasheh rec = &el->l_recs[i]; 897dcd0538fSMark Fasheh 898dcd0538fSMark Fasheh if (insert_cpos < le32_to_cpu(rec->e_cpos)) 899dcd0538fSMark Fasheh break; 900dcd0538fSMark Fasheh } 901dcd0538fSMark Fasheh insert_index = i; 902dcd0538fSMark Fasheh 903dcd0538fSMark Fasheh mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", 904dcd0538fSMark Fasheh insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); 905dcd0538fSMark Fasheh 906dcd0538fSMark Fasheh BUG_ON(insert_index < 0); 907dcd0538fSMark Fasheh BUG_ON(insert_index >= le16_to_cpu(el->l_count)); 908dcd0538fSMark Fasheh BUG_ON(insert_index > next_free); 909dcd0538fSMark Fasheh 910dcd0538fSMark Fasheh /* 911dcd0538fSMark Fasheh * No need to memmove if we're just adding to the tail. 912dcd0538fSMark Fasheh */ 913dcd0538fSMark Fasheh if (insert_index != next_free) { 914dcd0538fSMark Fasheh BUG_ON(next_free >= le16_to_cpu(el->l_count)); 915dcd0538fSMark Fasheh 916dcd0538fSMark Fasheh num_bytes = next_free - insert_index; 917dcd0538fSMark Fasheh num_bytes *= sizeof(struct ocfs2_extent_rec); 918dcd0538fSMark Fasheh memmove(&el->l_recs[insert_index + 1], 919dcd0538fSMark Fasheh &el->l_recs[insert_index], 920dcd0538fSMark Fasheh num_bytes); 921dcd0538fSMark Fasheh } 922dcd0538fSMark Fasheh 923dcd0538fSMark Fasheh /* 924dcd0538fSMark Fasheh * Either we had an empty extent, and need to re-increment or 925dcd0538fSMark Fasheh * there was no empty extent on a non full rightmost leaf node, 926dcd0538fSMark Fasheh * in which case we still need to increment. 927dcd0538fSMark Fasheh */ 928dcd0538fSMark Fasheh next_free++; 929dcd0538fSMark Fasheh el->l_next_free_rec = cpu_to_le16(next_free); 930dcd0538fSMark Fasheh /* 931dcd0538fSMark Fasheh * Make sure none of the math above just messed up our tree. 932dcd0538fSMark Fasheh */ 933dcd0538fSMark Fasheh BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); 934dcd0538fSMark Fasheh 935dcd0538fSMark Fasheh el->l_recs[insert_index] = *insert_rec; 936dcd0538fSMark Fasheh 937dcd0538fSMark Fasheh } 938dcd0538fSMark Fasheh 939dcd0538fSMark Fasheh /* 940dcd0538fSMark Fasheh * Create an empty extent record . 941dcd0538fSMark Fasheh * 942dcd0538fSMark Fasheh * l_next_free_rec may be updated. 943dcd0538fSMark Fasheh * 944dcd0538fSMark Fasheh * If an empty extent already exists do nothing. 945dcd0538fSMark Fasheh */ 946dcd0538fSMark Fasheh static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) 947dcd0538fSMark Fasheh { 948dcd0538fSMark Fasheh int next_free = le16_to_cpu(el->l_next_free_rec); 949dcd0538fSMark Fasheh 950e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 951e48edee2SMark Fasheh 952dcd0538fSMark Fasheh if (next_free == 0) 953dcd0538fSMark Fasheh goto set_and_inc; 954dcd0538fSMark Fasheh 955dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) 956dcd0538fSMark Fasheh return; 957dcd0538fSMark Fasheh 958dcd0538fSMark Fasheh mlog_bug_on_msg(el->l_count == el->l_next_free_rec, 959dcd0538fSMark Fasheh "Asked to create an empty extent in a full list:\n" 960dcd0538fSMark Fasheh "count = %u, tree depth = %u", 961dcd0538fSMark Fasheh le16_to_cpu(el->l_count), 962dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth)); 963dcd0538fSMark Fasheh 964dcd0538fSMark Fasheh ocfs2_shift_records_right(el); 965dcd0538fSMark Fasheh 966dcd0538fSMark Fasheh set_and_inc: 967dcd0538fSMark Fasheh le16_add_cpu(&el->l_next_free_rec, 1); 968dcd0538fSMark Fasheh memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 969dcd0538fSMark Fasheh } 970dcd0538fSMark Fasheh 971dcd0538fSMark Fasheh /* 972dcd0538fSMark Fasheh * For a rotation which involves two leaf nodes, the "root node" is 973dcd0538fSMark Fasheh * the lowest level tree node which contains a path to both leafs. This 974dcd0538fSMark Fasheh * resulting set of information can be used to form a complete "subtree" 975dcd0538fSMark Fasheh * 976dcd0538fSMark Fasheh * This function is passed two full paths from the dinode down to a 977dcd0538fSMark Fasheh * pair of adjacent leaves. It's task is to figure out which path 978dcd0538fSMark Fasheh * index contains the subtree root - this can be the root index itself 979dcd0538fSMark Fasheh * in a worst-case rotation. 980dcd0538fSMark Fasheh * 981dcd0538fSMark Fasheh * The array index of the subtree root is passed back. 982dcd0538fSMark Fasheh */ 983dcd0538fSMark Fasheh static int ocfs2_find_subtree_root(struct inode *inode, 984dcd0538fSMark Fasheh struct ocfs2_path *left, 985dcd0538fSMark Fasheh struct ocfs2_path *right) 986dcd0538fSMark Fasheh { 987dcd0538fSMark Fasheh int i = 0; 988dcd0538fSMark Fasheh 989dcd0538fSMark Fasheh /* 990dcd0538fSMark Fasheh * Check that the caller passed in two paths from the same tree. 991dcd0538fSMark Fasheh */ 992dcd0538fSMark Fasheh BUG_ON(path_root_bh(left) != path_root_bh(right)); 993dcd0538fSMark Fasheh 994dcd0538fSMark Fasheh do { 995dcd0538fSMark Fasheh i++; 996dcd0538fSMark Fasheh 997dcd0538fSMark Fasheh /* 998dcd0538fSMark Fasheh * The caller didn't pass two adjacent paths. 999dcd0538fSMark Fasheh */ 1000dcd0538fSMark Fasheh mlog_bug_on_msg(i > left->p_tree_depth, 1001dcd0538fSMark Fasheh "Inode %lu, left depth %u, right depth %u\n" 1002dcd0538fSMark Fasheh "left leaf blk %llu, right leaf blk %llu\n", 1003dcd0538fSMark Fasheh inode->i_ino, left->p_tree_depth, 1004dcd0538fSMark Fasheh right->p_tree_depth, 1005dcd0538fSMark Fasheh (unsigned long long)path_leaf_bh(left)->b_blocknr, 1006dcd0538fSMark Fasheh (unsigned long long)path_leaf_bh(right)->b_blocknr); 1007dcd0538fSMark Fasheh } while (left->p_node[i].bh->b_blocknr == 1008dcd0538fSMark Fasheh right->p_node[i].bh->b_blocknr); 1009dcd0538fSMark Fasheh 1010dcd0538fSMark Fasheh return i - 1; 1011dcd0538fSMark Fasheh } 1012dcd0538fSMark Fasheh 1013dcd0538fSMark Fasheh typedef void (path_insert_t)(void *, struct buffer_head *); 1014dcd0538fSMark Fasheh 1015dcd0538fSMark Fasheh /* 1016dcd0538fSMark Fasheh * Traverse a btree path in search of cpos, starting at root_el. 1017dcd0538fSMark Fasheh * 1018dcd0538fSMark Fasheh * This code can be called with a cpos larger than the tree, in which 1019dcd0538fSMark Fasheh * case it will return the rightmost path. 1020dcd0538fSMark Fasheh */ 1021dcd0538fSMark Fasheh static int __ocfs2_find_path(struct inode *inode, 1022dcd0538fSMark Fasheh struct ocfs2_extent_list *root_el, u32 cpos, 1023dcd0538fSMark Fasheh path_insert_t *func, void *data) 1024dcd0538fSMark Fasheh { 1025dcd0538fSMark Fasheh int i, ret = 0; 1026dcd0538fSMark Fasheh u32 range; 1027dcd0538fSMark Fasheh u64 blkno; 1028dcd0538fSMark Fasheh struct buffer_head *bh = NULL; 1029dcd0538fSMark Fasheh struct ocfs2_extent_block *eb; 1030dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 1031dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 1032dcd0538fSMark Fasheh struct ocfs2_inode_info *oi = OCFS2_I(inode); 1033dcd0538fSMark Fasheh 1034dcd0538fSMark Fasheh el = root_el; 1035dcd0538fSMark Fasheh while (el->l_tree_depth) { 1036dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) { 1037dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1038dcd0538fSMark Fasheh "Inode %llu has empty extent list at " 1039dcd0538fSMark Fasheh "depth %u\n", 1040dcd0538fSMark Fasheh (unsigned long long)oi->ip_blkno, 1041dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth)); 1042dcd0538fSMark Fasheh ret = -EROFS; 1043dcd0538fSMark Fasheh goto out; 1044dcd0538fSMark Fasheh 1045dcd0538fSMark Fasheh } 1046dcd0538fSMark Fasheh 1047dcd0538fSMark Fasheh for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { 1048dcd0538fSMark Fasheh rec = &el->l_recs[i]; 1049dcd0538fSMark Fasheh 1050dcd0538fSMark Fasheh /* 1051dcd0538fSMark Fasheh * In the case that cpos is off the allocation 1052dcd0538fSMark Fasheh * tree, this should just wind up returning the 1053dcd0538fSMark Fasheh * rightmost record. 1054dcd0538fSMark Fasheh */ 1055dcd0538fSMark Fasheh range = le32_to_cpu(rec->e_cpos) + 1056e48edee2SMark Fasheh ocfs2_rec_clusters(el, rec); 1057dcd0538fSMark Fasheh if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 1058dcd0538fSMark Fasheh break; 1059dcd0538fSMark Fasheh } 1060dcd0538fSMark Fasheh 1061dcd0538fSMark Fasheh blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1062dcd0538fSMark Fasheh if (blkno == 0) { 1063dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1064dcd0538fSMark Fasheh "Inode %llu has bad blkno in extent list " 1065dcd0538fSMark Fasheh "at depth %u (index %d)\n", 1066dcd0538fSMark Fasheh (unsigned long long)oi->ip_blkno, 1067dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth), i); 1068dcd0538fSMark Fasheh ret = -EROFS; 1069dcd0538fSMark Fasheh goto out; 1070dcd0538fSMark Fasheh } 1071dcd0538fSMark Fasheh 1072dcd0538fSMark Fasheh brelse(bh); 1073dcd0538fSMark Fasheh bh = NULL; 1074dcd0538fSMark Fasheh ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, 1075dcd0538fSMark Fasheh &bh, OCFS2_BH_CACHED, inode); 1076dcd0538fSMark Fasheh if (ret) { 1077dcd0538fSMark Fasheh mlog_errno(ret); 1078dcd0538fSMark Fasheh goto out; 1079dcd0538fSMark Fasheh } 1080dcd0538fSMark Fasheh 1081dcd0538fSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 1082dcd0538fSMark Fasheh el = &eb->h_list; 1083dcd0538fSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1084dcd0538fSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1085dcd0538fSMark Fasheh ret = -EIO; 1086dcd0538fSMark Fasheh goto out; 1087dcd0538fSMark Fasheh } 1088dcd0538fSMark Fasheh 1089dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) > 1090dcd0538fSMark Fasheh le16_to_cpu(el->l_count)) { 1091dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1092dcd0538fSMark Fasheh "Inode %llu has bad count in extent list " 1093dcd0538fSMark Fasheh "at block %llu (next free=%u, count=%u)\n", 1094dcd0538fSMark Fasheh (unsigned long long)oi->ip_blkno, 1095dcd0538fSMark Fasheh (unsigned long long)bh->b_blocknr, 1096dcd0538fSMark Fasheh le16_to_cpu(el->l_next_free_rec), 1097dcd0538fSMark Fasheh le16_to_cpu(el->l_count)); 1098dcd0538fSMark Fasheh ret = -EROFS; 1099dcd0538fSMark Fasheh goto out; 1100dcd0538fSMark Fasheh } 1101dcd0538fSMark Fasheh 1102dcd0538fSMark Fasheh if (func) 1103dcd0538fSMark Fasheh func(data, bh); 1104dcd0538fSMark Fasheh } 1105dcd0538fSMark Fasheh 1106dcd0538fSMark Fasheh out: 1107dcd0538fSMark Fasheh /* 1108dcd0538fSMark Fasheh * Catch any trailing bh that the loop didn't handle. 1109dcd0538fSMark Fasheh */ 1110dcd0538fSMark Fasheh brelse(bh); 1111dcd0538fSMark Fasheh 1112dcd0538fSMark Fasheh return ret; 1113dcd0538fSMark Fasheh } 1114dcd0538fSMark Fasheh 1115dcd0538fSMark Fasheh /* 1116dcd0538fSMark Fasheh * Given an initialized path (that is, it has a valid root extent 1117dcd0538fSMark Fasheh * list), this function will traverse the btree in search of the path 1118dcd0538fSMark Fasheh * which would contain cpos. 1119dcd0538fSMark Fasheh * 1120dcd0538fSMark Fasheh * The path traveled is recorded in the path structure. 1121dcd0538fSMark Fasheh * 1122dcd0538fSMark Fasheh * Note that this will not do any comparisons on leaf node extent 1123dcd0538fSMark Fasheh * records, so it will work fine in the case that we just added a tree 1124dcd0538fSMark Fasheh * branch. 1125dcd0538fSMark Fasheh */ 1126dcd0538fSMark Fasheh struct find_path_data { 1127dcd0538fSMark Fasheh int index; 1128dcd0538fSMark Fasheh struct ocfs2_path *path; 1129dcd0538fSMark Fasheh }; 1130dcd0538fSMark Fasheh static void find_path_ins(void *data, struct buffer_head *bh) 1131dcd0538fSMark Fasheh { 1132dcd0538fSMark Fasheh struct find_path_data *fp = data; 1133dcd0538fSMark Fasheh 1134dcd0538fSMark Fasheh get_bh(bh); 1135dcd0538fSMark Fasheh ocfs2_path_insert_eb(fp->path, fp->index, bh); 1136dcd0538fSMark Fasheh fp->index++; 1137dcd0538fSMark Fasheh } 1138dcd0538fSMark Fasheh static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, 1139dcd0538fSMark Fasheh u32 cpos) 1140dcd0538fSMark Fasheh { 1141dcd0538fSMark Fasheh struct find_path_data data; 1142dcd0538fSMark Fasheh 1143dcd0538fSMark Fasheh data.index = 1; 1144dcd0538fSMark Fasheh data.path = path; 1145dcd0538fSMark Fasheh return __ocfs2_find_path(inode, path_root_el(path), cpos, 1146dcd0538fSMark Fasheh find_path_ins, &data); 1147dcd0538fSMark Fasheh } 1148dcd0538fSMark Fasheh 1149dcd0538fSMark Fasheh static void find_leaf_ins(void *data, struct buffer_head *bh) 1150dcd0538fSMark Fasheh { 1151dcd0538fSMark Fasheh struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; 1152dcd0538fSMark Fasheh struct ocfs2_extent_list *el = &eb->h_list; 1153dcd0538fSMark Fasheh struct buffer_head **ret = data; 1154dcd0538fSMark Fasheh 1155dcd0538fSMark Fasheh /* We want to retain only the leaf block. */ 1156dcd0538fSMark Fasheh if (le16_to_cpu(el->l_tree_depth) == 0) { 1157dcd0538fSMark Fasheh get_bh(bh); 1158dcd0538fSMark Fasheh *ret = bh; 1159dcd0538fSMark Fasheh } 1160dcd0538fSMark Fasheh } 1161dcd0538fSMark Fasheh /* 1162dcd0538fSMark Fasheh * Find the leaf block in the tree which would contain cpos. No 1163dcd0538fSMark Fasheh * checking of the actual leaf is done. 1164dcd0538fSMark Fasheh * 1165dcd0538fSMark Fasheh * Some paths want to call this instead of allocating a path structure 1166dcd0538fSMark Fasheh * and calling ocfs2_find_path(). 1167dcd0538fSMark Fasheh * 1168dcd0538fSMark Fasheh * This function doesn't handle non btree extent lists. 1169dcd0538fSMark Fasheh */ 1170363041a5SMark Fasheh int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 1171363041a5SMark Fasheh u32 cpos, struct buffer_head **leaf_bh) 1172dcd0538fSMark Fasheh { 1173dcd0538fSMark Fasheh int ret; 1174dcd0538fSMark Fasheh struct buffer_head *bh = NULL; 1175dcd0538fSMark Fasheh 1176dcd0538fSMark Fasheh ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); 1177dcd0538fSMark Fasheh if (ret) { 1178dcd0538fSMark Fasheh mlog_errno(ret); 1179dcd0538fSMark Fasheh goto out; 1180dcd0538fSMark Fasheh } 1181dcd0538fSMark Fasheh 1182dcd0538fSMark Fasheh *leaf_bh = bh; 1183dcd0538fSMark Fasheh out: 1184dcd0538fSMark Fasheh return ret; 1185dcd0538fSMark Fasheh } 1186dcd0538fSMark Fasheh 1187dcd0538fSMark Fasheh /* 1188dcd0538fSMark Fasheh * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. 1189dcd0538fSMark Fasheh * 1190dcd0538fSMark Fasheh * Basically, we've moved stuff around at the bottom of the tree and 1191dcd0538fSMark Fasheh * we need to fix up the extent records above the changes to reflect 1192dcd0538fSMark Fasheh * the new changes. 1193dcd0538fSMark Fasheh * 1194dcd0538fSMark Fasheh * left_rec: the record on the left. 1195dcd0538fSMark Fasheh * left_child_el: is the child list pointed to by left_rec 1196dcd0538fSMark Fasheh * right_rec: the record to the right of left_rec 1197dcd0538fSMark Fasheh * right_child_el: is the child list pointed to by right_rec 1198dcd0538fSMark Fasheh * 1199dcd0538fSMark Fasheh * By definition, this only works on interior nodes. 1200dcd0538fSMark Fasheh */ 1201dcd0538fSMark Fasheh static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, 1202dcd0538fSMark Fasheh struct ocfs2_extent_list *left_child_el, 1203dcd0538fSMark Fasheh struct ocfs2_extent_rec *right_rec, 1204dcd0538fSMark Fasheh struct ocfs2_extent_list *right_child_el) 1205dcd0538fSMark Fasheh { 1206dcd0538fSMark Fasheh u32 left_clusters, right_end; 1207dcd0538fSMark Fasheh 1208dcd0538fSMark Fasheh /* 1209dcd0538fSMark Fasheh * Interior nodes never have holes. Their cpos is the cpos of 1210dcd0538fSMark Fasheh * the leftmost record in their child list. Their cluster 1211dcd0538fSMark Fasheh * count covers the full theoretical range of their child list 1212dcd0538fSMark Fasheh * - the range between their cpos and the cpos of the record 1213dcd0538fSMark Fasheh * immediately to their right. 1214dcd0538fSMark Fasheh */ 1215dcd0538fSMark Fasheh left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1216dcd0538fSMark Fasheh left_clusters -= le32_to_cpu(left_rec->e_cpos); 1217e48edee2SMark Fasheh left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1218dcd0538fSMark Fasheh 1219dcd0538fSMark Fasheh /* 1220dcd0538fSMark Fasheh * Calculate the rightmost cluster count boundary before 1221e48edee2SMark Fasheh * moving cpos - we will need to adjust clusters after 1222dcd0538fSMark Fasheh * updating e_cpos to keep the same highest cluster count. 1223dcd0538fSMark Fasheh */ 1224dcd0538fSMark Fasheh right_end = le32_to_cpu(right_rec->e_cpos); 1225e48edee2SMark Fasheh right_end += le32_to_cpu(right_rec->e_int_clusters); 1226dcd0538fSMark Fasheh 1227dcd0538fSMark Fasheh right_rec->e_cpos = left_rec->e_cpos; 1228dcd0538fSMark Fasheh le32_add_cpu(&right_rec->e_cpos, left_clusters); 1229dcd0538fSMark Fasheh 1230dcd0538fSMark Fasheh right_end -= le32_to_cpu(right_rec->e_cpos); 1231e48edee2SMark Fasheh right_rec->e_int_clusters = cpu_to_le32(right_end); 1232dcd0538fSMark Fasheh } 1233dcd0538fSMark Fasheh 1234dcd0538fSMark Fasheh /* 1235dcd0538fSMark Fasheh * Adjust the adjacent root node records involved in a 1236dcd0538fSMark Fasheh * rotation. left_el_blkno is passed in as a key so that we can easily 1237dcd0538fSMark Fasheh * find it's index in the root list. 1238dcd0538fSMark Fasheh */ 1239dcd0538fSMark Fasheh static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, 1240dcd0538fSMark Fasheh struct ocfs2_extent_list *left_el, 1241dcd0538fSMark Fasheh struct ocfs2_extent_list *right_el, 1242dcd0538fSMark Fasheh u64 left_el_blkno) 1243dcd0538fSMark Fasheh { 1244dcd0538fSMark Fasheh int i; 1245dcd0538fSMark Fasheh 1246dcd0538fSMark Fasheh BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= 1247dcd0538fSMark Fasheh le16_to_cpu(left_el->l_tree_depth)); 1248dcd0538fSMark Fasheh 1249dcd0538fSMark Fasheh for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { 1250dcd0538fSMark Fasheh if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) 1251dcd0538fSMark Fasheh break; 1252dcd0538fSMark Fasheh } 1253dcd0538fSMark Fasheh 1254dcd0538fSMark Fasheh /* 1255dcd0538fSMark Fasheh * The path walking code should have never returned a root and 1256dcd0538fSMark Fasheh * two paths which are not adjacent. 1257dcd0538fSMark Fasheh */ 1258dcd0538fSMark Fasheh BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); 1259dcd0538fSMark Fasheh 1260dcd0538fSMark Fasheh ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, 1261dcd0538fSMark Fasheh &root_el->l_recs[i + 1], right_el); 1262dcd0538fSMark Fasheh } 1263dcd0538fSMark Fasheh 1264dcd0538fSMark Fasheh /* 1265dcd0538fSMark Fasheh * We've changed a leaf block (in right_path) and need to reflect that 1266dcd0538fSMark Fasheh * change back up the subtree. 1267dcd0538fSMark Fasheh * 1268dcd0538fSMark Fasheh * This happens in multiple places: 1269dcd0538fSMark Fasheh * - When we've moved an extent record from the left path leaf to the right 1270dcd0538fSMark Fasheh * path leaf to make room for an empty extent in the left path leaf. 1271dcd0538fSMark Fasheh * - When our insert into the right path leaf is at the leftmost edge 1272dcd0538fSMark Fasheh * and requires an update of the path immediately to it's left. This 1273dcd0538fSMark Fasheh * can occur at the end of some types of rotation and appending inserts. 1274dcd0538fSMark Fasheh */ 1275dcd0538fSMark Fasheh static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 1276dcd0538fSMark Fasheh struct ocfs2_path *left_path, 1277dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1278dcd0538fSMark Fasheh int subtree_index) 1279dcd0538fSMark Fasheh { 1280dcd0538fSMark Fasheh int ret, i, idx; 1281dcd0538fSMark Fasheh struct ocfs2_extent_list *el, *left_el, *right_el; 1282dcd0538fSMark Fasheh struct ocfs2_extent_rec *left_rec, *right_rec; 1283dcd0538fSMark Fasheh struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 1284dcd0538fSMark Fasheh 1285dcd0538fSMark Fasheh /* 1286dcd0538fSMark Fasheh * Update the counts and position values within all the 1287dcd0538fSMark Fasheh * interior nodes to reflect the leaf rotation we just did. 1288dcd0538fSMark Fasheh * 1289dcd0538fSMark Fasheh * The root node is handled below the loop. 1290dcd0538fSMark Fasheh * 1291dcd0538fSMark Fasheh * We begin the loop with right_el and left_el pointing to the 1292dcd0538fSMark Fasheh * leaf lists and work our way up. 1293dcd0538fSMark Fasheh * 1294dcd0538fSMark Fasheh * NOTE: within this loop, left_el and right_el always refer 1295dcd0538fSMark Fasheh * to the *child* lists. 1296dcd0538fSMark Fasheh */ 1297dcd0538fSMark Fasheh left_el = path_leaf_el(left_path); 1298dcd0538fSMark Fasheh right_el = path_leaf_el(right_path); 1299dcd0538fSMark Fasheh for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { 1300dcd0538fSMark Fasheh mlog(0, "Adjust records at index %u\n", i); 1301dcd0538fSMark Fasheh 1302dcd0538fSMark Fasheh /* 1303dcd0538fSMark Fasheh * One nice property of knowing that all of these 1304dcd0538fSMark Fasheh * nodes are below the root is that we only deal with 1305dcd0538fSMark Fasheh * the leftmost right node record and the rightmost 1306dcd0538fSMark Fasheh * left node record. 1307dcd0538fSMark Fasheh */ 1308dcd0538fSMark Fasheh el = left_path->p_node[i].el; 1309dcd0538fSMark Fasheh idx = le16_to_cpu(left_el->l_next_free_rec) - 1; 1310dcd0538fSMark Fasheh left_rec = &el->l_recs[idx]; 1311dcd0538fSMark Fasheh 1312dcd0538fSMark Fasheh el = right_path->p_node[i].el; 1313dcd0538fSMark Fasheh right_rec = &el->l_recs[0]; 1314dcd0538fSMark Fasheh 1315dcd0538fSMark Fasheh ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 1316dcd0538fSMark Fasheh right_el); 1317dcd0538fSMark Fasheh 1318dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 1319dcd0538fSMark Fasheh if (ret) 1320dcd0538fSMark Fasheh mlog_errno(ret); 1321dcd0538fSMark Fasheh 1322dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); 1323dcd0538fSMark Fasheh if (ret) 1324dcd0538fSMark Fasheh mlog_errno(ret); 1325dcd0538fSMark Fasheh 1326dcd0538fSMark Fasheh /* 1327dcd0538fSMark Fasheh * Setup our list pointers now so that the current 1328dcd0538fSMark Fasheh * parents become children in the next iteration. 1329dcd0538fSMark Fasheh */ 1330dcd0538fSMark Fasheh left_el = left_path->p_node[i].el; 1331dcd0538fSMark Fasheh right_el = right_path->p_node[i].el; 1332dcd0538fSMark Fasheh } 1333dcd0538fSMark Fasheh 1334dcd0538fSMark Fasheh /* 1335dcd0538fSMark Fasheh * At the root node, adjust the two adjacent records which 1336dcd0538fSMark Fasheh * begin our path to the leaves. 1337dcd0538fSMark Fasheh */ 1338dcd0538fSMark Fasheh 1339dcd0538fSMark Fasheh el = left_path->p_node[subtree_index].el; 1340dcd0538fSMark Fasheh left_el = left_path->p_node[subtree_index + 1].el; 1341dcd0538fSMark Fasheh right_el = right_path->p_node[subtree_index + 1].el; 1342dcd0538fSMark Fasheh 1343dcd0538fSMark Fasheh ocfs2_adjust_root_records(el, left_el, right_el, 1344dcd0538fSMark Fasheh left_path->p_node[subtree_index + 1].bh->b_blocknr); 1345dcd0538fSMark Fasheh 1346dcd0538fSMark Fasheh root_bh = left_path->p_node[subtree_index].bh; 1347dcd0538fSMark Fasheh 1348dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, root_bh); 1349dcd0538fSMark Fasheh if (ret) 1350dcd0538fSMark Fasheh mlog_errno(ret); 1351dcd0538fSMark Fasheh } 1352dcd0538fSMark Fasheh 1353dcd0538fSMark Fasheh static int ocfs2_rotate_subtree_right(struct inode *inode, 1354dcd0538fSMark Fasheh handle_t *handle, 1355dcd0538fSMark Fasheh struct ocfs2_path *left_path, 1356dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1357dcd0538fSMark Fasheh int subtree_index) 1358dcd0538fSMark Fasheh { 1359dcd0538fSMark Fasheh int ret, i; 1360dcd0538fSMark Fasheh struct buffer_head *right_leaf_bh; 1361dcd0538fSMark Fasheh struct buffer_head *left_leaf_bh = NULL; 1362dcd0538fSMark Fasheh struct buffer_head *root_bh; 1363dcd0538fSMark Fasheh struct ocfs2_extent_list *right_el, *left_el; 1364dcd0538fSMark Fasheh struct ocfs2_extent_rec move_rec; 1365dcd0538fSMark Fasheh 1366dcd0538fSMark Fasheh left_leaf_bh = path_leaf_bh(left_path); 1367dcd0538fSMark Fasheh left_el = path_leaf_el(left_path); 1368dcd0538fSMark Fasheh 1369dcd0538fSMark Fasheh if (left_el->l_next_free_rec != left_el->l_count) { 1370dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1371dcd0538fSMark Fasheh "Inode %llu has non-full interior leaf node %llu" 1372dcd0538fSMark Fasheh "(next free = %u)", 1373dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, 1374dcd0538fSMark Fasheh (unsigned long long)left_leaf_bh->b_blocknr, 1375dcd0538fSMark Fasheh le16_to_cpu(left_el->l_next_free_rec)); 1376dcd0538fSMark Fasheh return -EROFS; 1377dcd0538fSMark Fasheh } 1378dcd0538fSMark Fasheh 1379dcd0538fSMark Fasheh /* 1380dcd0538fSMark Fasheh * This extent block may already have an empty record, so we 1381dcd0538fSMark Fasheh * return early if so. 1382dcd0538fSMark Fasheh */ 1383dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&left_el->l_recs[0])) 1384dcd0538fSMark Fasheh return 0; 1385dcd0538fSMark Fasheh 1386dcd0538fSMark Fasheh root_bh = left_path->p_node[subtree_index].bh; 1387dcd0538fSMark Fasheh BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 1388dcd0538fSMark Fasheh 1389dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, root_bh, 1390dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 1391dcd0538fSMark Fasheh if (ret) { 1392dcd0538fSMark Fasheh mlog_errno(ret); 1393dcd0538fSMark Fasheh goto out; 1394dcd0538fSMark Fasheh } 1395dcd0538fSMark Fasheh 1396dcd0538fSMark Fasheh for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 1397dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, 1398dcd0538fSMark Fasheh right_path->p_node[i].bh, 1399dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 1400dcd0538fSMark Fasheh if (ret) { 1401dcd0538fSMark Fasheh mlog_errno(ret); 1402dcd0538fSMark Fasheh goto out; 1403dcd0538fSMark Fasheh } 1404dcd0538fSMark Fasheh 1405dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, 1406dcd0538fSMark Fasheh left_path->p_node[i].bh, 1407dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 1408dcd0538fSMark Fasheh if (ret) { 1409dcd0538fSMark Fasheh mlog_errno(ret); 1410dcd0538fSMark Fasheh goto out; 1411dcd0538fSMark Fasheh } 1412dcd0538fSMark Fasheh } 1413dcd0538fSMark Fasheh 1414dcd0538fSMark Fasheh right_leaf_bh = path_leaf_bh(right_path); 1415dcd0538fSMark Fasheh right_el = path_leaf_el(right_path); 1416dcd0538fSMark Fasheh 1417dcd0538fSMark Fasheh /* This is a code error, not a disk corruption. */ 1418dcd0538fSMark Fasheh mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " 1419dcd0538fSMark Fasheh "because rightmost leaf block %llu is empty\n", 1420dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, 1421dcd0538fSMark Fasheh (unsigned long long)right_leaf_bh->b_blocknr); 1422dcd0538fSMark Fasheh 1423dcd0538fSMark Fasheh ocfs2_create_empty_extent(right_el); 1424dcd0538fSMark Fasheh 1425dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, right_leaf_bh); 1426dcd0538fSMark Fasheh if (ret) { 1427dcd0538fSMark Fasheh mlog_errno(ret); 1428dcd0538fSMark Fasheh goto out; 1429dcd0538fSMark Fasheh } 1430dcd0538fSMark Fasheh 1431dcd0538fSMark Fasheh /* Do the copy now. */ 1432dcd0538fSMark Fasheh i = le16_to_cpu(left_el->l_next_free_rec) - 1; 1433dcd0538fSMark Fasheh move_rec = left_el->l_recs[i]; 1434dcd0538fSMark Fasheh right_el->l_recs[0] = move_rec; 1435dcd0538fSMark Fasheh 1436dcd0538fSMark Fasheh /* 1437dcd0538fSMark Fasheh * Clear out the record we just copied and shift everything 1438dcd0538fSMark Fasheh * over, leaving an empty extent in the left leaf. 1439dcd0538fSMark Fasheh * 1440dcd0538fSMark Fasheh * We temporarily subtract from next_free_rec so that the 1441dcd0538fSMark Fasheh * shift will lose the tail record (which is now defunct). 1442dcd0538fSMark Fasheh */ 1443dcd0538fSMark Fasheh le16_add_cpu(&left_el->l_next_free_rec, -1); 1444dcd0538fSMark Fasheh ocfs2_shift_records_right(left_el); 1445dcd0538fSMark Fasheh memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 1446dcd0538fSMark Fasheh le16_add_cpu(&left_el->l_next_free_rec, 1); 1447dcd0538fSMark Fasheh 1448dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, left_leaf_bh); 1449dcd0538fSMark Fasheh if (ret) { 1450dcd0538fSMark Fasheh mlog_errno(ret); 1451dcd0538fSMark Fasheh goto out; 1452dcd0538fSMark Fasheh } 1453dcd0538fSMark Fasheh 1454dcd0538fSMark Fasheh ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 1455dcd0538fSMark Fasheh subtree_index); 1456dcd0538fSMark Fasheh 1457dcd0538fSMark Fasheh out: 1458dcd0538fSMark Fasheh return ret; 1459dcd0538fSMark Fasheh } 1460dcd0538fSMark Fasheh 1461dcd0538fSMark Fasheh /* 1462dcd0538fSMark Fasheh * Given a full path, determine what cpos value would return us a path 1463dcd0538fSMark Fasheh * containing the leaf immediately to the left of the current one. 1464dcd0538fSMark Fasheh * 1465dcd0538fSMark Fasheh * Will return zero if the path passed in is already the leftmost path. 1466dcd0538fSMark Fasheh */ 1467dcd0538fSMark Fasheh static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 1468dcd0538fSMark Fasheh struct ocfs2_path *path, u32 *cpos) 1469dcd0538fSMark Fasheh { 1470dcd0538fSMark Fasheh int i, j, ret = 0; 1471dcd0538fSMark Fasheh u64 blkno; 1472dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 1473dcd0538fSMark Fasheh 1474e48edee2SMark Fasheh BUG_ON(path->p_tree_depth == 0); 1475e48edee2SMark Fasheh 1476dcd0538fSMark Fasheh *cpos = 0; 1477dcd0538fSMark Fasheh 1478dcd0538fSMark Fasheh blkno = path_leaf_bh(path)->b_blocknr; 1479dcd0538fSMark Fasheh 1480dcd0538fSMark Fasheh /* Start at the tree node just above the leaf and work our way up. */ 1481dcd0538fSMark Fasheh i = path->p_tree_depth - 1; 1482dcd0538fSMark Fasheh while (i >= 0) { 1483dcd0538fSMark Fasheh el = path->p_node[i].el; 1484dcd0538fSMark Fasheh 1485dcd0538fSMark Fasheh /* 1486dcd0538fSMark Fasheh * Find the extent record just before the one in our 1487dcd0538fSMark Fasheh * path. 1488dcd0538fSMark Fasheh */ 1489dcd0538fSMark Fasheh for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { 1490dcd0538fSMark Fasheh if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { 1491dcd0538fSMark Fasheh if (j == 0) { 1492dcd0538fSMark Fasheh if (i == 0) { 1493dcd0538fSMark Fasheh /* 1494dcd0538fSMark Fasheh * We've determined that the 1495dcd0538fSMark Fasheh * path specified is already 1496dcd0538fSMark Fasheh * the leftmost one - return a 1497dcd0538fSMark Fasheh * cpos of zero. 1498dcd0538fSMark Fasheh */ 1499dcd0538fSMark Fasheh goto out; 1500dcd0538fSMark Fasheh } 1501dcd0538fSMark Fasheh /* 1502dcd0538fSMark Fasheh * The leftmost record points to our 1503dcd0538fSMark Fasheh * leaf - we need to travel up the 1504dcd0538fSMark Fasheh * tree one level. 1505dcd0538fSMark Fasheh */ 1506dcd0538fSMark Fasheh goto next_node; 1507dcd0538fSMark Fasheh } 1508dcd0538fSMark Fasheh 1509dcd0538fSMark Fasheh *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); 1510e48edee2SMark Fasheh *cpos = *cpos + ocfs2_rec_clusters(el, 1511e48edee2SMark Fasheh &el->l_recs[j - 1]); 1512e48edee2SMark Fasheh *cpos = *cpos - 1; 1513dcd0538fSMark Fasheh goto out; 1514dcd0538fSMark Fasheh } 1515dcd0538fSMark Fasheh } 1516dcd0538fSMark Fasheh 1517dcd0538fSMark Fasheh /* 1518dcd0538fSMark Fasheh * If we got here, we never found a valid node where 1519dcd0538fSMark Fasheh * the tree indicated one should be. 1520dcd0538fSMark Fasheh */ 1521dcd0538fSMark Fasheh ocfs2_error(sb, 1522dcd0538fSMark Fasheh "Invalid extent tree at extent block %llu\n", 1523dcd0538fSMark Fasheh (unsigned long long)blkno); 1524dcd0538fSMark Fasheh ret = -EROFS; 1525dcd0538fSMark Fasheh goto out; 1526dcd0538fSMark Fasheh 1527dcd0538fSMark Fasheh next_node: 1528dcd0538fSMark Fasheh blkno = path->p_node[i].bh->b_blocknr; 1529dcd0538fSMark Fasheh i--; 1530dcd0538fSMark Fasheh } 1531dcd0538fSMark Fasheh 1532dcd0538fSMark Fasheh out: 1533dcd0538fSMark Fasheh return ret; 1534dcd0538fSMark Fasheh } 1535dcd0538fSMark Fasheh 1536dcd0538fSMark Fasheh static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1537dcd0538fSMark Fasheh struct ocfs2_path *path) 1538dcd0538fSMark Fasheh { 1539dcd0538fSMark Fasheh int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; 1540dcd0538fSMark Fasheh 1541dcd0538fSMark Fasheh if (handle->h_buffer_credits < credits) 1542dcd0538fSMark Fasheh return ocfs2_extend_trans(handle, credits); 1543dcd0538fSMark Fasheh 1544dcd0538fSMark Fasheh return 0; 1545dcd0538fSMark Fasheh } 1546dcd0538fSMark Fasheh 1547dcd0538fSMark Fasheh /* 1548dcd0538fSMark Fasheh * Trap the case where we're inserting into the theoretical range past 1549dcd0538fSMark Fasheh * the _actual_ left leaf range. Otherwise, we'll rotate a record 1550dcd0538fSMark Fasheh * whose cpos is less than ours into the right leaf. 1551dcd0538fSMark Fasheh * 1552dcd0538fSMark Fasheh * It's only necessary to look at the rightmost record of the left 1553dcd0538fSMark Fasheh * leaf because the logic that calls us should ensure that the 1554dcd0538fSMark Fasheh * theoretical ranges in the path components above the leaves are 1555dcd0538fSMark Fasheh * correct. 1556dcd0538fSMark Fasheh */ 1557dcd0538fSMark Fasheh static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, 1558dcd0538fSMark Fasheh u32 insert_cpos) 1559dcd0538fSMark Fasheh { 1560dcd0538fSMark Fasheh struct ocfs2_extent_list *left_el; 1561dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 1562dcd0538fSMark Fasheh int next_free; 1563dcd0538fSMark Fasheh 1564dcd0538fSMark Fasheh left_el = path_leaf_el(left_path); 1565dcd0538fSMark Fasheh next_free = le16_to_cpu(left_el->l_next_free_rec); 1566dcd0538fSMark Fasheh rec = &left_el->l_recs[next_free - 1]; 1567dcd0538fSMark Fasheh 1568dcd0538fSMark Fasheh if (insert_cpos > le32_to_cpu(rec->e_cpos)) 1569dcd0538fSMark Fasheh return 1; 1570dcd0538fSMark Fasheh return 0; 1571dcd0538fSMark Fasheh } 1572dcd0538fSMark Fasheh 1573dcd0538fSMark Fasheh /* 1574dcd0538fSMark Fasheh * Rotate all the records in a btree right one record, starting at insert_cpos. 1575dcd0538fSMark Fasheh * 1576dcd0538fSMark Fasheh * The path to the rightmost leaf should be passed in. 1577dcd0538fSMark Fasheh * 1578dcd0538fSMark Fasheh * The array is assumed to be large enough to hold an entire path (tree depth). 1579dcd0538fSMark Fasheh * 1580dcd0538fSMark Fasheh * Upon succesful return from this function: 1581dcd0538fSMark Fasheh * 1582dcd0538fSMark Fasheh * - The 'right_path' array will contain a path to the leaf block 1583dcd0538fSMark Fasheh * whose range contains e_cpos. 1584dcd0538fSMark Fasheh * - That leaf block will have a single empty extent in list index 0. 1585dcd0538fSMark Fasheh * - In the case that the rotation requires a post-insert update, 1586dcd0538fSMark Fasheh * *ret_left_path will contain a valid path which can be passed to 1587dcd0538fSMark Fasheh * ocfs2_insert_path(). 1588dcd0538fSMark Fasheh */ 1589dcd0538fSMark Fasheh static int ocfs2_rotate_tree_right(struct inode *inode, 1590dcd0538fSMark Fasheh handle_t *handle, 1591dcd0538fSMark Fasheh u32 insert_cpos, 1592dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1593dcd0538fSMark Fasheh struct ocfs2_path **ret_left_path) 1594dcd0538fSMark Fasheh { 1595dcd0538fSMark Fasheh int ret, start; 1596dcd0538fSMark Fasheh u32 cpos; 1597dcd0538fSMark Fasheh struct ocfs2_path *left_path = NULL; 1598dcd0538fSMark Fasheh 1599dcd0538fSMark Fasheh *ret_left_path = NULL; 1600dcd0538fSMark Fasheh 1601dcd0538fSMark Fasheh left_path = ocfs2_new_path(path_root_bh(right_path), 1602dcd0538fSMark Fasheh path_root_el(right_path)); 1603dcd0538fSMark Fasheh if (!left_path) { 1604dcd0538fSMark Fasheh ret = -ENOMEM; 1605dcd0538fSMark Fasheh mlog_errno(ret); 1606dcd0538fSMark Fasheh goto out; 1607dcd0538fSMark Fasheh } 1608dcd0538fSMark Fasheh 1609dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); 1610dcd0538fSMark Fasheh if (ret) { 1611dcd0538fSMark Fasheh mlog_errno(ret); 1612dcd0538fSMark Fasheh goto out; 1613dcd0538fSMark Fasheh } 1614dcd0538fSMark Fasheh 1615dcd0538fSMark Fasheh mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); 1616dcd0538fSMark Fasheh 1617dcd0538fSMark Fasheh /* 1618dcd0538fSMark Fasheh * What we want to do here is: 1619dcd0538fSMark Fasheh * 1620dcd0538fSMark Fasheh * 1) Start with the rightmost path. 1621dcd0538fSMark Fasheh * 1622dcd0538fSMark Fasheh * 2) Determine a path to the leaf block directly to the left 1623dcd0538fSMark Fasheh * of that leaf. 1624dcd0538fSMark Fasheh * 1625dcd0538fSMark Fasheh * 3) Determine the 'subtree root' - the lowest level tree node 1626dcd0538fSMark Fasheh * which contains a path to both leaves. 1627dcd0538fSMark Fasheh * 1628dcd0538fSMark Fasheh * 4) Rotate the subtree. 1629dcd0538fSMark Fasheh * 1630dcd0538fSMark Fasheh * 5) Find the next subtree by considering the left path to be 1631dcd0538fSMark Fasheh * the new right path. 1632dcd0538fSMark Fasheh * 1633dcd0538fSMark Fasheh * The check at the top of this while loop also accepts 1634dcd0538fSMark Fasheh * insert_cpos == cpos because cpos is only a _theoretical_ 1635dcd0538fSMark Fasheh * value to get us the left path - insert_cpos might very well 1636dcd0538fSMark Fasheh * be filling that hole. 1637dcd0538fSMark Fasheh * 1638dcd0538fSMark Fasheh * Stop at a cpos of '0' because we either started at the 1639dcd0538fSMark Fasheh * leftmost branch (i.e., a tree with one branch and a 1640dcd0538fSMark Fasheh * rotation inside of it), or we've gone as far as we can in 1641dcd0538fSMark Fasheh * rotating subtrees. 1642dcd0538fSMark Fasheh */ 1643dcd0538fSMark Fasheh while (cpos && insert_cpos <= cpos) { 1644dcd0538fSMark Fasheh mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", 1645dcd0538fSMark Fasheh insert_cpos, cpos); 1646dcd0538fSMark Fasheh 1647dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, left_path, cpos); 1648dcd0538fSMark Fasheh if (ret) { 1649dcd0538fSMark Fasheh mlog_errno(ret); 1650dcd0538fSMark Fasheh goto out; 1651dcd0538fSMark Fasheh } 1652dcd0538fSMark Fasheh 1653dcd0538fSMark Fasheh mlog_bug_on_msg(path_leaf_bh(left_path) == 1654dcd0538fSMark Fasheh path_leaf_bh(right_path), 1655dcd0538fSMark Fasheh "Inode %lu: error during insert of %u " 1656dcd0538fSMark Fasheh "(left path cpos %u) results in two identical " 1657dcd0538fSMark Fasheh "paths ending at %llu\n", 1658dcd0538fSMark Fasheh inode->i_ino, insert_cpos, cpos, 1659dcd0538fSMark Fasheh (unsigned long long) 1660dcd0538fSMark Fasheh path_leaf_bh(left_path)->b_blocknr); 1661dcd0538fSMark Fasheh 1662dcd0538fSMark Fasheh if (ocfs2_rotate_requires_path_adjustment(left_path, 1663dcd0538fSMark Fasheh insert_cpos)) { 1664dcd0538fSMark Fasheh mlog(0, "Path adjustment required\n"); 1665dcd0538fSMark Fasheh 1666dcd0538fSMark Fasheh /* 1667dcd0538fSMark Fasheh * We've rotated the tree as much as we 1668dcd0538fSMark Fasheh * should. The rest is up to 1669dcd0538fSMark Fasheh * ocfs2_insert_path() to complete, after the 1670dcd0538fSMark Fasheh * record insertion. We indicate this 1671dcd0538fSMark Fasheh * situation by returning the left path. 1672dcd0538fSMark Fasheh * 1673dcd0538fSMark Fasheh * The reason we don't adjust the records here 1674dcd0538fSMark Fasheh * before the record insert is that an error 1675dcd0538fSMark Fasheh * later might break the rule where a parent 1676dcd0538fSMark Fasheh * record e_cpos will reflect the actual 1677dcd0538fSMark Fasheh * e_cpos of the 1st nonempty record of the 1678dcd0538fSMark Fasheh * child list. 1679dcd0538fSMark Fasheh */ 1680dcd0538fSMark Fasheh *ret_left_path = left_path; 1681dcd0538fSMark Fasheh goto out_ret_path; 1682dcd0538fSMark Fasheh } 1683dcd0538fSMark Fasheh 1684dcd0538fSMark Fasheh start = ocfs2_find_subtree_root(inode, left_path, right_path); 1685dcd0538fSMark Fasheh 1686dcd0538fSMark Fasheh mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 1687dcd0538fSMark Fasheh start, 1688dcd0538fSMark Fasheh (unsigned long long) right_path->p_node[start].bh->b_blocknr, 1689dcd0538fSMark Fasheh right_path->p_tree_depth); 1690dcd0538fSMark Fasheh 1691dcd0538fSMark Fasheh ret = ocfs2_extend_rotate_transaction(handle, start, 1692dcd0538fSMark Fasheh right_path); 1693dcd0538fSMark Fasheh if (ret) { 1694dcd0538fSMark Fasheh mlog_errno(ret); 1695dcd0538fSMark Fasheh goto out; 1696dcd0538fSMark Fasheh } 1697dcd0538fSMark Fasheh 1698dcd0538fSMark Fasheh ret = ocfs2_rotate_subtree_right(inode, handle, left_path, 1699dcd0538fSMark Fasheh right_path, start); 1700dcd0538fSMark Fasheh if (ret) { 1701dcd0538fSMark Fasheh mlog_errno(ret); 1702dcd0538fSMark Fasheh goto out; 1703dcd0538fSMark Fasheh } 1704dcd0538fSMark Fasheh 1705dcd0538fSMark Fasheh /* 1706dcd0538fSMark Fasheh * There is no need to re-read the next right path 1707dcd0538fSMark Fasheh * as we know that it'll be our current left 1708dcd0538fSMark Fasheh * path. Optimize by copying values instead. 1709dcd0538fSMark Fasheh */ 1710dcd0538fSMark Fasheh ocfs2_mv_path(right_path, left_path); 1711dcd0538fSMark Fasheh 1712dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 1713dcd0538fSMark Fasheh &cpos); 1714dcd0538fSMark Fasheh if (ret) { 1715dcd0538fSMark Fasheh mlog_errno(ret); 1716dcd0538fSMark Fasheh goto out; 1717dcd0538fSMark Fasheh } 1718dcd0538fSMark Fasheh } 1719dcd0538fSMark Fasheh 1720dcd0538fSMark Fasheh out: 1721dcd0538fSMark Fasheh ocfs2_free_path(left_path); 1722dcd0538fSMark Fasheh 1723dcd0538fSMark Fasheh out_ret_path: 1724dcd0538fSMark Fasheh return ret; 1725dcd0538fSMark Fasheh } 1726dcd0538fSMark Fasheh 1727dcd0538fSMark Fasheh /* 1728dcd0538fSMark Fasheh * Do the final bits of extent record insertion at the target leaf 1729dcd0538fSMark Fasheh * list. If this leaf is part of an allocation tree, it is assumed 1730dcd0538fSMark Fasheh * that the tree above has been prepared. 1731dcd0538fSMark Fasheh */ 1732dcd0538fSMark Fasheh static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, 1733dcd0538fSMark Fasheh struct ocfs2_extent_list *el, 1734dcd0538fSMark Fasheh struct ocfs2_insert_type *insert, 1735dcd0538fSMark Fasheh struct inode *inode) 1736dcd0538fSMark Fasheh { 1737dcd0538fSMark Fasheh int i = insert->ins_contig_index; 1738dcd0538fSMark Fasheh unsigned int range; 1739dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 1740dcd0538fSMark Fasheh 1741e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 1742dcd0538fSMark Fasheh 1743dcd0538fSMark Fasheh /* 1744dcd0538fSMark Fasheh * Contiguous insert - either left or right. 1745dcd0538fSMark Fasheh */ 1746dcd0538fSMark Fasheh if (insert->ins_contig != CONTIG_NONE) { 1747dcd0538fSMark Fasheh rec = &el->l_recs[i]; 1748dcd0538fSMark Fasheh if (insert->ins_contig == CONTIG_LEFT) { 1749dcd0538fSMark Fasheh rec->e_blkno = insert_rec->e_blkno; 1750dcd0538fSMark Fasheh rec->e_cpos = insert_rec->e_cpos; 1751dcd0538fSMark Fasheh } 1752e48edee2SMark Fasheh le16_add_cpu(&rec->e_leaf_clusters, 1753e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 1754dcd0538fSMark Fasheh return; 1755dcd0538fSMark Fasheh } 1756dcd0538fSMark Fasheh 1757dcd0538fSMark Fasheh /* 1758dcd0538fSMark Fasheh * Handle insert into an empty leaf. 1759dcd0538fSMark Fasheh */ 1760dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0 || 1761dcd0538fSMark Fasheh ((le16_to_cpu(el->l_next_free_rec) == 1) && 1762dcd0538fSMark Fasheh ocfs2_is_empty_extent(&el->l_recs[0]))) { 1763dcd0538fSMark Fasheh el->l_recs[0] = *insert_rec; 1764dcd0538fSMark Fasheh el->l_next_free_rec = cpu_to_le16(1); 1765dcd0538fSMark Fasheh return; 1766dcd0538fSMark Fasheh } 1767dcd0538fSMark Fasheh 1768dcd0538fSMark Fasheh /* 1769dcd0538fSMark Fasheh * Appending insert. 1770dcd0538fSMark Fasheh */ 1771dcd0538fSMark Fasheh if (insert->ins_appending == APPEND_TAIL) { 1772dcd0538fSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 1773dcd0538fSMark Fasheh rec = &el->l_recs[i]; 1774e48edee2SMark Fasheh range = le32_to_cpu(rec->e_cpos) 1775e48edee2SMark Fasheh + le16_to_cpu(rec->e_leaf_clusters); 1776dcd0538fSMark Fasheh BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); 1777dcd0538fSMark Fasheh 1778dcd0538fSMark Fasheh mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= 1779dcd0538fSMark Fasheh le16_to_cpu(el->l_count), 1780dcd0538fSMark Fasheh "inode %lu, depth %u, count %u, next free %u, " 1781dcd0538fSMark Fasheh "rec.cpos %u, rec.clusters %u, " 1782dcd0538fSMark Fasheh "insert.cpos %u, insert.clusters %u\n", 1783dcd0538fSMark Fasheh inode->i_ino, 1784dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth), 1785dcd0538fSMark Fasheh le16_to_cpu(el->l_count), 1786dcd0538fSMark Fasheh le16_to_cpu(el->l_next_free_rec), 1787dcd0538fSMark Fasheh le32_to_cpu(el->l_recs[i].e_cpos), 1788e48edee2SMark Fasheh le16_to_cpu(el->l_recs[i].e_leaf_clusters), 1789dcd0538fSMark Fasheh le32_to_cpu(insert_rec->e_cpos), 1790e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 1791dcd0538fSMark Fasheh i++; 1792dcd0538fSMark Fasheh el->l_recs[i] = *insert_rec; 1793dcd0538fSMark Fasheh le16_add_cpu(&el->l_next_free_rec, 1); 1794dcd0538fSMark Fasheh return; 1795dcd0538fSMark Fasheh } 1796dcd0538fSMark Fasheh 1797dcd0538fSMark Fasheh /* 1798dcd0538fSMark Fasheh * Ok, we have to rotate. 1799dcd0538fSMark Fasheh * 1800dcd0538fSMark Fasheh * At this point, it is safe to assume that inserting into an 1801dcd0538fSMark Fasheh * empty leaf and appending to a leaf have both been handled 1802dcd0538fSMark Fasheh * above. 1803dcd0538fSMark Fasheh * 1804dcd0538fSMark Fasheh * This leaf needs to have space, either by the empty 1st 1805dcd0538fSMark Fasheh * extent record, or by virtue of an l_next_rec < l_count. 1806dcd0538fSMark Fasheh */ 1807dcd0538fSMark Fasheh ocfs2_rotate_leaf(el, insert_rec); 1808dcd0538fSMark Fasheh } 1809dcd0538fSMark Fasheh 1810dcd0538fSMark Fasheh static inline void ocfs2_update_dinode_clusters(struct inode *inode, 1811dcd0538fSMark Fasheh struct ocfs2_dinode *di, 1812dcd0538fSMark Fasheh u32 clusters) 1813dcd0538fSMark Fasheh { 1814dcd0538fSMark Fasheh le32_add_cpu(&di->i_clusters, clusters); 1815dcd0538fSMark Fasheh spin_lock(&OCFS2_I(inode)->ip_lock); 1816dcd0538fSMark Fasheh OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); 1817dcd0538fSMark Fasheh spin_unlock(&OCFS2_I(inode)->ip_lock); 1818dcd0538fSMark Fasheh } 1819dcd0538fSMark Fasheh 1820dcd0538fSMark Fasheh static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 1821dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 1822dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1823dcd0538fSMark Fasheh struct ocfs2_path **ret_left_path) 1824dcd0538fSMark Fasheh { 1825dcd0538fSMark Fasheh int ret, i, next_free; 1826dcd0538fSMark Fasheh struct buffer_head *bh; 1827dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 1828dcd0538fSMark Fasheh struct ocfs2_path *left_path = NULL; 1829dcd0538fSMark Fasheh 1830dcd0538fSMark Fasheh *ret_left_path = NULL; 1831dcd0538fSMark Fasheh 1832dcd0538fSMark Fasheh /* 1833e48edee2SMark Fasheh * This shouldn't happen for non-trees. The extent rec cluster 1834e48edee2SMark Fasheh * count manipulation below only works for interior nodes. 1835e48edee2SMark Fasheh */ 1836e48edee2SMark Fasheh BUG_ON(right_path->p_tree_depth == 0); 1837e48edee2SMark Fasheh 1838e48edee2SMark Fasheh /* 1839dcd0538fSMark Fasheh * If our appending insert is at the leftmost edge of a leaf, 1840dcd0538fSMark Fasheh * then we might need to update the rightmost records of the 1841dcd0538fSMark Fasheh * neighboring path. 1842dcd0538fSMark Fasheh */ 1843dcd0538fSMark Fasheh el = path_leaf_el(right_path); 1844dcd0538fSMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 1845dcd0538fSMark Fasheh if (next_free == 0 || 1846dcd0538fSMark Fasheh (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { 1847dcd0538fSMark Fasheh u32 left_cpos; 1848dcd0538fSMark Fasheh 1849dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 1850dcd0538fSMark Fasheh &left_cpos); 1851dcd0538fSMark Fasheh if (ret) { 1852dcd0538fSMark Fasheh mlog_errno(ret); 1853dcd0538fSMark Fasheh goto out; 1854dcd0538fSMark Fasheh } 1855dcd0538fSMark Fasheh 1856dcd0538fSMark Fasheh mlog(0, "Append may need a left path update. cpos: %u, " 1857dcd0538fSMark Fasheh "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), 1858dcd0538fSMark Fasheh left_cpos); 1859dcd0538fSMark Fasheh 1860dcd0538fSMark Fasheh /* 1861dcd0538fSMark Fasheh * No need to worry if the append is already in the 1862dcd0538fSMark Fasheh * leftmost leaf. 1863dcd0538fSMark Fasheh */ 1864dcd0538fSMark Fasheh if (left_cpos) { 1865dcd0538fSMark Fasheh left_path = ocfs2_new_path(path_root_bh(right_path), 1866dcd0538fSMark Fasheh path_root_el(right_path)); 1867dcd0538fSMark Fasheh if (!left_path) { 1868dcd0538fSMark Fasheh ret = -ENOMEM; 1869dcd0538fSMark Fasheh mlog_errno(ret); 1870dcd0538fSMark Fasheh goto out; 1871dcd0538fSMark Fasheh } 1872dcd0538fSMark Fasheh 1873dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, left_path, left_cpos); 1874dcd0538fSMark Fasheh if (ret) { 1875dcd0538fSMark Fasheh mlog_errno(ret); 1876dcd0538fSMark Fasheh goto out; 1877dcd0538fSMark Fasheh } 1878dcd0538fSMark Fasheh 1879dcd0538fSMark Fasheh /* 1880dcd0538fSMark Fasheh * ocfs2_insert_path() will pass the left_path to the 1881dcd0538fSMark Fasheh * journal for us. 1882dcd0538fSMark Fasheh */ 1883dcd0538fSMark Fasheh } 1884dcd0538fSMark Fasheh } 1885dcd0538fSMark Fasheh 1886dcd0538fSMark Fasheh ret = ocfs2_journal_access_path(inode, handle, right_path); 1887dcd0538fSMark Fasheh if (ret) { 1888dcd0538fSMark Fasheh mlog_errno(ret); 1889dcd0538fSMark Fasheh goto out; 1890dcd0538fSMark Fasheh } 1891dcd0538fSMark Fasheh 1892dcd0538fSMark Fasheh el = path_root_el(right_path); 1893dcd0538fSMark Fasheh bh = path_root_bh(right_path); 1894dcd0538fSMark Fasheh i = 0; 1895dcd0538fSMark Fasheh while (1) { 1896e48edee2SMark Fasheh struct ocfs2_extent_rec *rec; 1897e48edee2SMark Fasheh 1898dcd0538fSMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 1899dcd0538fSMark Fasheh if (next_free == 0) { 1900dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1901dcd0538fSMark Fasheh "Dinode %llu has a bad extent list", 1902dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno); 1903dcd0538fSMark Fasheh ret = -EIO; 1904dcd0538fSMark Fasheh goto out; 1905dcd0538fSMark Fasheh } 1906dcd0538fSMark Fasheh 1907e48edee2SMark Fasheh rec = &el->l_recs[next_free - 1]; 1908e48edee2SMark Fasheh 1909e48edee2SMark Fasheh rec->e_int_clusters = insert_rec->e_cpos; 1910e48edee2SMark Fasheh le32_add_cpu(&rec->e_int_clusters, 1911e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 1912e48edee2SMark Fasheh le32_add_cpu(&rec->e_int_clusters, 1913e48edee2SMark Fasheh -le32_to_cpu(rec->e_cpos)); 1914dcd0538fSMark Fasheh 1915dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, bh); 1916dcd0538fSMark Fasheh if (ret) 1917dcd0538fSMark Fasheh mlog_errno(ret); 1918dcd0538fSMark Fasheh 1919e48edee2SMark Fasheh /* Don't touch the leaf node */ 1920dcd0538fSMark Fasheh if (++i >= right_path->p_tree_depth) 1921dcd0538fSMark Fasheh break; 1922dcd0538fSMark Fasheh 1923dcd0538fSMark Fasheh bh = right_path->p_node[i].bh; 1924dcd0538fSMark Fasheh el = right_path->p_node[i].el; 1925dcd0538fSMark Fasheh } 1926dcd0538fSMark Fasheh 1927dcd0538fSMark Fasheh *ret_left_path = left_path; 1928dcd0538fSMark Fasheh ret = 0; 1929dcd0538fSMark Fasheh out: 1930dcd0538fSMark Fasheh if (ret != 0) 1931dcd0538fSMark Fasheh ocfs2_free_path(left_path); 1932dcd0538fSMark Fasheh 1933dcd0538fSMark Fasheh return ret; 1934dcd0538fSMark Fasheh } 1935dcd0538fSMark Fasheh 1936dcd0538fSMark Fasheh /* 1937dcd0538fSMark Fasheh * This function only does inserts on an allocation b-tree. For dinode 1938dcd0538fSMark Fasheh * lists, ocfs2_insert_at_leaf() is called directly. 1939dcd0538fSMark Fasheh * 1940dcd0538fSMark Fasheh * right_path is the path we want to do the actual insert 1941dcd0538fSMark Fasheh * in. left_path should only be passed in if we need to update that 1942dcd0538fSMark Fasheh * portion of the tree after an edge insert. 1943dcd0538fSMark Fasheh */ 1944dcd0538fSMark Fasheh static int ocfs2_insert_path(struct inode *inode, 1945dcd0538fSMark Fasheh handle_t *handle, 1946dcd0538fSMark Fasheh struct ocfs2_path *left_path, 1947dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1948dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 1949dcd0538fSMark Fasheh struct ocfs2_insert_type *insert) 1950dcd0538fSMark Fasheh { 1951dcd0538fSMark Fasheh int ret, subtree_index; 1952dcd0538fSMark Fasheh struct buffer_head *leaf_bh = path_leaf_bh(right_path); 1953dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 1954dcd0538fSMark Fasheh 1955dcd0538fSMark Fasheh /* 1956dcd0538fSMark Fasheh * Pass both paths to the journal. The majority of inserts 1957dcd0538fSMark Fasheh * will be touching all components anyway. 1958dcd0538fSMark Fasheh */ 1959dcd0538fSMark Fasheh ret = ocfs2_journal_access_path(inode, handle, right_path); 1960dcd0538fSMark Fasheh if (ret < 0) { 1961dcd0538fSMark Fasheh mlog_errno(ret); 1962dcd0538fSMark Fasheh goto out; 1963dcd0538fSMark Fasheh } 1964dcd0538fSMark Fasheh 1965dcd0538fSMark Fasheh if (left_path) { 1966dcd0538fSMark Fasheh int credits = handle->h_buffer_credits; 1967dcd0538fSMark Fasheh 1968dcd0538fSMark Fasheh /* 1969dcd0538fSMark Fasheh * There's a chance that left_path got passed back to 1970dcd0538fSMark Fasheh * us without being accounted for in the 1971dcd0538fSMark Fasheh * journal. Extend our transaction here to be sure we 1972dcd0538fSMark Fasheh * can change those blocks. 1973dcd0538fSMark Fasheh */ 1974dcd0538fSMark Fasheh credits += left_path->p_tree_depth; 1975dcd0538fSMark Fasheh 1976dcd0538fSMark Fasheh ret = ocfs2_extend_trans(handle, credits); 1977dcd0538fSMark Fasheh if (ret < 0) { 1978dcd0538fSMark Fasheh mlog_errno(ret); 1979dcd0538fSMark Fasheh goto out; 1980dcd0538fSMark Fasheh } 1981dcd0538fSMark Fasheh 1982dcd0538fSMark Fasheh ret = ocfs2_journal_access_path(inode, handle, left_path); 1983dcd0538fSMark Fasheh if (ret < 0) { 1984dcd0538fSMark Fasheh mlog_errno(ret); 1985dcd0538fSMark Fasheh goto out; 1986dcd0538fSMark Fasheh } 1987dcd0538fSMark Fasheh } 1988dcd0538fSMark Fasheh 1989dcd0538fSMark Fasheh el = path_leaf_el(right_path); 1990dcd0538fSMark Fasheh 1991dcd0538fSMark Fasheh ocfs2_insert_at_leaf(insert_rec, el, insert, inode); 1992dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, leaf_bh); 1993dcd0538fSMark Fasheh if (ret) 1994dcd0538fSMark Fasheh mlog_errno(ret); 1995dcd0538fSMark Fasheh 1996dcd0538fSMark Fasheh if (left_path) { 1997dcd0538fSMark Fasheh /* 1998dcd0538fSMark Fasheh * The rotate code has indicated that we need to fix 1999dcd0538fSMark Fasheh * up portions of the tree after the insert. 2000dcd0538fSMark Fasheh * 2001dcd0538fSMark Fasheh * XXX: Should we extend the transaction here? 2002dcd0538fSMark Fasheh */ 2003dcd0538fSMark Fasheh subtree_index = ocfs2_find_subtree_root(inode, left_path, 2004dcd0538fSMark Fasheh right_path); 2005dcd0538fSMark Fasheh ocfs2_complete_edge_insert(inode, handle, left_path, 2006dcd0538fSMark Fasheh right_path, subtree_index); 2007dcd0538fSMark Fasheh } 2008dcd0538fSMark Fasheh 2009dcd0538fSMark Fasheh ret = 0; 2010dcd0538fSMark Fasheh out: 2011dcd0538fSMark Fasheh return ret; 2012dcd0538fSMark Fasheh } 2013dcd0538fSMark Fasheh 2014dcd0538fSMark Fasheh static int ocfs2_do_insert_extent(struct inode *inode, 2015dcd0538fSMark Fasheh handle_t *handle, 2016dcd0538fSMark Fasheh struct buffer_head *di_bh, 2017dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 2018dcd0538fSMark Fasheh struct ocfs2_insert_type *type) 2019dcd0538fSMark Fasheh { 2020dcd0538fSMark Fasheh int ret, rotate = 0; 2021dcd0538fSMark Fasheh u32 cpos; 2022dcd0538fSMark Fasheh struct ocfs2_path *right_path = NULL; 2023dcd0538fSMark Fasheh struct ocfs2_path *left_path = NULL; 2024dcd0538fSMark Fasheh struct ocfs2_dinode *di; 2025dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 2026dcd0538fSMark Fasheh 2027dcd0538fSMark Fasheh di = (struct ocfs2_dinode *) di_bh->b_data; 2028dcd0538fSMark Fasheh el = &di->id2.i_list; 2029dcd0538fSMark Fasheh 2030dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, di_bh, 2031dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 2032dcd0538fSMark Fasheh if (ret) { 2033dcd0538fSMark Fasheh mlog_errno(ret); 2034dcd0538fSMark Fasheh goto out; 2035dcd0538fSMark Fasheh } 2036dcd0538fSMark Fasheh 2037dcd0538fSMark Fasheh if (le16_to_cpu(el->l_tree_depth) == 0) { 2038dcd0538fSMark Fasheh ocfs2_insert_at_leaf(insert_rec, el, type, inode); 2039dcd0538fSMark Fasheh goto out_update_clusters; 2040dcd0538fSMark Fasheh } 2041dcd0538fSMark Fasheh 2042dcd0538fSMark Fasheh right_path = ocfs2_new_inode_path(di_bh); 2043dcd0538fSMark Fasheh if (!right_path) { 2044dcd0538fSMark Fasheh ret = -ENOMEM; 2045dcd0538fSMark Fasheh mlog_errno(ret); 2046dcd0538fSMark Fasheh goto out; 2047dcd0538fSMark Fasheh } 2048dcd0538fSMark Fasheh 2049dcd0538fSMark Fasheh /* 2050dcd0538fSMark Fasheh * Determine the path to start with. Rotations need the 2051dcd0538fSMark Fasheh * rightmost path, everything else can go directly to the 2052dcd0538fSMark Fasheh * target leaf. 2053dcd0538fSMark Fasheh */ 2054dcd0538fSMark Fasheh cpos = le32_to_cpu(insert_rec->e_cpos); 2055dcd0538fSMark Fasheh if (type->ins_appending == APPEND_NONE && 2056dcd0538fSMark Fasheh type->ins_contig == CONTIG_NONE) { 2057dcd0538fSMark Fasheh rotate = 1; 2058dcd0538fSMark Fasheh cpos = UINT_MAX; 2059dcd0538fSMark Fasheh } 2060dcd0538fSMark Fasheh 2061dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, right_path, cpos); 2062dcd0538fSMark Fasheh if (ret) { 2063dcd0538fSMark Fasheh mlog_errno(ret); 2064dcd0538fSMark Fasheh goto out; 2065dcd0538fSMark Fasheh } 2066dcd0538fSMark Fasheh 2067dcd0538fSMark Fasheh /* 2068dcd0538fSMark Fasheh * Rotations and appends need special treatment - they modify 2069dcd0538fSMark Fasheh * parts of the tree's above them. 2070dcd0538fSMark Fasheh * 2071dcd0538fSMark Fasheh * Both might pass back a path immediate to the left of the 2072dcd0538fSMark Fasheh * one being inserted to. This will be cause 2073dcd0538fSMark Fasheh * ocfs2_insert_path() to modify the rightmost records of 2074dcd0538fSMark Fasheh * left_path to account for an edge insert. 2075dcd0538fSMark Fasheh * 2076dcd0538fSMark Fasheh * XXX: When modifying this code, keep in mind that an insert 2077dcd0538fSMark Fasheh * can wind up skipping both of these two special cases... 2078dcd0538fSMark Fasheh */ 2079dcd0538fSMark Fasheh if (rotate) { 2080dcd0538fSMark Fasheh ret = ocfs2_rotate_tree_right(inode, handle, 2081dcd0538fSMark Fasheh le32_to_cpu(insert_rec->e_cpos), 2082dcd0538fSMark Fasheh right_path, &left_path); 2083dcd0538fSMark Fasheh if (ret) { 2084dcd0538fSMark Fasheh mlog_errno(ret); 2085dcd0538fSMark Fasheh goto out; 2086dcd0538fSMark Fasheh } 2087dcd0538fSMark Fasheh } else if (type->ins_appending == APPEND_TAIL 2088dcd0538fSMark Fasheh && type->ins_contig != CONTIG_LEFT) { 2089dcd0538fSMark Fasheh ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, 2090dcd0538fSMark Fasheh right_path, &left_path); 2091dcd0538fSMark Fasheh if (ret) { 2092dcd0538fSMark Fasheh mlog_errno(ret); 2093dcd0538fSMark Fasheh goto out; 2094dcd0538fSMark Fasheh } 2095dcd0538fSMark Fasheh } 2096dcd0538fSMark Fasheh 2097dcd0538fSMark Fasheh ret = ocfs2_insert_path(inode, handle, left_path, right_path, 2098dcd0538fSMark Fasheh insert_rec, type); 2099dcd0538fSMark Fasheh if (ret) { 2100dcd0538fSMark Fasheh mlog_errno(ret); 2101dcd0538fSMark Fasheh goto out; 2102dcd0538fSMark Fasheh } 2103dcd0538fSMark Fasheh 2104dcd0538fSMark Fasheh out_update_clusters: 2105dcd0538fSMark Fasheh ocfs2_update_dinode_clusters(inode, di, 2106e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 2107dcd0538fSMark Fasheh 2108dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, di_bh); 2109dcd0538fSMark Fasheh if (ret) 2110dcd0538fSMark Fasheh mlog_errno(ret); 2111dcd0538fSMark Fasheh 2112dcd0538fSMark Fasheh out: 2113dcd0538fSMark Fasheh ocfs2_free_path(left_path); 2114dcd0538fSMark Fasheh ocfs2_free_path(right_path); 2115dcd0538fSMark Fasheh 2116dcd0538fSMark Fasheh return ret; 2117dcd0538fSMark Fasheh } 2118dcd0538fSMark Fasheh 2119dcd0538fSMark Fasheh static void ocfs2_figure_contig_type(struct inode *inode, 2120dcd0538fSMark Fasheh struct ocfs2_insert_type *insert, 2121dcd0538fSMark Fasheh struct ocfs2_extent_list *el, 2122dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 2123dcd0538fSMark Fasheh { 2124dcd0538fSMark Fasheh int i; 2125dcd0538fSMark Fasheh enum ocfs2_contig_type contig_type = CONTIG_NONE; 2126dcd0538fSMark Fasheh 2127e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2128e48edee2SMark Fasheh 2129dcd0538fSMark Fasheh for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 2130dcd0538fSMark Fasheh contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], 2131dcd0538fSMark Fasheh insert_rec); 2132dcd0538fSMark Fasheh if (contig_type != CONTIG_NONE) { 2133dcd0538fSMark Fasheh insert->ins_contig_index = i; 2134dcd0538fSMark Fasheh break; 2135dcd0538fSMark Fasheh } 2136dcd0538fSMark Fasheh } 2137dcd0538fSMark Fasheh insert->ins_contig = contig_type; 2138dcd0538fSMark Fasheh } 2139dcd0538fSMark Fasheh 2140dcd0538fSMark Fasheh /* 2141dcd0538fSMark Fasheh * This should only be called against the righmost leaf extent list. 2142dcd0538fSMark Fasheh * 2143dcd0538fSMark Fasheh * ocfs2_figure_appending_type() will figure out whether we'll have to 2144dcd0538fSMark Fasheh * insert at the tail of the rightmost leaf. 2145dcd0538fSMark Fasheh * 2146dcd0538fSMark Fasheh * This should also work against the dinode list for tree's with 0 2147dcd0538fSMark Fasheh * depth. If we consider the dinode list to be the rightmost leaf node 2148dcd0538fSMark Fasheh * then the logic here makes sense. 2149dcd0538fSMark Fasheh */ 2150dcd0538fSMark Fasheh static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, 2151dcd0538fSMark Fasheh struct ocfs2_extent_list *el, 2152dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 2153dcd0538fSMark Fasheh { 2154dcd0538fSMark Fasheh int i; 2155dcd0538fSMark Fasheh u32 cpos = le32_to_cpu(insert_rec->e_cpos); 2156dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 2157dcd0538fSMark Fasheh 2158dcd0538fSMark Fasheh insert->ins_appending = APPEND_NONE; 2159dcd0538fSMark Fasheh 2160e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2161dcd0538fSMark Fasheh 2162dcd0538fSMark Fasheh if (!el->l_next_free_rec) 2163dcd0538fSMark Fasheh goto set_tail_append; 2164dcd0538fSMark Fasheh 2165dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) { 2166dcd0538fSMark Fasheh /* Were all records empty? */ 2167dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 1) 2168dcd0538fSMark Fasheh goto set_tail_append; 2169dcd0538fSMark Fasheh } 2170dcd0538fSMark Fasheh 2171dcd0538fSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 2172dcd0538fSMark Fasheh rec = &el->l_recs[i]; 2173dcd0538fSMark Fasheh 2174e48edee2SMark Fasheh if (cpos >= 2175e48edee2SMark Fasheh (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) 2176dcd0538fSMark Fasheh goto set_tail_append; 2177dcd0538fSMark Fasheh 2178dcd0538fSMark Fasheh return; 2179dcd0538fSMark Fasheh 2180dcd0538fSMark Fasheh set_tail_append: 2181dcd0538fSMark Fasheh insert->ins_appending = APPEND_TAIL; 2182dcd0538fSMark Fasheh } 2183dcd0538fSMark Fasheh 2184dcd0538fSMark Fasheh /* 2185dcd0538fSMark Fasheh * Helper function called at the begining of an insert. 2186dcd0538fSMark Fasheh * 2187dcd0538fSMark Fasheh * This computes a few things that are commonly used in the process of 2188dcd0538fSMark Fasheh * inserting into the btree: 2189dcd0538fSMark Fasheh * - Whether the new extent is contiguous with an existing one. 2190dcd0538fSMark Fasheh * - The current tree depth. 2191dcd0538fSMark Fasheh * - Whether the insert is an appending one. 2192dcd0538fSMark Fasheh * - The total # of free records in the tree. 2193dcd0538fSMark Fasheh * 2194dcd0538fSMark Fasheh * All of the information is stored on the ocfs2_insert_type 2195dcd0538fSMark Fasheh * structure. 2196dcd0538fSMark Fasheh */ 2197dcd0538fSMark Fasheh static int ocfs2_figure_insert_type(struct inode *inode, 2198dcd0538fSMark Fasheh struct buffer_head *di_bh, 2199dcd0538fSMark Fasheh struct buffer_head **last_eb_bh, 2200dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 2201dcd0538fSMark Fasheh struct ocfs2_insert_type *insert) 2202dcd0538fSMark Fasheh { 2203dcd0538fSMark Fasheh int ret; 2204dcd0538fSMark Fasheh struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2205dcd0538fSMark Fasheh struct ocfs2_extent_block *eb; 2206dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 2207dcd0538fSMark Fasheh struct ocfs2_path *path = NULL; 2208dcd0538fSMark Fasheh struct buffer_head *bh = NULL; 2209dcd0538fSMark Fasheh 2210dcd0538fSMark Fasheh el = &di->id2.i_list; 2211dcd0538fSMark Fasheh insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 2212dcd0538fSMark Fasheh 2213dcd0538fSMark Fasheh if (el->l_tree_depth) { 2214dcd0538fSMark Fasheh /* 2215dcd0538fSMark Fasheh * If we have tree depth, we read in the 2216dcd0538fSMark Fasheh * rightmost extent block ahead of time as 2217dcd0538fSMark Fasheh * ocfs2_figure_insert_type() and ocfs2_add_branch() 2218dcd0538fSMark Fasheh * may want it later. 2219dcd0538fSMark Fasheh */ 2220dcd0538fSMark Fasheh ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 2221dcd0538fSMark Fasheh le64_to_cpu(di->i_last_eb_blk), &bh, 2222dcd0538fSMark Fasheh OCFS2_BH_CACHED, inode); 2223dcd0538fSMark Fasheh if (ret) { 2224dcd0538fSMark Fasheh mlog_exit(ret); 2225dcd0538fSMark Fasheh goto out; 2226dcd0538fSMark Fasheh } 2227dcd0538fSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 2228dcd0538fSMark Fasheh el = &eb->h_list; 2229dcd0538fSMark Fasheh } 2230dcd0538fSMark Fasheh 2231dcd0538fSMark Fasheh /* 2232dcd0538fSMark Fasheh * Unless we have a contiguous insert, we'll need to know if 2233dcd0538fSMark Fasheh * there is room left in our allocation tree for another 2234dcd0538fSMark Fasheh * extent record. 2235dcd0538fSMark Fasheh * 2236dcd0538fSMark Fasheh * XXX: This test is simplistic, we can search for empty 2237dcd0538fSMark Fasheh * extent records too. 2238dcd0538fSMark Fasheh */ 2239dcd0538fSMark Fasheh insert->ins_free_records = le16_to_cpu(el->l_count) - 2240dcd0538fSMark Fasheh le16_to_cpu(el->l_next_free_rec); 2241dcd0538fSMark Fasheh 2242dcd0538fSMark Fasheh if (!insert->ins_tree_depth) { 2243dcd0538fSMark Fasheh ocfs2_figure_contig_type(inode, insert, el, insert_rec); 2244dcd0538fSMark Fasheh ocfs2_figure_appending_type(insert, el, insert_rec); 2245dcd0538fSMark Fasheh return 0; 2246dcd0538fSMark Fasheh } 2247dcd0538fSMark Fasheh 2248dcd0538fSMark Fasheh path = ocfs2_new_inode_path(di_bh); 2249dcd0538fSMark Fasheh if (!path) { 2250dcd0538fSMark Fasheh ret = -ENOMEM; 2251dcd0538fSMark Fasheh mlog_errno(ret); 2252dcd0538fSMark Fasheh goto out; 2253dcd0538fSMark Fasheh } 2254dcd0538fSMark Fasheh 2255dcd0538fSMark Fasheh /* 2256dcd0538fSMark Fasheh * In the case that we're inserting past what the tree 2257dcd0538fSMark Fasheh * currently accounts for, ocfs2_find_path() will return for 2258dcd0538fSMark Fasheh * us the rightmost tree path. This is accounted for below in 2259dcd0538fSMark Fasheh * the appending code. 2260dcd0538fSMark Fasheh */ 2261dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); 2262dcd0538fSMark Fasheh if (ret) { 2263dcd0538fSMark Fasheh mlog_errno(ret); 2264dcd0538fSMark Fasheh goto out; 2265dcd0538fSMark Fasheh } 2266dcd0538fSMark Fasheh 2267dcd0538fSMark Fasheh el = path_leaf_el(path); 2268dcd0538fSMark Fasheh 2269dcd0538fSMark Fasheh /* 2270dcd0538fSMark Fasheh * Now that we have the path, there's two things we want to determine: 2271dcd0538fSMark Fasheh * 1) Contiguousness (also set contig_index if this is so) 2272dcd0538fSMark Fasheh * 2273dcd0538fSMark Fasheh * 2) Are we doing an append? We can trivially break this up 2274dcd0538fSMark Fasheh * into two types of appends: simple record append, or a 2275dcd0538fSMark Fasheh * rotate inside the tail leaf. 2276dcd0538fSMark Fasheh */ 2277dcd0538fSMark Fasheh ocfs2_figure_contig_type(inode, insert, el, insert_rec); 2278dcd0538fSMark Fasheh 2279dcd0538fSMark Fasheh /* 2280dcd0538fSMark Fasheh * The insert code isn't quite ready to deal with all cases of 2281dcd0538fSMark Fasheh * left contiguousness. Specifically, if it's an insert into 2282dcd0538fSMark Fasheh * the 1st record in a leaf, it will require the adjustment of 2283e48edee2SMark Fasheh * cluster count on the last record of the path directly to it's 2284dcd0538fSMark Fasheh * left. For now, just catch that case and fool the layers 2285dcd0538fSMark Fasheh * above us. This works just fine for tree_depth == 0, which 2286dcd0538fSMark Fasheh * is why we allow that above. 2287dcd0538fSMark Fasheh */ 2288dcd0538fSMark Fasheh if (insert->ins_contig == CONTIG_LEFT && 2289dcd0538fSMark Fasheh insert->ins_contig_index == 0) 2290dcd0538fSMark Fasheh insert->ins_contig = CONTIG_NONE; 2291dcd0538fSMark Fasheh 2292dcd0538fSMark Fasheh /* 2293dcd0538fSMark Fasheh * Ok, so we can simply compare against last_eb to figure out 2294dcd0538fSMark Fasheh * whether the path doesn't exist. This will only happen in 2295dcd0538fSMark Fasheh * the case that we're doing a tail append, so maybe we can 2296dcd0538fSMark Fasheh * take advantage of that information somehow. 2297dcd0538fSMark Fasheh */ 2298dcd0538fSMark Fasheh if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { 2299dcd0538fSMark Fasheh /* 2300dcd0538fSMark Fasheh * Ok, ocfs2_find_path() returned us the rightmost 2301dcd0538fSMark Fasheh * tree path. This might be an appending insert. There are 2302dcd0538fSMark Fasheh * two cases: 2303dcd0538fSMark Fasheh * 1) We're doing a true append at the tail: 2304dcd0538fSMark Fasheh * -This might even be off the end of the leaf 2305dcd0538fSMark Fasheh * 2) We're "appending" by rotating in the tail 2306dcd0538fSMark Fasheh */ 2307dcd0538fSMark Fasheh ocfs2_figure_appending_type(insert, el, insert_rec); 2308dcd0538fSMark Fasheh } 2309dcd0538fSMark Fasheh 2310dcd0538fSMark Fasheh out: 2311dcd0538fSMark Fasheh ocfs2_free_path(path); 2312dcd0538fSMark Fasheh 2313dcd0538fSMark Fasheh if (ret == 0) 2314dcd0538fSMark Fasheh *last_eb_bh = bh; 2315dcd0538fSMark Fasheh else 2316dcd0538fSMark Fasheh brelse(bh); 2317dcd0538fSMark Fasheh return ret; 2318dcd0538fSMark Fasheh } 2319dcd0538fSMark Fasheh 2320dcd0538fSMark Fasheh /* 2321dcd0538fSMark Fasheh * Insert an extent into an inode btree. 2322dcd0538fSMark Fasheh * 2323dcd0538fSMark Fasheh * The caller needs to update fe->i_clusters 2324dcd0538fSMark Fasheh */ 2325ccd979bdSMark Fasheh int ocfs2_insert_extent(struct ocfs2_super *osb, 23261fabe148SMark Fasheh handle_t *handle, 2327ccd979bdSMark Fasheh struct inode *inode, 2328ccd979bdSMark Fasheh struct buffer_head *fe_bh, 2329dcd0538fSMark Fasheh u32 cpos, 2330ccd979bdSMark Fasheh u64 start_blk, 2331ccd979bdSMark Fasheh u32 new_clusters, 2332ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac) 2333ccd979bdSMark Fasheh { 2334dcd0538fSMark Fasheh int status, shift; 2335ccd979bdSMark Fasheh struct buffer_head *last_eb_bh = NULL; 2336ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 2337dcd0538fSMark Fasheh struct ocfs2_insert_type insert = {0, }; 2338dcd0538fSMark Fasheh struct ocfs2_extent_rec rec; 2339ccd979bdSMark Fasheh 2340dcd0538fSMark Fasheh mlog(0, "add %u clusters at position %u to inode %llu\n", 2341dcd0538fSMark Fasheh new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 2342ccd979bdSMark Fasheh 2343dcd0538fSMark Fasheh mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 2344dcd0538fSMark Fasheh (OCFS2_I(inode)->ip_clusters != cpos), 2345dcd0538fSMark Fasheh "Device %s, asking for sparse allocation: inode %llu, " 2346dcd0538fSMark Fasheh "cpos %u, clusters %u\n", 2347dcd0538fSMark Fasheh osb->dev_str, 2348dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, 2349dcd0538fSMark Fasheh OCFS2_I(inode)->ip_clusters); 2350ccd979bdSMark Fasheh 2351e48edee2SMark Fasheh memset(&rec, 0, sizeof(rec)); 2352dcd0538fSMark Fasheh rec.e_cpos = cpu_to_le32(cpos); 2353dcd0538fSMark Fasheh rec.e_blkno = cpu_to_le64(start_blk); 2354e48edee2SMark Fasheh rec.e_leaf_clusters = cpu_to_le16(new_clusters); 2355ccd979bdSMark Fasheh 2356dcd0538fSMark Fasheh status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 2357dcd0538fSMark Fasheh &insert); 2358ccd979bdSMark Fasheh if (status < 0) { 2359dcd0538fSMark Fasheh mlog_errno(status); 2360ccd979bdSMark Fasheh goto bail; 2361ccd979bdSMark Fasheh } 2362ccd979bdSMark Fasheh 2363dcd0538fSMark Fasheh mlog(0, "Insert.appending: %u, Insert.Contig: %u, " 2364dcd0538fSMark Fasheh "Insert.contig_index: %d, Insert.free_records: %d, " 2365dcd0538fSMark Fasheh "Insert.tree_depth: %d\n", 2366dcd0538fSMark Fasheh insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 2367dcd0538fSMark Fasheh insert.ins_free_records, insert.ins_tree_depth); 2368dcd0538fSMark Fasheh 2369dcd0538fSMark Fasheh /* 2370dcd0538fSMark Fasheh * Avoid growing the tree unless we're out of records and the 2371dcd0538fSMark Fasheh * insert type requres one. 2372dcd0538fSMark Fasheh */ 2373dcd0538fSMark Fasheh if (insert.ins_contig != CONTIG_NONE || insert.ins_free_records) 2374ccd979bdSMark Fasheh goto out_add; 2375ccd979bdSMark Fasheh 2376ccd979bdSMark Fasheh shift = ocfs2_find_branch_target(osb, inode, fe_bh, &bh); 2377ccd979bdSMark Fasheh if (shift < 0) { 2378ccd979bdSMark Fasheh status = shift; 2379ccd979bdSMark Fasheh mlog_errno(status); 2380ccd979bdSMark Fasheh goto bail; 2381ccd979bdSMark Fasheh } 2382ccd979bdSMark Fasheh 2383ccd979bdSMark Fasheh /* We traveled all the way to the bottom of the allocation tree 2384ccd979bdSMark Fasheh * and didn't find room for any more extents - we need to add 2385ccd979bdSMark Fasheh * another tree level */ 2386ccd979bdSMark Fasheh if (shift) { 2387ccd979bdSMark Fasheh BUG_ON(bh); 2388dcd0538fSMark Fasheh mlog(0, "need to shift tree depth " 2389dcd0538fSMark Fasheh "(current = %d)\n", insert.ins_tree_depth); 2390ccd979bdSMark Fasheh 2391ccd979bdSMark Fasheh /* ocfs2_shift_tree_depth will return us a buffer with 2392ccd979bdSMark Fasheh * the new extent block (so we can pass that to 2393ccd979bdSMark Fasheh * ocfs2_add_branch). */ 2394ccd979bdSMark Fasheh status = ocfs2_shift_tree_depth(osb, handle, inode, fe_bh, 2395ccd979bdSMark Fasheh meta_ac, &bh); 2396ccd979bdSMark Fasheh if (status < 0) { 2397ccd979bdSMark Fasheh mlog_errno(status); 2398ccd979bdSMark Fasheh goto bail; 2399ccd979bdSMark Fasheh } 2400dcd0538fSMark Fasheh insert.ins_tree_depth++; 2401ccd979bdSMark Fasheh /* Special case: we have room now if we shifted from 2402ccd979bdSMark Fasheh * tree_depth 0 */ 2403dcd0538fSMark Fasheh if (insert.ins_tree_depth == 1) 2404ccd979bdSMark Fasheh goto out_add; 2405ccd979bdSMark Fasheh } 2406ccd979bdSMark Fasheh 2407ccd979bdSMark Fasheh /* call ocfs2_add_branch to add the final part of the tree with 2408ccd979bdSMark Fasheh * the new data. */ 2409dcd0538fSMark Fasheh mlog(0, "add branch. bh = %p\n", bh); 2410ccd979bdSMark Fasheh status = ocfs2_add_branch(osb, handle, inode, fe_bh, bh, last_eb_bh, 2411ccd979bdSMark Fasheh meta_ac); 2412ccd979bdSMark Fasheh if (status < 0) { 2413ccd979bdSMark Fasheh mlog_errno(status); 2414ccd979bdSMark Fasheh goto bail; 2415ccd979bdSMark Fasheh } 2416ccd979bdSMark Fasheh 2417ccd979bdSMark Fasheh out_add: 2418dcd0538fSMark Fasheh /* Finally, we can add clusters. This might rotate the tree for us. */ 2419dcd0538fSMark Fasheh status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); 2420ccd979bdSMark Fasheh if (status < 0) 2421ccd979bdSMark Fasheh mlog_errno(status); 242283418978SMark Fasheh else 242383418978SMark Fasheh ocfs2_extent_map_insert_rec(inode, &rec); 2424ccd979bdSMark Fasheh 2425ccd979bdSMark Fasheh bail: 2426ccd979bdSMark Fasheh if (bh) 2427ccd979bdSMark Fasheh brelse(bh); 2428ccd979bdSMark Fasheh 2429ccd979bdSMark Fasheh if (last_eb_bh) 2430ccd979bdSMark Fasheh brelse(last_eb_bh); 2431ccd979bdSMark Fasheh 2432ccd979bdSMark Fasheh mlog_exit(status); 2433ccd979bdSMark Fasheh return status; 2434ccd979bdSMark Fasheh } 2435ccd979bdSMark Fasheh 2436ccd979bdSMark Fasheh static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 2437ccd979bdSMark Fasheh { 2438ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2439ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2440ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2441ccd979bdSMark Fasheh 2442ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2443ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2444ccd979bdSMark Fasheh 2445ccd979bdSMark Fasheh mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), 2446ccd979bdSMark Fasheh "slot %d, invalid truncate log parameters: used = " 2447ccd979bdSMark Fasheh "%u, count = %u\n", osb->slot_num, 2448ccd979bdSMark Fasheh le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); 2449ccd979bdSMark Fasheh return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); 2450ccd979bdSMark Fasheh } 2451ccd979bdSMark Fasheh 2452ccd979bdSMark Fasheh static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, 2453ccd979bdSMark Fasheh unsigned int new_start) 2454ccd979bdSMark Fasheh { 2455ccd979bdSMark Fasheh unsigned int tail_index; 2456ccd979bdSMark Fasheh unsigned int current_tail; 2457ccd979bdSMark Fasheh 2458ccd979bdSMark Fasheh /* No records, nothing to coalesce */ 2459ccd979bdSMark Fasheh if (!le16_to_cpu(tl->tl_used)) 2460ccd979bdSMark Fasheh return 0; 2461ccd979bdSMark Fasheh 2462ccd979bdSMark Fasheh tail_index = le16_to_cpu(tl->tl_used) - 1; 2463ccd979bdSMark Fasheh current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); 2464ccd979bdSMark Fasheh current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); 2465ccd979bdSMark Fasheh 2466ccd979bdSMark Fasheh return current_tail == new_start; 2467ccd979bdSMark Fasheh } 2468ccd979bdSMark Fasheh 2469ccd979bdSMark Fasheh static int ocfs2_truncate_log_append(struct ocfs2_super *osb, 24701fabe148SMark Fasheh handle_t *handle, 2471ccd979bdSMark Fasheh u64 start_blk, 2472ccd979bdSMark Fasheh unsigned int num_clusters) 2473ccd979bdSMark Fasheh { 2474ccd979bdSMark Fasheh int status, index; 2475ccd979bdSMark Fasheh unsigned int start_cluster, tl_count; 2476ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2477ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2478ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2479ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2480ccd979bdSMark Fasheh 2481b0697053SMark Fasheh mlog_entry("start_blk = %llu, num_clusters = %u\n", 2482b0697053SMark Fasheh (unsigned long long)start_blk, num_clusters); 2483ccd979bdSMark Fasheh 24841b1dcc1bSJes Sorensen BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 2485ccd979bdSMark Fasheh 2486ccd979bdSMark Fasheh start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 2487ccd979bdSMark Fasheh 2488ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2489ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2490ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(di)) { 2491ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); 2492ccd979bdSMark Fasheh status = -EIO; 2493ccd979bdSMark Fasheh goto bail; 2494ccd979bdSMark Fasheh } 2495ccd979bdSMark Fasheh 2496ccd979bdSMark Fasheh tl_count = le16_to_cpu(tl->tl_count); 2497ccd979bdSMark Fasheh mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || 2498ccd979bdSMark Fasheh tl_count == 0, 2499b0697053SMark Fasheh "Truncate record count on #%llu invalid " 2500b0697053SMark Fasheh "wanted %u, actual %u\n", 2501b0697053SMark Fasheh (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, 2502ccd979bdSMark Fasheh ocfs2_truncate_recs_per_inode(osb->sb), 2503ccd979bdSMark Fasheh le16_to_cpu(tl->tl_count)); 2504ccd979bdSMark Fasheh 2505ccd979bdSMark Fasheh /* Caller should have known to flush before calling us. */ 2506ccd979bdSMark Fasheh index = le16_to_cpu(tl->tl_used); 2507ccd979bdSMark Fasheh if (index >= tl_count) { 2508ccd979bdSMark Fasheh status = -ENOSPC; 2509ccd979bdSMark Fasheh mlog_errno(status); 2510ccd979bdSMark Fasheh goto bail; 2511ccd979bdSMark Fasheh } 2512ccd979bdSMark Fasheh 2513ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, tl_inode, tl_bh, 2514ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 2515ccd979bdSMark Fasheh if (status < 0) { 2516ccd979bdSMark Fasheh mlog_errno(status); 2517ccd979bdSMark Fasheh goto bail; 2518ccd979bdSMark Fasheh } 2519ccd979bdSMark Fasheh 2520ccd979bdSMark Fasheh mlog(0, "Log truncate of %u clusters starting at cluster %u to " 2521b0697053SMark Fasheh "%llu (index = %d)\n", num_clusters, start_cluster, 2522b0697053SMark Fasheh (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index); 2523ccd979bdSMark Fasheh 2524ccd979bdSMark Fasheh if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { 2525ccd979bdSMark Fasheh /* 2526ccd979bdSMark Fasheh * Move index back to the record we are coalescing with. 2527ccd979bdSMark Fasheh * ocfs2_truncate_log_can_coalesce() guarantees nonzero 2528ccd979bdSMark Fasheh */ 2529ccd979bdSMark Fasheh index--; 2530ccd979bdSMark Fasheh 2531ccd979bdSMark Fasheh num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); 2532ccd979bdSMark Fasheh mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", 2533ccd979bdSMark Fasheh index, le32_to_cpu(tl->tl_recs[index].t_start), 2534ccd979bdSMark Fasheh num_clusters); 2535ccd979bdSMark Fasheh } else { 2536ccd979bdSMark Fasheh tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); 2537ccd979bdSMark Fasheh tl->tl_used = cpu_to_le16(index + 1); 2538ccd979bdSMark Fasheh } 2539ccd979bdSMark Fasheh tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 2540ccd979bdSMark Fasheh 2541ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, tl_bh); 2542ccd979bdSMark Fasheh if (status < 0) { 2543ccd979bdSMark Fasheh mlog_errno(status); 2544ccd979bdSMark Fasheh goto bail; 2545ccd979bdSMark Fasheh } 2546ccd979bdSMark Fasheh 2547ccd979bdSMark Fasheh bail: 2548ccd979bdSMark Fasheh mlog_exit(status); 2549ccd979bdSMark Fasheh return status; 2550ccd979bdSMark Fasheh } 2551ccd979bdSMark Fasheh 2552ccd979bdSMark Fasheh static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, 25531fabe148SMark Fasheh handle_t *handle, 2554ccd979bdSMark Fasheh struct inode *data_alloc_inode, 2555ccd979bdSMark Fasheh struct buffer_head *data_alloc_bh) 2556ccd979bdSMark Fasheh { 2557ccd979bdSMark Fasheh int status = 0; 2558ccd979bdSMark Fasheh int i; 2559ccd979bdSMark Fasheh unsigned int num_clusters; 2560ccd979bdSMark Fasheh u64 start_blk; 2561ccd979bdSMark Fasheh struct ocfs2_truncate_rec rec; 2562ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2563ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2564ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2565ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2566ccd979bdSMark Fasheh 2567ccd979bdSMark Fasheh mlog_entry_void(); 2568ccd979bdSMark Fasheh 2569ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2570ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2571ccd979bdSMark Fasheh i = le16_to_cpu(tl->tl_used) - 1; 2572ccd979bdSMark Fasheh while (i >= 0) { 2573ccd979bdSMark Fasheh /* Caller has given us at least enough credits to 2574ccd979bdSMark Fasheh * update the truncate log dinode */ 2575ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, tl_inode, tl_bh, 2576ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 2577ccd979bdSMark Fasheh if (status < 0) { 2578ccd979bdSMark Fasheh mlog_errno(status); 2579ccd979bdSMark Fasheh goto bail; 2580ccd979bdSMark Fasheh } 2581ccd979bdSMark Fasheh 2582ccd979bdSMark Fasheh tl->tl_used = cpu_to_le16(i); 2583ccd979bdSMark Fasheh 2584ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, tl_bh); 2585ccd979bdSMark Fasheh if (status < 0) { 2586ccd979bdSMark Fasheh mlog_errno(status); 2587ccd979bdSMark Fasheh goto bail; 2588ccd979bdSMark Fasheh } 2589ccd979bdSMark Fasheh 2590ccd979bdSMark Fasheh /* TODO: Perhaps we can calculate the bulk of the 2591ccd979bdSMark Fasheh * credits up front rather than extending like 2592ccd979bdSMark Fasheh * this. */ 2593ccd979bdSMark Fasheh status = ocfs2_extend_trans(handle, 2594ccd979bdSMark Fasheh OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); 2595ccd979bdSMark Fasheh if (status < 0) { 2596ccd979bdSMark Fasheh mlog_errno(status); 2597ccd979bdSMark Fasheh goto bail; 2598ccd979bdSMark Fasheh } 2599ccd979bdSMark Fasheh 2600ccd979bdSMark Fasheh rec = tl->tl_recs[i]; 2601ccd979bdSMark Fasheh start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 2602ccd979bdSMark Fasheh le32_to_cpu(rec.t_start)); 2603ccd979bdSMark Fasheh num_clusters = le32_to_cpu(rec.t_clusters); 2604ccd979bdSMark Fasheh 2605ccd979bdSMark Fasheh /* if start_blk is not set, we ignore the record as 2606ccd979bdSMark Fasheh * invalid. */ 2607ccd979bdSMark Fasheh if (start_blk) { 2608ccd979bdSMark Fasheh mlog(0, "free record %d, start = %u, clusters = %u\n", 2609ccd979bdSMark Fasheh i, le32_to_cpu(rec.t_start), num_clusters); 2610ccd979bdSMark Fasheh 2611ccd979bdSMark Fasheh status = ocfs2_free_clusters(handle, data_alloc_inode, 2612ccd979bdSMark Fasheh data_alloc_bh, start_blk, 2613ccd979bdSMark Fasheh num_clusters); 2614ccd979bdSMark Fasheh if (status < 0) { 2615ccd979bdSMark Fasheh mlog_errno(status); 2616ccd979bdSMark Fasheh goto bail; 2617ccd979bdSMark Fasheh } 2618ccd979bdSMark Fasheh } 2619ccd979bdSMark Fasheh i--; 2620ccd979bdSMark Fasheh } 2621ccd979bdSMark Fasheh 2622ccd979bdSMark Fasheh bail: 2623ccd979bdSMark Fasheh mlog_exit(status); 2624ccd979bdSMark Fasheh return status; 2625ccd979bdSMark Fasheh } 2626ccd979bdSMark Fasheh 26271b1dcc1bSJes Sorensen /* Expects you to already be holding tl_inode->i_mutex */ 2628ccd979bdSMark Fasheh static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 2629ccd979bdSMark Fasheh { 2630ccd979bdSMark Fasheh int status; 2631ccd979bdSMark Fasheh unsigned int num_to_flush; 26321fabe148SMark Fasheh handle_t *handle; 2633ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2634ccd979bdSMark Fasheh struct inode *data_alloc_inode = NULL; 2635ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2636ccd979bdSMark Fasheh struct buffer_head *data_alloc_bh = NULL; 2637ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2638ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2639ccd979bdSMark Fasheh 2640ccd979bdSMark Fasheh mlog_entry_void(); 2641ccd979bdSMark Fasheh 26421b1dcc1bSJes Sorensen BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 2643ccd979bdSMark Fasheh 2644ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2645ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2646ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(di)) { 2647ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); 2648ccd979bdSMark Fasheh status = -EIO; 2649e08dc8b9SMark Fasheh goto out; 2650ccd979bdSMark Fasheh } 2651ccd979bdSMark Fasheh 2652ccd979bdSMark Fasheh num_to_flush = le16_to_cpu(tl->tl_used); 2653b0697053SMark Fasheh mlog(0, "Flush %u records from truncate log #%llu\n", 2654b0697053SMark Fasheh num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 2655ccd979bdSMark Fasheh if (!num_to_flush) { 2656ccd979bdSMark Fasheh status = 0; 2657e08dc8b9SMark Fasheh goto out; 2658ccd979bdSMark Fasheh } 2659ccd979bdSMark Fasheh 2660ccd979bdSMark Fasheh data_alloc_inode = ocfs2_get_system_file_inode(osb, 2661ccd979bdSMark Fasheh GLOBAL_BITMAP_SYSTEM_INODE, 2662ccd979bdSMark Fasheh OCFS2_INVALID_SLOT); 2663ccd979bdSMark Fasheh if (!data_alloc_inode) { 2664ccd979bdSMark Fasheh status = -EINVAL; 2665ccd979bdSMark Fasheh mlog(ML_ERROR, "Could not get bitmap inode!\n"); 2666e08dc8b9SMark Fasheh goto out; 2667ccd979bdSMark Fasheh } 2668ccd979bdSMark Fasheh 2669e08dc8b9SMark Fasheh mutex_lock(&data_alloc_inode->i_mutex); 2670e08dc8b9SMark Fasheh 26714bcec184SMark Fasheh status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1); 2672ccd979bdSMark Fasheh if (status < 0) { 2673ccd979bdSMark Fasheh mlog_errno(status); 2674e08dc8b9SMark Fasheh goto out_mutex; 2675ccd979bdSMark Fasheh } 2676ccd979bdSMark Fasheh 267765eff9ccSMark Fasheh handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 2678ccd979bdSMark Fasheh if (IS_ERR(handle)) { 2679ccd979bdSMark Fasheh status = PTR_ERR(handle); 2680ccd979bdSMark Fasheh mlog_errno(status); 2681e08dc8b9SMark Fasheh goto out_unlock; 2682ccd979bdSMark Fasheh } 2683ccd979bdSMark Fasheh 2684ccd979bdSMark Fasheh status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, 2685ccd979bdSMark Fasheh data_alloc_bh); 2686e08dc8b9SMark Fasheh if (status < 0) 2687ccd979bdSMark Fasheh mlog_errno(status); 2688ccd979bdSMark Fasheh 268902dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 2690ccd979bdSMark Fasheh 2691e08dc8b9SMark Fasheh out_unlock: 2692e08dc8b9SMark Fasheh brelse(data_alloc_bh); 2693e08dc8b9SMark Fasheh ocfs2_meta_unlock(data_alloc_inode, 1); 2694e08dc8b9SMark Fasheh 2695e08dc8b9SMark Fasheh out_mutex: 2696e08dc8b9SMark Fasheh mutex_unlock(&data_alloc_inode->i_mutex); 2697ccd979bdSMark Fasheh iput(data_alloc_inode); 2698ccd979bdSMark Fasheh 2699e08dc8b9SMark Fasheh out: 2700ccd979bdSMark Fasheh mlog_exit(status); 2701ccd979bdSMark Fasheh return status; 2702ccd979bdSMark Fasheh } 2703ccd979bdSMark Fasheh 2704ccd979bdSMark Fasheh int ocfs2_flush_truncate_log(struct ocfs2_super *osb) 2705ccd979bdSMark Fasheh { 2706ccd979bdSMark Fasheh int status; 2707ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2708ccd979bdSMark Fasheh 27091b1dcc1bSJes Sorensen mutex_lock(&tl_inode->i_mutex); 2710ccd979bdSMark Fasheh status = __ocfs2_flush_truncate_log(osb); 27111b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 2712ccd979bdSMark Fasheh 2713ccd979bdSMark Fasheh return status; 2714ccd979bdSMark Fasheh } 2715ccd979bdSMark Fasheh 2716c4028958SDavid Howells static void ocfs2_truncate_log_worker(struct work_struct *work) 2717ccd979bdSMark Fasheh { 2718ccd979bdSMark Fasheh int status; 2719c4028958SDavid Howells struct ocfs2_super *osb = 2720c4028958SDavid Howells container_of(work, struct ocfs2_super, 2721c4028958SDavid Howells osb_truncate_log_wq.work); 2722ccd979bdSMark Fasheh 2723ccd979bdSMark Fasheh mlog_entry_void(); 2724ccd979bdSMark Fasheh 2725ccd979bdSMark Fasheh status = ocfs2_flush_truncate_log(osb); 2726ccd979bdSMark Fasheh if (status < 0) 2727ccd979bdSMark Fasheh mlog_errno(status); 2728ccd979bdSMark Fasheh 2729ccd979bdSMark Fasheh mlog_exit(status); 2730ccd979bdSMark Fasheh } 2731ccd979bdSMark Fasheh 2732ccd979bdSMark Fasheh #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) 2733ccd979bdSMark Fasheh void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, 2734ccd979bdSMark Fasheh int cancel) 2735ccd979bdSMark Fasheh { 2736ccd979bdSMark Fasheh if (osb->osb_tl_inode) { 2737ccd979bdSMark Fasheh /* We want to push off log flushes while truncates are 2738ccd979bdSMark Fasheh * still running. */ 2739ccd979bdSMark Fasheh if (cancel) 2740ccd979bdSMark Fasheh cancel_delayed_work(&osb->osb_truncate_log_wq); 2741ccd979bdSMark Fasheh 2742ccd979bdSMark Fasheh queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 2743ccd979bdSMark Fasheh OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 2744ccd979bdSMark Fasheh } 2745ccd979bdSMark Fasheh } 2746ccd979bdSMark Fasheh 2747ccd979bdSMark Fasheh static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, 2748ccd979bdSMark Fasheh int slot_num, 2749ccd979bdSMark Fasheh struct inode **tl_inode, 2750ccd979bdSMark Fasheh struct buffer_head **tl_bh) 2751ccd979bdSMark Fasheh { 2752ccd979bdSMark Fasheh int status; 2753ccd979bdSMark Fasheh struct inode *inode = NULL; 2754ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 2755ccd979bdSMark Fasheh 2756ccd979bdSMark Fasheh inode = ocfs2_get_system_file_inode(osb, 2757ccd979bdSMark Fasheh TRUNCATE_LOG_SYSTEM_INODE, 2758ccd979bdSMark Fasheh slot_num); 2759ccd979bdSMark Fasheh if (!inode) { 2760ccd979bdSMark Fasheh status = -EINVAL; 2761ccd979bdSMark Fasheh mlog(ML_ERROR, "Could not get load truncate log inode!\n"); 2762ccd979bdSMark Fasheh goto bail; 2763ccd979bdSMark Fasheh } 2764ccd979bdSMark Fasheh 2765ccd979bdSMark Fasheh status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 2766ccd979bdSMark Fasheh OCFS2_BH_CACHED, inode); 2767ccd979bdSMark Fasheh if (status < 0) { 2768ccd979bdSMark Fasheh iput(inode); 2769ccd979bdSMark Fasheh mlog_errno(status); 2770ccd979bdSMark Fasheh goto bail; 2771ccd979bdSMark Fasheh } 2772ccd979bdSMark Fasheh 2773ccd979bdSMark Fasheh *tl_inode = inode; 2774ccd979bdSMark Fasheh *tl_bh = bh; 2775ccd979bdSMark Fasheh bail: 2776ccd979bdSMark Fasheh mlog_exit(status); 2777ccd979bdSMark Fasheh return status; 2778ccd979bdSMark Fasheh } 2779ccd979bdSMark Fasheh 2780ccd979bdSMark Fasheh /* called during the 1st stage of node recovery. we stamp a clean 2781ccd979bdSMark Fasheh * truncate log and pass back a copy for processing later. if the 2782ccd979bdSMark Fasheh * truncate log does not require processing, a *tl_copy is set to 2783ccd979bdSMark Fasheh * NULL. */ 2784ccd979bdSMark Fasheh int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, 2785ccd979bdSMark Fasheh int slot_num, 2786ccd979bdSMark Fasheh struct ocfs2_dinode **tl_copy) 2787ccd979bdSMark Fasheh { 2788ccd979bdSMark Fasheh int status; 2789ccd979bdSMark Fasheh struct inode *tl_inode = NULL; 2790ccd979bdSMark Fasheh struct buffer_head *tl_bh = NULL; 2791ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2792ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2793ccd979bdSMark Fasheh 2794ccd979bdSMark Fasheh *tl_copy = NULL; 2795ccd979bdSMark Fasheh 2796ccd979bdSMark Fasheh mlog(0, "recover truncate log from slot %d\n", slot_num); 2797ccd979bdSMark Fasheh 2798ccd979bdSMark Fasheh status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); 2799ccd979bdSMark Fasheh if (status < 0) { 2800ccd979bdSMark Fasheh mlog_errno(status); 2801ccd979bdSMark Fasheh goto bail; 2802ccd979bdSMark Fasheh } 2803ccd979bdSMark Fasheh 2804ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2805ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2806ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(di)) { 2807ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); 2808ccd979bdSMark Fasheh status = -EIO; 2809ccd979bdSMark Fasheh goto bail; 2810ccd979bdSMark Fasheh } 2811ccd979bdSMark Fasheh 2812ccd979bdSMark Fasheh if (le16_to_cpu(tl->tl_used)) { 2813ccd979bdSMark Fasheh mlog(0, "We'll have %u logs to recover\n", 2814ccd979bdSMark Fasheh le16_to_cpu(tl->tl_used)); 2815ccd979bdSMark Fasheh 2816ccd979bdSMark Fasheh *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); 2817ccd979bdSMark Fasheh if (!(*tl_copy)) { 2818ccd979bdSMark Fasheh status = -ENOMEM; 2819ccd979bdSMark Fasheh mlog_errno(status); 2820ccd979bdSMark Fasheh goto bail; 2821ccd979bdSMark Fasheh } 2822ccd979bdSMark Fasheh 2823ccd979bdSMark Fasheh /* Assuming the write-out below goes well, this copy 2824ccd979bdSMark Fasheh * will be passed back to recovery for processing. */ 2825ccd979bdSMark Fasheh memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); 2826ccd979bdSMark Fasheh 2827ccd979bdSMark Fasheh /* All we need to do to clear the truncate log is set 2828ccd979bdSMark Fasheh * tl_used. */ 2829ccd979bdSMark Fasheh tl->tl_used = 0; 2830ccd979bdSMark Fasheh 2831ccd979bdSMark Fasheh status = ocfs2_write_block(osb, tl_bh, tl_inode); 2832ccd979bdSMark Fasheh if (status < 0) { 2833ccd979bdSMark Fasheh mlog_errno(status); 2834ccd979bdSMark Fasheh goto bail; 2835ccd979bdSMark Fasheh } 2836ccd979bdSMark Fasheh } 2837ccd979bdSMark Fasheh 2838ccd979bdSMark Fasheh bail: 2839ccd979bdSMark Fasheh if (tl_inode) 2840ccd979bdSMark Fasheh iput(tl_inode); 2841ccd979bdSMark Fasheh if (tl_bh) 2842ccd979bdSMark Fasheh brelse(tl_bh); 2843ccd979bdSMark Fasheh 2844ccd979bdSMark Fasheh if (status < 0 && (*tl_copy)) { 2845ccd979bdSMark Fasheh kfree(*tl_copy); 2846ccd979bdSMark Fasheh *tl_copy = NULL; 2847ccd979bdSMark Fasheh } 2848ccd979bdSMark Fasheh 2849ccd979bdSMark Fasheh mlog_exit(status); 2850ccd979bdSMark Fasheh return status; 2851ccd979bdSMark Fasheh } 2852ccd979bdSMark Fasheh 2853ccd979bdSMark Fasheh int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 2854ccd979bdSMark Fasheh struct ocfs2_dinode *tl_copy) 2855ccd979bdSMark Fasheh { 2856ccd979bdSMark Fasheh int status = 0; 2857ccd979bdSMark Fasheh int i; 2858ccd979bdSMark Fasheh unsigned int clusters, num_recs, start_cluster; 2859ccd979bdSMark Fasheh u64 start_blk; 28601fabe148SMark Fasheh handle_t *handle; 2861ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2862ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2863ccd979bdSMark Fasheh 2864ccd979bdSMark Fasheh mlog_entry_void(); 2865ccd979bdSMark Fasheh 2866ccd979bdSMark Fasheh if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { 2867ccd979bdSMark Fasheh mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); 2868ccd979bdSMark Fasheh return -EINVAL; 2869ccd979bdSMark Fasheh } 2870ccd979bdSMark Fasheh 2871ccd979bdSMark Fasheh tl = &tl_copy->id2.i_dealloc; 2872ccd979bdSMark Fasheh num_recs = le16_to_cpu(tl->tl_used); 2873b0697053SMark Fasheh mlog(0, "cleanup %u records from %llu\n", num_recs, 28741ca1a111SMark Fasheh (unsigned long long)le64_to_cpu(tl_copy->i_blkno)); 2875ccd979bdSMark Fasheh 28761b1dcc1bSJes Sorensen mutex_lock(&tl_inode->i_mutex); 2877ccd979bdSMark Fasheh for(i = 0; i < num_recs; i++) { 2878ccd979bdSMark Fasheh if (ocfs2_truncate_log_needs_flush(osb)) { 2879ccd979bdSMark Fasheh status = __ocfs2_flush_truncate_log(osb); 2880ccd979bdSMark Fasheh if (status < 0) { 2881ccd979bdSMark Fasheh mlog_errno(status); 2882ccd979bdSMark Fasheh goto bail_up; 2883ccd979bdSMark Fasheh } 2884ccd979bdSMark Fasheh } 2885ccd979bdSMark Fasheh 288665eff9ccSMark Fasheh handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 2887ccd979bdSMark Fasheh if (IS_ERR(handle)) { 2888ccd979bdSMark Fasheh status = PTR_ERR(handle); 2889ccd979bdSMark Fasheh mlog_errno(status); 2890ccd979bdSMark Fasheh goto bail_up; 2891ccd979bdSMark Fasheh } 2892ccd979bdSMark Fasheh 2893ccd979bdSMark Fasheh clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); 2894ccd979bdSMark Fasheh start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); 2895ccd979bdSMark Fasheh start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); 2896ccd979bdSMark Fasheh 2897ccd979bdSMark Fasheh status = ocfs2_truncate_log_append(osb, handle, 2898ccd979bdSMark Fasheh start_blk, clusters); 289902dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 2900ccd979bdSMark Fasheh if (status < 0) { 2901ccd979bdSMark Fasheh mlog_errno(status); 2902ccd979bdSMark Fasheh goto bail_up; 2903ccd979bdSMark Fasheh } 2904ccd979bdSMark Fasheh } 2905ccd979bdSMark Fasheh 2906ccd979bdSMark Fasheh bail_up: 29071b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 2908ccd979bdSMark Fasheh 2909ccd979bdSMark Fasheh mlog_exit(status); 2910ccd979bdSMark Fasheh return status; 2911ccd979bdSMark Fasheh } 2912ccd979bdSMark Fasheh 2913ccd979bdSMark Fasheh void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) 2914ccd979bdSMark Fasheh { 2915ccd979bdSMark Fasheh int status; 2916ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2917ccd979bdSMark Fasheh 2918ccd979bdSMark Fasheh mlog_entry_void(); 2919ccd979bdSMark Fasheh 2920ccd979bdSMark Fasheh if (tl_inode) { 2921ccd979bdSMark Fasheh cancel_delayed_work(&osb->osb_truncate_log_wq); 2922ccd979bdSMark Fasheh flush_workqueue(ocfs2_wq); 2923ccd979bdSMark Fasheh 2924ccd979bdSMark Fasheh status = ocfs2_flush_truncate_log(osb); 2925ccd979bdSMark Fasheh if (status < 0) 2926ccd979bdSMark Fasheh mlog_errno(status); 2927ccd979bdSMark Fasheh 2928ccd979bdSMark Fasheh brelse(osb->osb_tl_bh); 2929ccd979bdSMark Fasheh iput(osb->osb_tl_inode); 2930ccd979bdSMark Fasheh } 2931ccd979bdSMark Fasheh 2932ccd979bdSMark Fasheh mlog_exit_void(); 2933ccd979bdSMark Fasheh } 2934ccd979bdSMark Fasheh 2935ccd979bdSMark Fasheh int ocfs2_truncate_log_init(struct ocfs2_super *osb) 2936ccd979bdSMark Fasheh { 2937ccd979bdSMark Fasheh int status; 2938ccd979bdSMark Fasheh struct inode *tl_inode = NULL; 2939ccd979bdSMark Fasheh struct buffer_head *tl_bh = NULL; 2940ccd979bdSMark Fasheh 2941ccd979bdSMark Fasheh mlog_entry_void(); 2942ccd979bdSMark Fasheh 2943ccd979bdSMark Fasheh status = ocfs2_get_truncate_log_info(osb, 2944ccd979bdSMark Fasheh osb->slot_num, 2945ccd979bdSMark Fasheh &tl_inode, 2946ccd979bdSMark Fasheh &tl_bh); 2947ccd979bdSMark Fasheh if (status < 0) 2948ccd979bdSMark Fasheh mlog_errno(status); 2949ccd979bdSMark Fasheh 2950ccd979bdSMark Fasheh /* ocfs2_truncate_log_shutdown keys on the existence of 2951ccd979bdSMark Fasheh * osb->osb_tl_inode so we don't set any of the osb variables 2952ccd979bdSMark Fasheh * until we're sure all is well. */ 2953c4028958SDavid Howells INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, 2954c4028958SDavid Howells ocfs2_truncate_log_worker); 2955ccd979bdSMark Fasheh osb->osb_tl_bh = tl_bh; 2956ccd979bdSMark Fasheh osb->osb_tl_inode = tl_inode; 2957ccd979bdSMark Fasheh 2958ccd979bdSMark Fasheh mlog_exit(status); 2959ccd979bdSMark Fasheh return status; 2960ccd979bdSMark Fasheh } 2961ccd979bdSMark Fasheh 29622b604351SMark Fasheh /* 29632b604351SMark Fasheh * Delayed de-allocation of suballocator blocks. 29642b604351SMark Fasheh * 29652b604351SMark Fasheh * Some sets of block de-allocations might involve multiple suballocator inodes. 29662b604351SMark Fasheh * 29672b604351SMark Fasheh * The locking for this can get extremely complicated, especially when 29682b604351SMark Fasheh * the suballocator inodes to delete from aren't known until deep 29692b604351SMark Fasheh * within an unrelated codepath. 29702b604351SMark Fasheh * 29712b604351SMark Fasheh * ocfs2_extent_block structures are a good example of this - an inode 29722b604351SMark Fasheh * btree could have been grown by any number of nodes each allocating 29732b604351SMark Fasheh * out of their own suballoc inode. 29742b604351SMark Fasheh * 29752b604351SMark Fasheh * These structures allow the delay of block de-allocation until a 29762b604351SMark Fasheh * later time, when locking of multiple cluster inodes won't cause 29772b604351SMark Fasheh * deadlock. 29782b604351SMark Fasheh */ 29792b604351SMark Fasheh 29802b604351SMark Fasheh /* 29812b604351SMark Fasheh * Describes a single block free from a suballocator 29822b604351SMark Fasheh */ 29832b604351SMark Fasheh struct ocfs2_cached_block_free { 29842b604351SMark Fasheh struct ocfs2_cached_block_free *free_next; 29852b604351SMark Fasheh u64 free_blk; 29862b604351SMark Fasheh unsigned int free_bit; 29872b604351SMark Fasheh }; 29882b604351SMark Fasheh 29892b604351SMark Fasheh struct ocfs2_per_slot_free_list { 29902b604351SMark Fasheh struct ocfs2_per_slot_free_list *f_next_suballocator; 29912b604351SMark Fasheh int f_inode_type; 29922b604351SMark Fasheh int f_slot; 29932b604351SMark Fasheh struct ocfs2_cached_block_free *f_first; 29942b604351SMark Fasheh }; 29952b604351SMark Fasheh 29962b604351SMark Fasheh static int ocfs2_free_cached_items(struct ocfs2_super *osb, 29972b604351SMark Fasheh int sysfile_type, 29982b604351SMark Fasheh int slot, 29992b604351SMark Fasheh struct ocfs2_cached_block_free *head) 30002b604351SMark Fasheh { 30012b604351SMark Fasheh int ret; 30022b604351SMark Fasheh u64 bg_blkno; 30032b604351SMark Fasheh handle_t *handle; 30042b604351SMark Fasheh struct inode *inode; 30052b604351SMark Fasheh struct buffer_head *di_bh = NULL; 30062b604351SMark Fasheh struct ocfs2_cached_block_free *tmp; 30072b604351SMark Fasheh 30082b604351SMark Fasheh inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot); 30092b604351SMark Fasheh if (!inode) { 30102b604351SMark Fasheh ret = -EINVAL; 30112b604351SMark Fasheh mlog_errno(ret); 30122b604351SMark Fasheh goto out; 30132b604351SMark Fasheh } 30142b604351SMark Fasheh 30152b604351SMark Fasheh mutex_lock(&inode->i_mutex); 30162b604351SMark Fasheh 30172b604351SMark Fasheh ret = ocfs2_meta_lock(inode, &di_bh, 1); 30182b604351SMark Fasheh if (ret) { 30192b604351SMark Fasheh mlog_errno(ret); 30202b604351SMark Fasheh goto out_mutex; 30212b604351SMark Fasheh } 30222b604351SMark Fasheh 30232b604351SMark Fasheh handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); 30242b604351SMark Fasheh if (IS_ERR(handle)) { 30252b604351SMark Fasheh ret = PTR_ERR(handle); 30262b604351SMark Fasheh mlog_errno(ret); 30272b604351SMark Fasheh goto out_unlock; 30282b604351SMark Fasheh } 30292b604351SMark Fasheh 30302b604351SMark Fasheh while (head) { 30312b604351SMark Fasheh bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 30322b604351SMark Fasheh head->free_bit); 30332b604351SMark Fasheh mlog(0, "Free bit: (bit %u, blkno %llu)\n", 30342b604351SMark Fasheh head->free_bit, (unsigned long long)head->free_blk); 30352b604351SMark Fasheh 30362b604351SMark Fasheh ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, 30372b604351SMark Fasheh head->free_bit, bg_blkno, 1); 30382b604351SMark Fasheh if (ret) { 30392b604351SMark Fasheh mlog_errno(ret); 30402b604351SMark Fasheh goto out_journal; 30412b604351SMark Fasheh } 30422b604351SMark Fasheh 30432b604351SMark Fasheh ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); 30442b604351SMark Fasheh if (ret) { 30452b604351SMark Fasheh mlog_errno(ret); 30462b604351SMark Fasheh goto out_journal; 30472b604351SMark Fasheh } 30482b604351SMark Fasheh 30492b604351SMark Fasheh tmp = head; 30502b604351SMark Fasheh head = head->free_next; 30512b604351SMark Fasheh kfree(tmp); 30522b604351SMark Fasheh } 30532b604351SMark Fasheh 30542b604351SMark Fasheh out_journal: 30552b604351SMark Fasheh ocfs2_commit_trans(osb, handle); 30562b604351SMark Fasheh 30572b604351SMark Fasheh out_unlock: 30582b604351SMark Fasheh ocfs2_meta_unlock(inode, 1); 30592b604351SMark Fasheh brelse(di_bh); 30602b604351SMark Fasheh out_mutex: 30612b604351SMark Fasheh mutex_unlock(&inode->i_mutex); 30622b604351SMark Fasheh iput(inode); 30632b604351SMark Fasheh out: 30642b604351SMark Fasheh while(head) { 30652b604351SMark Fasheh /* Premature exit may have left some dangling items. */ 30662b604351SMark Fasheh tmp = head; 30672b604351SMark Fasheh head = head->free_next; 30682b604351SMark Fasheh kfree(tmp); 30692b604351SMark Fasheh } 30702b604351SMark Fasheh 30712b604351SMark Fasheh return ret; 30722b604351SMark Fasheh } 30732b604351SMark Fasheh 30742b604351SMark Fasheh int ocfs2_run_deallocs(struct ocfs2_super *osb, 30752b604351SMark Fasheh struct ocfs2_cached_dealloc_ctxt *ctxt) 30762b604351SMark Fasheh { 30772b604351SMark Fasheh int ret = 0, ret2; 30782b604351SMark Fasheh struct ocfs2_per_slot_free_list *fl; 30792b604351SMark Fasheh 30802b604351SMark Fasheh if (!ctxt) 30812b604351SMark Fasheh return 0; 30822b604351SMark Fasheh 30832b604351SMark Fasheh while (ctxt->c_first_suballocator) { 30842b604351SMark Fasheh fl = ctxt->c_first_suballocator; 30852b604351SMark Fasheh 30862b604351SMark Fasheh if (fl->f_first) { 30872b604351SMark Fasheh mlog(0, "Free items: (type %u, slot %d)\n", 30882b604351SMark Fasheh fl->f_inode_type, fl->f_slot); 30892b604351SMark Fasheh ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, 30902b604351SMark Fasheh fl->f_slot, fl->f_first); 30912b604351SMark Fasheh if (ret2) 30922b604351SMark Fasheh mlog_errno(ret2); 30932b604351SMark Fasheh if (!ret) 30942b604351SMark Fasheh ret = ret2; 30952b604351SMark Fasheh } 30962b604351SMark Fasheh 30972b604351SMark Fasheh ctxt->c_first_suballocator = fl->f_next_suballocator; 30982b604351SMark Fasheh kfree(fl); 30992b604351SMark Fasheh } 31002b604351SMark Fasheh 31012b604351SMark Fasheh return ret; 31022b604351SMark Fasheh } 31032b604351SMark Fasheh 31042b604351SMark Fasheh static struct ocfs2_per_slot_free_list * 31052b604351SMark Fasheh ocfs2_find_per_slot_free_list(int type, 31062b604351SMark Fasheh int slot, 31072b604351SMark Fasheh struct ocfs2_cached_dealloc_ctxt *ctxt) 31082b604351SMark Fasheh { 31092b604351SMark Fasheh struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; 31102b604351SMark Fasheh 31112b604351SMark Fasheh while (fl) { 31122b604351SMark Fasheh if (fl->f_inode_type == type && fl->f_slot == slot) 31132b604351SMark Fasheh return fl; 31142b604351SMark Fasheh 31152b604351SMark Fasheh fl = fl->f_next_suballocator; 31162b604351SMark Fasheh } 31172b604351SMark Fasheh 31182b604351SMark Fasheh fl = kmalloc(sizeof(*fl), GFP_NOFS); 31192b604351SMark Fasheh if (fl) { 31202b604351SMark Fasheh fl->f_inode_type = type; 31212b604351SMark Fasheh fl->f_slot = slot; 31222b604351SMark Fasheh fl->f_first = NULL; 31232b604351SMark Fasheh fl->f_next_suballocator = ctxt->c_first_suballocator; 31242b604351SMark Fasheh 31252b604351SMark Fasheh ctxt->c_first_suballocator = fl; 31262b604351SMark Fasheh } 31272b604351SMark Fasheh return fl; 31282b604351SMark Fasheh } 31292b604351SMark Fasheh 31302b604351SMark Fasheh static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 31312b604351SMark Fasheh int type, int slot, u64 blkno, 31322b604351SMark Fasheh unsigned int bit) 31332b604351SMark Fasheh { 31342b604351SMark Fasheh int ret; 31352b604351SMark Fasheh struct ocfs2_per_slot_free_list *fl; 31362b604351SMark Fasheh struct ocfs2_cached_block_free *item; 31372b604351SMark Fasheh 31382b604351SMark Fasheh fl = ocfs2_find_per_slot_free_list(type, slot, ctxt); 31392b604351SMark Fasheh if (fl == NULL) { 31402b604351SMark Fasheh ret = -ENOMEM; 31412b604351SMark Fasheh mlog_errno(ret); 31422b604351SMark Fasheh goto out; 31432b604351SMark Fasheh } 31442b604351SMark Fasheh 31452b604351SMark Fasheh item = kmalloc(sizeof(*item), GFP_NOFS); 31462b604351SMark Fasheh if (item == NULL) { 31472b604351SMark Fasheh ret = -ENOMEM; 31482b604351SMark Fasheh mlog_errno(ret); 31492b604351SMark Fasheh goto out; 31502b604351SMark Fasheh } 31512b604351SMark Fasheh 31522b604351SMark Fasheh mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 31532b604351SMark Fasheh type, slot, bit, (unsigned long long)blkno); 31542b604351SMark Fasheh 31552b604351SMark Fasheh item->free_blk = blkno; 31562b604351SMark Fasheh item->free_bit = bit; 31572b604351SMark Fasheh item->free_next = fl->f_first; 31582b604351SMark Fasheh 31592b604351SMark Fasheh fl->f_first = item; 31602b604351SMark Fasheh 31612b604351SMark Fasheh ret = 0; 31622b604351SMark Fasheh out: 31632b604351SMark Fasheh return ret; 31642b604351SMark Fasheh } 31652b604351SMark Fasheh 3166*59a5e416SMark Fasheh static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 3167*59a5e416SMark Fasheh struct ocfs2_extent_block *eb) 3168*59a5e416SMark Fasheh { 3169*59a5e416SMark Fasheh return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 3170*59a5e416SMark Fasheh le16_to_cpu(eb->h_suballoc_slot), 3171*59a5e416SMark Fasheh le64_to_cpu(eb->h_blkno), 3172*59a5e416SMark Fasheh le16_to_cpu(eb->h_suballoc_bit)); 3173*59a5e416SMark Fasheh } 3174*59a5e416SMark Fasheh 3175ccd979bdSMark Fasheh /* This function will figure out whether the currently last extent 3176ccd979bdSMark Fasheh * block will be deleted, and if it will, what the new last extent 3177ccd979bdSMark Fasheh * block will be so we can update his h_next_leaf_blk field, as well 3178ccd979bdSMark Fasheh * as the dinodes i_last_eb_blk */ 3179dcd0538fSMark Fasheh static int ocfs2_find_new_last_ext_blk(struct inode *inode, 31803a0782d0SMark Fasheh unsigned int clusters_to_del, 3181dcd0538fSMark Fasheh struct ocfs2_path *path, 3182ccd979bdSMark Fasheh struct buffer_head **new_last_eb) 3183ccd979bdSMark Fasheh { 31843a0782d0SMark Fasheh int next_free, ret = 0; 3185dcd0538fSMark Fasheh u32 cpos; 31863a0782d0SMark Fasheh struct ocfs2_extent_rec *rec; 3187ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 3188ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 3189ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 3190ccd979bdSMark Fasheh 3191ccd979bdSMark Fasheh *new_last_eb = NULL; 3192ccd979bdSMark Fasheh 3193ccd979bdSMark Fasheh /* we have no tree, so of course, no last_eb. */ 3194dcd0538fSMark Fasheh if (!path->p_tree_depth) 3195dcd0538fSMark Fasheh goto out; 3196ccd979bdSMark Fasheh 3197ccd979bdSMark Fasheh /* trunc to zero special case - this makes tree_depth = 0 3198ccd979bdSMark Fasheh * regardless of what it is. */ 31993a0782d0SMark Fasheh if (OCFS2_I(inode)->ip_clusters == clusters_to_del) 3200dcd0538fSMark Fasheh goto out; 3201ccd979bdSMark Fasheh 3202dcd0538fSMark Fasheh el = path_leaf_el(path); 3203ccd979bdSMark Fasheh BUG_ON(!el->l_next_free_rec); 3204ccd979bdSMark Fasheh 32053a0782d0SMark Fasheh /* 32063a0782d0SMark Fasheh * Make sure that this extent list will actually be empty 32073a0782d0SMark Fasheh * after we clear away the data. We can shortcut out if 32083a0782d0SMark Fasheh * there's more than one non-empty extent in the 32093a0782d0SMark Fasheh * list. Otherwise, a check of the remaining extent is 32103a0782d0SMark Fasheh * necessary. 32113a0782d0SMark Fasheh */ 32123a0782d0SMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 32133a0782d0SMark Fasheh rec = NULL; 3214dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) { 32153a0782d0SMark Fasheh if (next_free > 2) 3216dcd0538fSMark Fasheh goto out; 32173a0782d0SMark Fasheh 32183a0782d0SMark Fasheh /* We may have a valid extent in index 1, check it. */ 32193a0782d0SMark Fasheh if (next_free == 2) 32203a0782d0SMark Fasheh rec = &el->l_recs[1]; 32213a0782d0SMark Fasheh 32223a0782d0SMark Fasheh /* 32233a0782d0SMark Fasheh * Fall through - no more nonempty extents, so we want 32243a0782d0SMark Fasheh * to delete this leaf. 32253a0782d0SMark Fasheh */ 32263a0782d0SMark Fasheh } else { 32273a0782d0SMark Fasheh if (next_free > 1) 3228dcd0538fSMark Fasheh goto out; 3229ccd979bdSMark Fasheh 32303a0782d0SMark Fasheh rec = &el->l_recs[0]; 32313a0782d0SMark Fasheh } 32323a0782d0SMark Fasheh 32333a0782d0SMark Fasheh if (rec) { 32343a0782d0SMark Fasheh /* 32353a0782d0SMark Fasheh * Check it we'll only be trimming off the end of this 32363a0782d0SMark Fasheh * cluster. 32373a0782d0SMark Fasheh */ 3238e48edee2SMark Fasheh if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) 32393a0782d0SMark Fasheh goto out; 32403a0782d0SMark Fasheh } 32413a0782d0SMark Fasheh 3242dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); 3243dcd0538fSMark Fasheh if (ret) { 3244dcd0538fSMark Fasheh mlog_errno(ret); 3245dcd0538fSMark Fasheh goto out; 3246ccd979bdSMark Fasheh } 3247ccd979bdSMark Fasheh 3248dcd0538fSMark Fasheh ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); 3249dcd0538fSMark Fasheh if (ret) { 3250dcd0538fSMark Fasheh mlog_errno(ret); 3251dcd0538fSMark Fasheh goto out; 3252ccd979bdSMark Fasheh } 3253dcd0538fSMark Fasheh 3254ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 3255ccd979bdSMark Fasheh el = &eb->h_list; 3256ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3257ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3258dcd0538fSMark Fasheh ret = -EROFS; 3259dcd0538fSMark Fasheh goto out; 3260ccd979bdSMark Fasheh } 3261ccd979bdSMark Fasheh 3262ccd979bdSMark Fasheh *new_last_eb = bh; 3263ccd979bdSMark Fasheh get_bh(*new_last_eb); 3264dcd0538fSMark Fasheh mlog(0, "returning block %llu, (cpos: %u)\n", 3265dcd0538fSMark Fasheh (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); 3266dcd0538fSMark Fasheh out: 3267ccd979bdSMark Fasheh brelse(bh); 3268ccd979bdSMark Fasheh 3269dcd0538fSMark Fasheh return ret; 3270ccd979bdSMark Fasheh } 3271ccd979bdSMark Fasheh 32723a0782d0SMark Fasheh /* 32733a0782d0SMark Fasheh * Trim some clusters off the rightmost edge of a tree. Only called 32743a0782d0SMark Fasheh * during truncate. 32753a0782d0SMark Fasheh * 32763a0782d0SMark Fasheh * The caller needs to: 32773a0782d0SMark Fasheh * - start journaling of each path component. 32783a0782d0SMark Fasheh * - compute and fully set up any new last ext block 32793a0782d0SMark Fasheh */ 32803a0782d0SMark Fasheh static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, 32813a0782d0SMark Fasheh handle_t *handle, struct ocfs2_truncate_context *tc, 32823a0782d0SMark Fasheh u32 clusters_to_del, u64 *delete_start) 32833a0782d0SMark Fasheh { 32843a0782d0SMark Fasheh int ret, i, index = path->p_tree_depth; 32853a0782d0SMark Fasheh u32 new_edge = 0; 32863a0782d0SMark Fasheh u64 deleted_eb = 0; 32873a0782d0SMark Fasheh struct buffer_head *bh; 32883a0782d0SMark Fasheh struct ocfs2_extent_list *el; 32893a0782d0SMark Fasheh struct ocfs2_extent_rec *rec; 32903a0782d0SMark Fasheh 32913a0782d0SMark Fasheh *delete_start = 0; 32923a0782d0SMark Fasheh 32933a0782d0SMark Fasheh while (index >= 0) { 32943a0782d0SMark Fasheh bh = path->p_node[index].bh; 32953a0782d0SMark Fasheh el = path->p_node[index].el; 32963a0782d0SMark Fasheh 32973a0782d0SMark Fasheh mlog(0, "traveling tree (index = %d, block = %llu)\n", 32983a0782d0SMark Fasheh index, (unsigned long long)bh->b_blocknr); 32993a0782d0SMark Fasheh 33003a0782d0SMark Fasheh BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 33013a0782d0SMark Fasheh 33023a0782d0SMark Fasheh if (index != 33033a0782d0SMark Fasheh (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { 33043a0782d0SMark Fasheh ocfs2_error(inode->i_sb, 33053a0782d0SMark Fasheh "Inode %lu has invalid ext. block %llu", 33063a0782d0SMark Fasheh inode->i_ino, 33073a0782d0SMark Fasheh (unsigned long long)bh->b_blocknr); 33083a0782d0SMark Fasheh ret = -EROFS; 33093a0782d0SMark Fasheh goto out; 33103a0782d0SMark Fasheh } 33113a0782d0SMark Fasheh 33123a0782d0SMark Fasheh find_tail_record: 33133a0782d0SMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 33143a0782d0SMark Fasheh rec = &el->l_recs[i]; 33153a0782d0SMark Fasheh 33163a0782d0SMark Fasheh mlog(0, "Extent list before: record %d: (%u, %u, %llu), " 33173a0782d0SMark Fasheh "next = %u\n", i, le32_to_cpu(rec->e_cpos), 3318e48edee2SMark Fasheh ocfs2_rec_clusters(el, rec), 33193a0782d0SMark Fasheh (unsigned long long)le64_to_cpu(rec->e_blkno), 33203a0782d0SMark Fasheh le16_to_cpu(el->l_next_free_rec)); 33213a0782d0SMark Fasheh 3322e48edee2SMark Fasheh BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); 33233a0782d0SMark Fasheh 33243a0782d0SMark Fasheh if (le16_to_cpu(el->l_tree_depth) == 0) { 33253a0782d0SMark Fasheh /* 33263a0782d0SMark Fasheh * If the leaf block contains a single empty 33273a0782d0SMark Fasheh * extent and no records, we can just remove 33283a0782d0SMark Fasheh * the block. 33293a0782d0SMark Fasheh */ 33303a0782d0SMark Fasheh if (i == 0 && ocfs2_is_empty_extent(rec)) { 33313a0782d0SMark Fasheh memset(rec, 0, 33323a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 33333a0782d0SMark Fasheh el->l_next_free_rec = cpu_to_le16(0); 33343a0782d0SMark Fasheh 33353a0782d0SMark Fasheh goto delete; 33363a0782d0SMark Fasheh } 33373a0782d0SMark Fasheh 33383a0782d0SMark Fasheh /* 33393a0782d0SMark Fasheh * Remove any empty extents by shifting things 33403a0782d0SMark Fasheh * left. That should make life much easier on 33413a0782d0SMark Fasheh * the code below. This condition is rare 33423a0782d0SMark Fasheh * enough that we shouldn't see a performance 33433a0782d0SMark Fasheh * hit. 33443a0782d0SMark Fasheh */ 33453a0782d0SMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) { 33463a0782d0SMark Fasheh le16_add_cpu(&el->l_next_free_rec, -1); 33473a0782d0SMark Fasheh 33483a0782d0SMark Fasheh for(i = 0; 33493a0782d0SMark Fasheh i < le16_to_cpu(el->l_next_free_rec); i++) 33503a0782d0SMark Fasheh el->l_recs[i] = el->l_recs[i + 1]; 33513a0782d0SMark Fasheh 33523a0782d0SMark Fasheh memset(&el->l_recs[i], 0, 33533a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 33543a0782d0SMark Fasheh 33553a0782d0SMark Fasheh /* 33563a0782d0SMark Fasheh * We've modified our extent list. The 33573a0782d0SMark Fasheh * simplest way to handle this change 33583a0782d0SMark Fasheh * is to being the search from the 33593a0782d0SMark Fasheh * start again. 33603a0782d0SMark Fasheh */ 33613a0782d0SMark Fasheh goto find_tail_record; 33623a0782d0SMark Fasheh } 33633a0782d0SMark Fasheh 3364e48edee2SMark Fasheh le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); 33653a0782d0SMark Fasheh 33663a0782d0SMark Fasheh /* 33673a0782d0SMark Fasheh * We'll use "new_edge" on our way back up the 33683a0782d0SMark Fasheh * tree to know what our rightmost cpos is. 33693a0782d0SMark Fasheh */ 3370e48edee2SMark Fasheh new_edge = le16_to_cpu(rec->e_leaf_clusters); 33713a0782d0SMark Fasheh new_edge += le32_to_cpu(rec->e_cpos); 33723a0782d0SMark Fasheh 33733a0782d0SMark Fasheh /* 33743a0782d0SMark Fasheh * The caller will use this to delete data blocks. 33753a0782d0SMark Fasheh */ 33763a0782d0SMark Fasheh *delete_start = le64_to_cpu(rec->e_blkno) 33773a0782d0SMark Fasheh + ocfs2_clusters_to_blocks(inode->i_sb, 3378e48edee2SMark Fasheh le16_to_cpu(rec->e_leaf_clusters)); 33793a0782d0SMark Fasheh 33803a0782d0SMark Fasheh /* 33813a0782d0SMark Fasheh * If it's now empty, remove this record. 33823a0782d0SMark Fasheh */ 3383e48edee2SMark Fasheh if (le16_to_cpu(rec->e_leaf_clusters) == 0) { 33843a0782d0SMark Fasheh memset(rec, 0, 33853a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 33863a0782d0SMark Fasheh le16_add_cpu(&el->l_next_free_rec, -1); 33873a0782d0SMark Fasheh } 33883a0782d0SMark Fasheh } else { 33893a0782d0SMark Fasheh if (le64_to_cpu(rec->e_blkno) == deleted_eb) { 33903a0782d0SMark Fasheh memset(rec, 0, 33913a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 33923a0782d0SMark Fasheh le16_add_cpu(&el->l_next_free_rec, -1); 33933a0782d0SMark Fasheh 33943a0782d0SMark Fasheh goto delete; 33953a0782d0SMark Fasheh } 33963a0782d0SMark Fasheh 33973a0782d0SMark Fasheh /* Can this actually happen? */ 33983a0782d0SMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) 33993a0782d0SMark Fasheh goto delete; 34003a0782d0SMark Fasheh 34013a0782d0SMark Fasheh /* 34023a0782d0SMark Fasheh * We never actually deleted any clusters 34033a0782d0SMark Fasheh * because our leaf was empty. There's no 34043a0782d0SMark Fasheh * reason to adjust the rightmost edge then. 34053a0782d0SMark Fasheh */ 34063a0782d0SMark Fasheh if (new_edge == 0) 34073a0782d0SMark Fasheh goto delete; 34083a0782d0SMark Fasheh 3409e48edee2SMark Fasheh rec->e_int_clusters = cpu_to_le32(new_edge); 3410e48edee2SMark Fasheh le32_add_cpu(&rec->e_int_clusters, 34113a0782d0SMark Fasheh -le32_to_cpu(rec->e_cpos)); 34123a0782d0SMark Fasheh 34133a0782d0SMark Fasheh /* 34143a0782d0SMark Fasheh * A deleted child record should have been 34153a0782d0SMark Fasheh * caught above. 34163a0782d0SMark Fasheh */ 3417e48edee2SMark Fasheh BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); 34183a0782d0SMark Fasheh } 34193a0782d0SMark Fasheh 34203a0782d0SMark Fasheh delete: 34213a0782d0SMark Fasheh ret = ocfs2_journal_dirty(handle, bh); 34223a0782d0SMark Fasheh if (ret) { 34233a0782d0SMark Fasheh mlog_errno(ret); 34243a0782d0SMark Fasheh goto out; 34253a0782d0SMark Fasheh } 34263a0782d0SMark Fasheh 34273a0782d0SMark Fasheh mlog(0, "extent list container %llu, after: record %d: " 34283a0782d0SMark Fasheh "(%u, %u, %llu), next = %u.\n", 34293a0782d0SMark Fasheh (unsigned long long)bh->b_blocknr, i, 3430e48edee2SMark Fasheh le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), 34313a0782d0SMark Fasheh (unsigned long long)le64_to_cpu(rec->e_blkno), 34323a0782d0SMark Fasheh le16_to_cpu(el->l_next_free_rec)); 34333a0782d0SMark Fasheh 34343a0782d0SMark Fasheh /* 34353a0782d0SMark Fasheh * We must be careful to only attempt delete of an 34363a0782d0SMark Fasheh * extent block (and not the root inode block). 34373a0782d0SMark Fasheh */ 34383a0782d0SMark Fasheh if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { 34393a0782d0SMark Fasheh struct ocfs2_extent_block *eb = 34403a0782d0SMark Fasheh (struct ocfs2_extent_block *)bh->b_data; 34413a0782d0SMark Fasheh 34423a0782d0SMark Fasheh /* 34433a0782d0SMark Fasheh * Save this for use when processing the 34443a0782d0SMark Fasheh * parent block. 34453a0782d0SMark Fasheh */ 34463a0782d0SMark Fasheh deleted_eb = le64_to_cpu(eb->h_blkno); 34473a0782d0SMark Fasheh 34483a0782d0SMark Fasheh mlog(0, "deleting this extent block.\n"); 34493a0782d0SMark Fasheh 34503a0782d0SMark Fasheh ocfs2_remove_from_cache(inode, bh); 34513a0782d0SMark Fasheh 3452e48edee2SMark Fasheh BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); 34533a0782d0SMark Fasheh BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 34543a0782d0SMark Fasheh BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); 34553a0782d0SMark Fasheh 3456*59a5e416SMark Fasheh ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb); 34573a0782d0SMark Fasheh /* An error here is not fatal. */ 34583a0782d0SMark Fasheh if (ret < 0) 34593a0782d0SMark Fasheh mlog_errno(ret); 34603a0782d0SMark Fasheh } else { 34613a0782d0SMark Fasheh deleted_eb = 0; 34623a0782d0SMark Fasheh } 34633a0782d0SMark Fasheh 34643a0782d0SMark Fasheh index--; 34653a0782d0SMark Fasheh } 34663a0782d0SMark Fasheh 34673a0782d0SMark Fasheh ret = 0; 34683a0782d0SMark Fasheh out: 34693a0782d0SMark Fasheh return ret; 34703a0782d0SMark Fasheh } 34713a0782d0SMark Fasheh 3472ccd979bdSMark Fasheh static int ocfs2_do_truncate(struct ocfs2_super *osb, 3473ccd979bdSMark Fasheh unsigned int clusters_to_del, 3474ccd979bdSMark Fasheh struct inode *inode, 3475ccd979bdSMark Fasheh struct buffer_head *fe_bh, 34761fabe148SMark Fasheh handle_t *handle, 3477dcd0538fSMark Fasheh struct ocfs2_truncate_context *tc, 3478dcd0538fSMark Fasheh struct ocfs2_path *path) 3479ccd979bdSMark Fasheh { 34803a0782d0SMark Fasheh int status; 3481ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 3482ccd979bdSMark Fasheh struct ocfs2_extent_block *last_eb = NULL; 3483ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 3484ccd979bdSMark Fasheh struct buffer_head *last_eb_bh = NULL; 3485ccd979bdSMark Fasheh u64 delete_blk = 0; 3486ccd979bdSMark Fasheh 3487ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 3488ccd979bdSMark Fasheh 34893a0782d0SMark Fasheh status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, 3490dcd0538fSMark Fasheh path, &last_eb_bh); 3491ccd979bdSMark Fasheh if (status < 0) { 3492ccd979bdSMark Fasheh mlog_errno(status); 3493ccd979bdSMark Fasheh goto bail; 3494ccd979bdSMark Fasheh } 3495ccd979bdSMark Fasheh 3496dcd0538fSMark Fasheh /* 3497dcd0538fSMark Fasheh * Each component will be touched, so we might as well journal 3498dcd0538fSMark Fasheh * here to avoid having to handle errors later. 3499dcd0538fSMark Fasheh */ 35003a0782d0SMark Fasheh status = ocfs2_journal_access_path(inode, handle, path); 3501ccd979bdSMark Fasheh if (status < 0) { 3502ccd979bdSMark Fasheh mlog_errno(status); 3503ccd979bdSMark Fasheh goto bail; 3504ccd979bdSMark Fasheh } 3505dcd0538fSMark Fasheh 3506dcd0538fSMark Fasheh if (last_eb_bh) { 3507dcd0538fSMark Fasheh status = ocfs2_journal_access(handle, inode, last_eb_bh, 3508dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 3509dcd0538fSMark Fasheh if (status < 0) { 3510dcd0538fSMark Fasheh mlog_errno(status); 3511dcd0538fSMark Fasheh goto bail; 3512dcd0538fSMark Fasheh } 3513dcd0538fSMark Fasheh 3514dcd0538fSMark Fasheh last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 3515dcd0538fSMark Fasheh } 3516dcd0538fSMark Fasheh 3517ccd979bdSMark Fasheh el = &(fe->id2.i_list); 3518ccd979bdSMark Fasheh 3519dcd0538fSMark Fasheh /* 3520dcd0538fSMark Fasheh * Lower levels depend on this never happening, but it's best 3521dcd0538fSMark Fasheh * to check it up here before changing the tree. 3522dcd0538fSMark Fasheh */ 3523e48edee2SMark Fasheh if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { 3524dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 3525dcd0538fSMark Fasheh "Inode %lu has an empty extent record, depth %u\n", 3526dcd0538fSMark Fasheh inode->i_ino, le16_to_cpu(el->l_tree_depth)); 35273a0782d0SMark Fasheh status = -EROFS; 3528dcd0538fSMark Fasheh goto bail; 3529dcd0538fSMark Fasheh } 3530dcd0538fSMark Fasheh 3531ccd979bdSMark Fasheh spin_lock(&OCFS2_I(inode)->ip_lock); 3532ccd979bdSMark Fasheh OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 3533ccd979bdSMark Fasheh clusters_to_del; 3534ccd979bdSMark Fasheh spin_unlock(&OCFS2_I(inode)->ip_lock); 3535ccd979bdSMark Fasheh le32_add_cpu(&fe->i_clusters, -clusters_to_del); 3536ccd979bdSMark Fasheh 35373a0782d0SMark Fasheh status = ocfs2_trim_tree(inode, path, handle, tc, 35383a0782d0SMark Fasheh clusters_to_del, &delete_blk); 35393a0782d0SMark Fasheh if (status) { 35403a0782d0SMark Fasheh mlog_errno(status); 35413a0782d0SMark Fasheh goto bail; 3542ccd979bdSMark Fasheh } 3543ccd979bdSMark Fasheh 3544dcd0538fSMark Fasheh if (le32_to_cpu(fe->i_clusters) == 0) { 3545ccd979bdSMark Fasheh /* trunc to zero is a special case. */ 3546ccd979bdSMark Fasheh el->l_tree_depth = 0; 3547ccd979bdSMark Fasheh fe->i_last_eb_blk = 0; 3548ccd979bdSMark Fasheh } else if (last_eb) 3549ccd979bdSMark Fasheh fe->i_last_eb_blk = last_eb->h_blkno; 3550ccd979bdSMark Fasheh 3551ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, fe_bh); 3552ccd979bdSMark Fasheh if (status < 0) { 3553ccd979bdSMark Fasheh mlog_errno(status); 3554ccd979bdSMark Fasheh goto bail; 3555ccd979bdSMark Fasheh } 3556ccd979bdSMark Fasheh 3557ccd979bdSMark Fasheh if (last_eb) { 3558ccd979bdSMark Fasheh /* If there will be a new last extent block, then by 3559ccd979bdSMark Fasheh * definition, there cannot be any leaves to the right of 3560ccd979bdSMark Fasheh * him. */ 3561ccd979bdSMark Fasheh last_eb->h_next_leaf_blk = 0; 3562ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, last_eb_bh); 3563ccd979bdSMark Fasheh if (status < 0) { 3564ccd979bdSMark Fasheh mlog_errno(status); 3565ccd979bdSMark Fasheh goto bail; 3566ccd979bdSMark Fasheh } 3567ccd979bdSMark Fasheh } 3568ccd979bdSMark Fasheh 35693a0782d0SMark Fasheh if (delete_blk) { 3570ccd979bdSMark Fasheh status = ocfs2_truncate_log_append(osb, handle, delete_blk, 3571ccd979bdSMark Fasheh clusters_to_del); 3572ccd979bdSMark Fasheh if (status < 0) { 3573ccd979bdSMark Fasheh mlog_errno(status); 3574ccd979bdSMark Fasheh goto bail; 3575ccd979bdSMark Fasheh } 35763a0782d0SMark Fasheh } 3577ccd979bdSMark Fasheh status = 0; 3578ccd979bdSMark Fasheh bail: 3579dcd0538fSMark Fasheh 3580ccd979bdSMark Fasheh mlog_exit(status); 3581ccd979bdSMark Fasheh return status; 3582ccd979bdSMark Fasheh } 3583ccd979bdSMark Fasheh 358460b11392SMark Fasheh static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) 358560b11392SMark Fasheh { 358660b11392SMark Fasheh set_buffer_uptodate(bh); 358760b11392SMark Fasheh mark_buffer_dirty(bh); 358860b11392SMark Fasheh return 0; 358960b11392SMark Fasheh } 359060b11392SMark Fasheh 359160b11392SMark Fasheh static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) 359260b11392SMark Fasheh { 359360b11392SMark Fasheh set_buffer_uptodate(bh); 359460b11392SMark Fasheh mark_buffer_dirty(bh); 359560b11392SMark Fasheh return ocfs2_journal_dirty_data(handle, bh); 359660b11392SMark Fasheh } 359760b11392SMark Fasheh 359860b11392SMark Fasheh static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, 359960b11392SMark Fasheh struct page **pages, int numpages, 360060b11392SMark Fasheh u64 phys, handle_t *handle) 360160b11392SMark Fasheh { 360260b11392SMark Fasheh int i, ret, partial = 0; 360360b11392SMark Fasheh void *kaddr; 360460b11392SMark Fasheh struct page *page; 360560b11392SMark Fasheh unsigned int from, to = PAGE_CACHE_SIZE; 360660b11392SMark Fasheh struct super_block *sb = inode->i_sb; 360760b11392SMark Fasheh 360860b11392SMark Fasheh BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 360960b11392SMark Fasheh 361060b11392SMark Fasheh if (numpages == 0) 361160b11392SMark Fasheh goto out; 361260b11392SMark Fasheh 361360b11392SMark Fasheh from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ 361460b11392SMark Fasheh if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) { 361560b11392SMark Fasheh /* 361660b11392SMark Fasheh * Since 'from' has been capped to a value below page 361760b11392SMark Fasheh * size, this calculation won't be able to overflow 361860b11392SMark Fasheh * 'to' 361960b11392SMark Fasheh */ 362060b11392SMark Fasheh to = ocfs2_align_bytes_to_clusters(sb, from); 362160b11392SMark Fasheh 362260b11392SMark Fasheh /* 362360b11392SMark Fasheh * The truncate tail in this case should never contain 362460b11392SMark Fasheh * more than one page at maximum. The loop below also 362560b11392SMark Fasheh * assumes this. 362660b11392SMark Fasheh */ 362760b11392SMark Fasheh BUG_ON(numpages != 1); 362860b11392SMark Fasheh } 362960b11392SMark Fasheh 363060b11392SMark Fasheh for(i = 0; i < numpages; i++) { 363160b11392SMark Fasheh page = pages[i]; 363260b11392SMark Fasheh 363360b11392SMark Fasheh BUG_ON(from > PAGE_CACHE_SIZE); 363460b11392SMark Fasheh BUG_ON(to > PAGE_CACHE_SIZE); 363560b11392SMark Fasheh 363660b11392SMark Fasheh ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0); 363760b11392SMark Fasheh if (ret) 363860b11392SMark Fasheh mlog_errno(ret); 363960b11392SMark Fasheh 364060b11392SMark Fasheh kaddr = kmap_atomic(page, KM_USER0); 364160b11392SMark Fasheh memset(kaddr + from, 0, to - from); 364260b11392SMark Fasheh kunmap_atomic(kaddr, KM_USER0); 364360b11392SMark Fasheh 364460b11392SMark Fasheh /* 364560b11392SMark Fasheh * Need to set the buffers we zero'd into uptodate 364660b11392SMark Fasheh * here if they aren't - ocfs2_map_page_blocks() 364760b11392SMark Fasheh * might've skipped some 364860b11392SMark Fasheh */ 364960b11392SMark Fasheh if (ocfs2_should_order_data(inode)) { 365060b11392SMark Fasheh ret = walk_page_buffers(handle, 365160b11392SMark Fasheh page_buffers(page), 365260b11392SMark Fasheh from, to, &partial, 365360b11392SMark Fasheh ocfs2_ordered_zero_func); 365460b11392SMark Fasheh if (ret < 0) 365560b11392SMark Fasheh mlog_errno(ret); 365660b11392SMark Fasheh } else { 365760b11392SMark Fasheh ret = walk_page_buffers(handle, page_buffers(page), 365860b11392SMark Fasheh from, to, &partial, 365960b11392SMark Fasheh ocfs2_writeback_zero_func); 366060b11392SMark Fasheh if (ret < 0) 366160b11392SMark Fasheh mlog_errno(ret); 366260b11392SMark Fasheh } 366360b11392SMark Fasheh 366460b11392SMark Fasheh if (!partial) 366560b11392SMark Fasheh SetPageUptodate(page); 366660b11392SMark Fasheh 366760b11392SMark Fasheh flush_dcache_page(page); 366860b11392SMark Fasheh 366960b11392SMark Fasheh /* 367060b11392SMark Fasheh * Every page after the 1st one should be completely zero'd. 367160b11392SMark Fasheh */ 367260b11392SMark Fasheh from = 0; 367360b11392SMark Fasheh } 367460b11392SMark Fasheh out: 367560b11392SMark Fasheh if (pages) { 367660b11392SMark Fasheh for (i = 0; i < numpages; i++) { 367760b11392SMark Fasheh page = pages[i]; 367860b11392SMark Fasheh unlock_page(page); 367960b11392SMark Fasheh mark_page_accessed(page); 368060b11392SMark Fasheh page_cache_release(page); 368160b11392SMark Fasheh } 368260b11392SMark Fasheh } 368360b11392SMark Fasheh } 368460b11392SMark Fasheh 368560b11392SMark Fasheh static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, 368660b11392SMark Fasheh int *num, u64 *phys) 368760b11392SMark Fasheh { 368860b11392SMark Fasheh int i, numpages = 0, ret = 0; 368960b11392SMark Fasheh unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize; 369049cb8d2dSMark Fasheh unsigned int ext_flags; 369160b11392SMark Fasheh struct super_block *sb = inode->i_sb; 369260b11392SMark Fasheh struct address_space *mapping = inode->i_mapping; 369360b11392SMark Fasheh unsigned long index; 369460b11392SMark Fasheh u64 next_cluster_bytes; 369560b11392SMark Fasheh 369660b11392SMark Fasheh BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 369760b11392SMark Fasheh 369860b11392SMark Fasheh /* Cluster boundary, so we don't need to grab any pages. */ 369960b11392SMark Fasheh if ((isize & (csize - 1)) == 0) 370060b11392SMark Fasheh goto out; 370160b11392SMark Fasheh 370260b11392SMark Fasheh ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, 370349cb8d2dSMark Fasheh phys, NULL, &ext_flags); 370460b11392SMark Fasheh if (ret) { 370560b11392SMark Fasheh mlog_errno(ret); 370660b11392SMark Fasheh goto out; 370760b11392SMark Fasheh } 370860b11392SMark Fasheh 370960b11392SMark Fasheh /* Tail is a hole. */ 371060b11392SMark Fasheh if (*phys == 0) 371160b11392SMark Fasheh goto out; 371260b11392SMark Fasheh 371349cb8d2dSMark Fasheh /* Tail is marked as unwritten, we can count on write to zero 371449cb8d2dSMark Fasheh * in that case. */ 371549cb8d2dSMark Fasheh if (ext_flags & OCFS2_EXT_UNWRITTEN) 371649cb8d2dSMark Fasheh goto out; 371749cb8d2dSMark Fasheh 371860b11392SMark Fasheh next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); 371960b11392SMark Fasheh index = isize >> PAGE_CACHE_SHIFT; 372060b11392SMark Fasheh do { 372160b11392SMark Fasheh pages[numpages] = grab_cache_page(mapping, index); 372260b11392SMark Fasheh if (!pages[numpages]) { 372360b11392SMark Fasheh ret = -ENOMEM; 372460b11392SMark Fasheh mlog_errno(ret); 372560b11392SMark Fasheh goto out; 372660b11392SMark Fasheh } 372760b11392SMark Fasheh 372860b11392SMark Fasheh numpages++; 372960b11392SMark Fasheh index++; 373060b11392SMark Fasheh } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); 373160b11392SMark Fasheh 373260b11392SMark Fasheh out: 373360b11392SMark Fasheh if (ret != 0) { 373460b11392SMark Fasheh if (pages) { 373560b11392SMark Fasheh for (i = 0; i < numpages; i++) { 373660b11392SMark Fasheh if (pages[i]) { 373760b11392SMark Fasheh unlock_page(pages[i]); 373860b11392SMark Fasheh page_cache_release(pages[i]); 373960b11392SMark Fasheh } 374060b11392SMark Fasheh } 374160b11392SMark Fasheh } 374260b11392SMark Fasheh numpages = 0; 374360b11392SMark Fasheh } 374460b11392SMark Fasheh 374560b11392SMark Fasheh *num = numpages; 374660b11392SMark Fasheh 374760b11392SMark Fasheh return ret; 374860b11392SMark Fasheh } 374960b11392SMark Fasheh 375060b11392SMark Fasheh /* 375160b11392SMark Fasheh * Zero the area past i_size but still within an allocated 375260b11392SMark Fasheh * cluster. This avoids exposing nonzero data on subsequent file 375360b11392SMark Fasheh * extends. 375460b11392SMark Fasheh * 375560b11392SMark Fasheh * We need to call this before i_size is updated on the inode because 375660b11392SMark Fasheh * otherwise block_write_full_page() will skip writeout of pages past 375760b11392SMark Fasheh * i_size. The new_i_size parameter is passed for this reason. 375860b11392SMark Fasheh */ 375960b11392SMark Fasheh int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 376060b11392SMark Fasheh u64 new_i_size) 376160b11392SMark Fasheh { 376260b11392SMark Fasheh int ret, numpages; 3763fa41045fSMark Fasheh loff_t endbyte; 376460b11392SMark Fasheh struct page **pages = NULL; 376560b11392SMark Fasheh u64 phys; 376660b11392SMark Fasheh 376760b11392SMark Fasheh /* 376860b11392SMark Fasheh * File systems which don't support sparse files zero on every 376960b11392SMark Fasheh * extend. 377060b11392SMark Fasheh */ 377160b11392SMark Fasheh if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 377260b11392SMark Fasheh return 0; 377360b11392SMark Fasheh 377460b11392SMark Fasheh pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb), 377560b11392SMark Fasheh sizeof(struct page *), GFP_NOFS); 377660b11392SMark Fasheh if (pages == NULL) { 377760b11392SMark Fasheh ret = -ENOMEM; 377860b11392SMark Fasheh mlog_errno(ret); 377960b11392SMark Fasheh goto out; 378060b11392SMark Fasheh } 378160b11392SMark Fasheh 378260b11392SMark Fasheh ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); 378360b11392SMark Fasheh if (ret) { 378460b11392SMark Fasheh mlog_errno(ret); 378560b11392SMark Fasheh goto out; 378660b11392SMark Fasheh } 378760b11392SMark Fasheh 378860b11392SMark Fasheh if (numpages == 0) 378960b11392SMark Fasheh goto out; 379060b11392SMark Fasheh 379160b11392SMark Fasheh ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, 379260b11392SMark Fasheh handle); 379360b11392SMark Fasheh 379460b11392SMark Fasheh /* 379560b11392SMark Fasheh * Initiate writeout of the pages we zero'd here. We don't 379660b11392SMark Fasheh * wait on them - the truncate_inode_pages() call later will 379760b11392SMark Fasheh * do that for us. 379860b11392SMark Fasheh */ 3799fa41045fSMark Fasheh endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 3800fa41045fSMark Fasheh ret = do_sync_mapping_range(inode->i_mapping, new_i_size, 3801fa41045fSMark Fasheh endbyte - 1, SYNC_FILE_RANGE_WRITE); 380260b11392SMark Fasheh if (ret) 380360b11392SMark Fasheh mlog_errno(ret); 380460b11392SMark Fasheh 380560b11392SMark Fasheh out: 380660b11392SMark Fasheh if (pages) 380760b11392SMark Fasheh kfree(pages); 380860b11392SMark Fasheh 380960b11392SMark Fasheh return ret; 381060b11392SMark Fasheh } 381160b11392SMark Fasheh 3812ccd979bdSMark Fasheh /* 3813ccd979bdSMark Fasheh * It is expected, that by the time you call this function, 3814ccd979bdSMark Fasheh * inode->i_size and fe->i_size have been adjusted. 3815ccd979bdSMark Fasheh * 3816ccd979bdSMark Fasheh * WARNING: This will kfree the truncate context 3817ccd979bdSMark Fasheh */ 3818ccd979bdSMark Fasheh int ocfs2_commit_truncate(struct ocfs2_super *osb, 3819ccd979bdSMark Fasheh struct inode *inode, 3820ccd979bdSMark Fasheh struct buffer_head *fe_bh, 3821ccd979bdSMark Fasheh struct ocfs2_truncate_context *tc) 3822ccd979bdSMark Fasheh { 3823ccd979bdSMark Fasheh int status, i, credits, tl_sem = 0; 3824dcd0538fSMark Fasheh u32 clusters_to_del, new_highest_cpos, range; 3825ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 38261fabe148SMark Fasheh handle_t *handle = NULL; 3827ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 3828dcd0538fSMark Fasheh struct ocfs2_path *path = NULL; 3829ccd979bdSMark Fasheh 3830ccd979bdSMark Fasheh mlog_entry_void(); 3831ccd979bdSMark Fasheh 3832dcd0538fSMark Fasheh new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 3833ccd979bdSMark Fasheh i_size_read(inode)); 3834ccd979bdSMark Fasheh 3835dcd0538fSMark Fasheh path = ocfs2_new_inode_path(fe_bh); 3836dcd0538fSMark Fasheh if (!path) { 3837dcd0538fSMark Fasheh status = -ENOMEM; 3838ccd979bdSMark Fasheh mlog_errno(status); 3839ccd979bdSMark Fasheh goto bail; 3840ccd979bdSMark Fasheh } 384183418978SMark Fasheh 384283418978SMark Fasheh ocfs2_extent_map_trunc(inode, new_highest_cpos); 384383418978SMark Fasheh 3844dcd0538fSMark Fasheh start: 3845dcd0538fSMark Fasheh /* 38463a0782d0SMark Fasheh * Check that we still have allocation to delete. 38473a0782d0SMark Fasheh */ 38483a0782d0SMark Fasheh if (OCFS2_I(inode)->ip_clusters == 0) { 38493a0782d0SMark Fasheh status = 0; 38503a0782d0SMark Fasheh goto bail; 38513a0782d0SMark Fasheh } 38523a0782d0SMark Fasheh 38533a0782d0SMark Fasheh /* 3854dcd0538fSMark Fasheh * Truncate always works against the rightmost tree branch. 3855dcd0538fSMark Fasheh */ 3856dcd0538fSMark Fasheh status = ocfs2_find_path(inode, path, UINT_MAX); 3857dcd0538fSMark Fasheh if (status) { 3858dcd0538fSMark Fasheh mlog_errno(status); 3859ccd979bdSMark Fasheh goto bail; 3860ccd979bdSMark Fasheh } 3861ccd979bdSMark Fasheh 3862dcd0538fSMark Fasheh mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", 3863dcd0538fSMark Fasheh OCFS2_I(inode)->ip_clusters, path->p_tree_depth); 3864dcd0538fSMark Fasheh 3865dcd0538fSMark Fasheh /* 3866dcd0538fSMark Fasheh * By now, el will point to the extent list on the bottom most 3867dcd0538fSMark Fasheh * portion of this tree. Only the tail record is considered in 3868dcd0538fSMark Fasheh * each pass. 3869dcd0538fSMark Fasheh * 3870dcd0538fSMark Fasheh * We handle the following cases, in order: 3871dcd0538fSMark Fasheh * - empty extent: delete the remaining branch 3872dcd0538fSMark Fasheh * - remove the entire record 3873dcd0538fSMark Fasheh * - remove a partial record 3874dcd0538fSMark Fasheh * - no record needs to be removed (truncate has completed) 3875dcd0538fSMark Fasheh */ 3876dcd0538fSMark Fasheh el = path_leaf_el(path); 38773a0782d0SMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) { 38783a0782d0SMark Fasheh ocfs2_error(inode->i_sb, 38793a0782d0SMark Fasheh "Inode %llu has empty extent block at %llu\n", 38803a0782d0SMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, 38813a0782d0SMark Fasheh (unsigned long long)path_leaf_bh(path)->b_blocknr); 38823a0782d0SMark Fasheh status = -EROFS; 38833a0782d0SMark Fasheh goto bail; 38843a0782d0SMark Fasheh } 38853a0782d0SMark Fasheh 3886ccd979bdSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 3887dcd0538fSMark Fasheh range = le32_to_cpu(el->l_recs[i].e_cpos) + 3888e48edee2SMark Fasheh ocfs2_rec_clusters(el, &el->l_recs[i]); 3889dcd0538fSMark Fasheh if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 3890dcd0538fSMark Fasheh clusters_to_del = 0; 3891dcd0538fSMark Fasheh } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 3892e48edee2SMark Fasheh clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 3893dcd0538fSMark Fasheh } else if (range > new_highest_cpos) { 3894e48edee2SMark Fasheh clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 3895ccd979bdSMark Fasheh le32_to_cpu(el->l_recs[i].e_cpos)) - 3896dcd0538fSMark Fasheh new_highest_cpos; 3897dcd0538fSMark Fasheh } else { 3898dcd0538fSMark Fasheh status = 0; 3899dcd0538fSMark Fasheh goto bail; 3900dcd0538fSMark Fasheh } 3901ccd979bdSMark Fasheh 3902dcd0538fSMark Fasheh mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 3903dcd0538fSMark Fasheh clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); 3904dcd0538fSMark Fasheh 3905dcd0538fSMark Fasheh BUG_ON(clusters_to_del == 0); 3906ccd979bdSMark Fasheh 39071b1dcc1bSJes Sorensen mutex_lock(&tl_inode->i_mutex); 3908ccd979bdSMark Fasheh tl_sem = 1; 3909ccd979bdSMark Fasheh /* ocfs2_truncate_log_needs_flush guarantees us at least one 3910ccd979bdSMark Fasheh * record is free for use. If there isn't any, we flush to get 3911ccd979bdSMark Fasheh * an empty truncate log. */ 3912ccd979bdSMark Fasheh if (ocfs2_truncate_log_needs_flush(osb)) { 3913ccd979bdSMark Fasheh status = __ocfs2_flush_truncate_log(osb); 3914ccd979bdSMark Fasheh if (status < 0) { 3915ccd979bdSMark Fasheh mlog_errno(status); 3916ccd979bdSMark Fasheh goto bail; 3917ccd979bdSMark Fasheh } 3918ccd979bdSMark Fasheh } 3919ccd979bdSMark Fasheh 3920ccd979bdSMark Fasheh credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 3921dcd0538fSMark Fasheh (struct ocfs2_dinode *)fe_bh->b_data, 3922dcd0538fSMark Fasheh el); 392365eff9ccSMark Fasheh handle = ocfs2_start_trans(osb, credits); 3924ccd979bdSMark Fasheh if (IS_ERR(handle)) { 3925ccd979bdSMark Fasheh status = PTR_ERR(handle); 3926ccd979bdSMark Fasheh handle = NULL; 3927ccd979bdSMark Fasheh mlog_errno(status); 3928ccd979bdSMark Fasheh goto bail; 3929ccd979bdSMark Fasheh } 3930ccd979bdSMark Fasheh 3931dcd0538fSMark Fasheh status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, 3932dcd0538fSMark Fasheh tc, path); 3933ccd979bdSMark Fasheh if (status < 0) { 3934ccd979bdSMark Fasheh mlog_errno(status); 3935ccd979bdSMark Fasheh goto bail; 3936ccd979bdSMark Fasheh } 3937ccd979bdSMark Fasheh 39381b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 3939ccd979bdSMark Fasheh tl_sem = 0; 3940ccd979bdSMark Fasheh 394102dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 3942ccd979bdSMark Fasheh handle = NULL; 3943ccd979bdSMark Fasheh 3944dcd0538fSMark Fasheh ocfs2_reinit_path(path, 1); 3945dcd0538fSMark Fasheh 3946dcd0538fSMark Fasheh /* 39473a0782d0SMark Fasheh * The check above will catch the case where we've truncated 39483a0782d0SMark Fasheh * away all allocation. 3949dcd0538fSMark Fasheh */ 3950ccd979bdSMark Fasheh goto start; 39513a0782d0SMark Fasheh 3952ccd979bdSMark Fasheh bail: 3953ccd979bdSMark Fasheh 3954ccd979bdSMark Fasheh ocfs2_schedule_truncate_log_flush(osb, 1); 3955ccd979bdSMark Fasheh 3956ccd979bdSMark Fasheh if (tl_sem) 39571b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 3958ccd979bdSMark Fasheh 3959ccd979bdSMark Fasheh if (handle) 396002dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 3961ccd979bdSMark Fasheh 3962*59a5e416SMark Fasheh ocfs2_run_deallocs(osb, &tc->tc_dealloc); 3963*59a5e416SMark Fasheh 3964dcd0538fSMark Fasheh ocfs2_free_path(path); 3965ccd979bdSMark Fasheh 3966ccd979bdSMark Fasheh /* This will drop the ext_alloc cluster lock for us */ 3967ccd979bdSMark Fasheh ocfs2_free_truncate_context(tc); 3968ccd979bdSMark Fasheh 3969ccd979bdSMark Fasheh mlog_exit(status); 3970ccd979bdSMark Fasheh return status; 3971ccd979bdSMark Fasheh } 3972ccd979bdSMark Fasheh 3973ccd979bdSMark Fasheh /* 3974*59a5e416SMark Fasheh * Expects the inode to already be locked. 3975ccd979bdSMark Fasheh */ 3976ccd979bdSMark Fasheh int ocfs2_prepare_truncate(struct ocfs2_super *osb, 3977ccd979bdSMark Fasheh struct inode *inode, 3978ccd979bdSMark Fasheh struct buffer_head *fe_bh, 3979ccd979bdSMark Fasheh struct ocfs2_truncate_context **tc) 3980ccd979bdSMark Fasheh { 3981*59a5e416SMark Fasheh int status; 3982ccd979bdSMark Fasheh unsigned int new_i_clusters; 3983ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 3984ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 3985ccd979bdSMark Fasheh struct buffer_head *last_eb_bh = NULL; 3986ccd979bdSMark Fasheh 3987ccd979bdSMark Fasheh mlog_entry_void(); 3988ccd979bdSMark Fasheh 3989ccd979bdSMark Fasheh *tc = NULL; 3990ccd979bdSMark Fasheh 3991ccd979bdSMark Fasheh new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, 3992ccd979bdSMark Fasheh i_size_read(inode)); 3993ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 3994ccd979bdSMark Fasheh 3995ccd979bdSMark Fasheh mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" 39961ca1a111SMark Fasheh "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters, 39971ca1a111SMark Fasheh (unsigned long long)le64_to_cpu(fe->i_size)); 3998ccd979bdSMark Fasheh 3999cd861280SRobert P. J. Day *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); 4000ccd979bdSMark Fasheh if (!(*tc)) { 4001ccd979bdSMark Fasheh status = -ENOMEM; 4002ccd979bdSMark Fasheh mlog_errno(status); 4003ccd979bdSMark Fasheh goto bail; 4004ccd979bdSMark Fasheh } 4005*59a5e416SMark Fasheh ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 4006ccd979bdSMark Fasheh 4007ccd979bdSMark Fasheh if (fe->id2.i_list.l_tree_depth) { 4008ccd979bdSMark Fasheh status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 4009ccd979bdSMark Fasheh &last_eb_bh, OCFS2_BH_CACHED, inode); 4010ccd979bdSMark Fasheh if (status < 0) { 4011ccd979bdSMark Fasheh mlog_errno(status); 4012ccd979bdSMark Fasheh goto bail; 4013ccd979bdSMark Fasheh } 4014ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4015ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 4016ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 4017ccd979bdSMark Fasheh 4018ccd979bdSMark Fasheh brelse(last_eb_bh); 4019ccd979bdSMark Fasheh status = -EIO; 4020ccd979bdSMark Fasheh goto bail; 4021ccd979bdSMark Fasheh } 4022ccd979bdSMark Fasheh } 4023ccd979bdSMark Fasheh 4024ccd979bdSMark Fasheh (*tc)->tc_last_eb_bh = last_eb_bh; 4025ccd979bdSMark Fasheh 4026ccd979bdSMark Fasheh status = 0; 4027ccd979bdSMark Fasheh bail: 4028ccd979bdSMark Fasheh if (status < 0) { 4029ccd979bdSMark Fasheh if (*tc) 4030ccd979bdSMark Fasheh ocfs2_free_truncate_context(*tc); 4031ccd979bdSMark Fasheh *tc = NULL; 4032ccd979bdSMark Fasheh } 4033ccd979bdSMark Fasheh mlog_exit_void(); 4034ccd979bdSMark Fasheh return status; 4035ccd979bdSMark Fasheh } 4036ccd979bdSMark Fasheh 4037ccd979bdSMark Fasheh static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 4038ccd979bdSMark Fasheh { 4039*59a5e416SMark Fasheh /* 4040*59a5e416SMark Fasheh * The caller is responsible for completing deallocation 4041*59a5e416SMark Fasheh * before freeing the context. 4042*59a5e416SMark Fasheh */ 4043*59a5e416SMark Fasheh if (tc->tc_dealloc.c_first_suballocator != NULL) 4044*59a5e416SMark Fasheh mlog(ML_NOTICE, 4045*59a5e416SMark Fasheh "Truncate completion has non-empty dealloc context\n"); 4046ccd979bdSMark Fasheh 4047ccd979bdSMark Fasheh if (tc->tc_last_eb_bh) 4048ccd979bdSMark Fasheh brelse(tc->tc_last_eb_bh); 4049ccd979bdSMark Fasheh 4050ccd979bdSMark Fasheh kfree(tc); 4051ccd979bdSMark Fasheh } 4052