1ccd979bdSMark Fasheh /* -*- mode: c; c-basic-offset: 8; -*- 2ccd979bdSMark Fasheh * vim: noexpandtab sw=8 ts=8 sts=0: 3ccd979bdSMark Fasheh * 4ccd979bdSMark Fasheh * alloc.c 5ccd979bdSMark Fasheh * 6ccd979bdSMark Fasheh * Extent allocs and frees 7ccd979bdSMark Fasheh * 8ccd979bdSMark Fasheh * Copyright (C) 2002, 2004 Oracle. All rights reserved. 9ccd979bdSMark Fasheh * 10ccd979bdSMark Fasheh * This program is free software; you can redistribute it and/or 11ccd979bdSMark Fasheh * modify it under the terms of the GNU General Public 12ccd979bdSMark Fasheh * License as published by the Free Software Foundation; either 13ccd979bdSMark Fasheh * version 2 of the License, or (at your option) any later version. 14ccd979bdSMark Fasheh * 15ccd979bdSMark Fasheh * This program is distributed in the hope that it will be useful, 16ccd979bdSMark Fasheh * but WITHOUT ANY WARRANTY; without even the implied warranty of 17ccd979bdSMark Fasheh * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU 18ccd979bdSMark Fasheh * General Public License for more details. 19ccd979bdSMark Fasheh * 20ccd979bdSMark Fasheh * You should have received a copy of the GNU General Public 21ccd979bdSMark Fasheh * License along with this program; if not, write to the 22ccd979bdSMark Fasheh * Free Software Foundation, Inc., 59 Temple Place - Suite 330, 23ccd979bdSMark Fasheh * Boston, MA 021110-1307, USA. 24ccd979bdSMark Fasheh */ 25ccd979bdSMark Fasheh 26ccd979bdSMark Fasheh #include <linux/fs.h> 27ccd979bdSMark Fasheh #include <linux/types.h> 28ccd979bdSMark Fasheh #include <linux/slab.h> 29ccd979bdSMark Fasheh #include <linux/highmem.h> 3060b11392SMark Fasheh #include <linux/swap.h> 31ccd979bdSMark Fasheh 32ccd979bdSMark Fasheh #define MLOG_MASK_PREFIX ML_DISK_ALLOC 33ccd979bdSMark Fasheh #include <cluster/masklog.h> 34ccd979bdSMark Fasheh 35ccd979bdSMark Fasheh #include "ocfs2.h" 36ccd979bdSMark Fasheh 37ccd979bdSMark Fasheh #include "alloc.h" 3860b11392SMark Fasheh #include "aops.h" 39ccd979bdSMark Fasheh #include "dlmglue.h" 40ccd979bdSMark Fasheh #include "extent_map.h" 41ccd979bdSMark Fasheh #include "inode.h" 42ccd979bdSMark Fasheh #include "journal.h" 43ccd979bdSMark Fasheh #include "localalloc.h" 44ccd979bdSMark Fasheh #include "suballoc.h" 45ccd979bdSMark Fasheh #include "sysfile.h" 46ccd979bdSMark Fasheh #include "file.h" 47ccd979bdSMark Fasheh #include "super.h" 48ccd979bdSMark Fasheh #include "uptodate.h" 49ccd979bdSMark Fasheh 50ccd979bdSMark Fasheh #include "buffer_head_io.h" 51ccd979bdSMark Fasheh 52ccd979bdSMark Fasheh static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc); 5359a5e416SMark Fasheh static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 5459a5e416SMark Fasheh struct ocfs2_extent_block *eb); 55ccd979bdSMark Fasheh 56dcd0538fSMark Fasheh /* 57dcd0538fSMark Fasheh * Structures which describe a path through a btree, and functions to 58dcd0538fSMark Fasheh * manipulate them. 59dcd0538fSMark Fasheh * 60dcd0538fSMark Fasheh * The idea here is to be as generic as possible with the tree 61dcd0538fSMark Fasheh * manipulation code. 62dcd0538fSMark Fasheh */ 63dcd0538fSMark Fasheh struct ocfs2_path_item { 64dcd0538fSMark Fasheh struct buffer_head *bh; 65dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 66dcd0538fSMark Fasheh }; 67dcd0538fSMark Fasheh 68dcd0538fSMark Fasheh #define OCFS2_MAX_PATH_DEPTH 5 69dcd0538fSMark Fasheh 70dcd0538fSMark Fasheh struct ocfs2_path { 71dcd0538fSMark Fasheh int p_tree_depth; 72dcd0538fSMark Fasheh struct ocfs2_path_item p_node[OCFS2_MAX_PATH_DEPTH]; 73dcd0538fSMark Fasheh }; 74dcd0538fSMark Fasheh 75dcd0538fSMark Fasheh #define path_root_bh(_path) ((_path)->p_node[0].bh) 76dcd0538fSMark Fasheh #define path_root_el(_path) ((_path)->p_node[0].el) 77dcd0538fSMark Fasheh #define path_leaf_bh(_path) ((_path)->p_node[(_path)->p_tree_depth].bh) 78dcd0538fSMark Fasheh #define path_leaf_el(_path) ((_path)->p_node[(_path)->p_tree_depth].el) 79dcd0538fSMark Fasheh #define path_num_items(_path) ((_path)->p_tree_depth + 1) 80dcd0538fSMark Fasheh 81dcd0538fSMark Fasheh /* 82dcd0538fSMark Fasheh * Reset the actual path elements so that we can re-use the structure 83dcd0538fSMark Fasheh * to build another path. Generally, this involves freeing the buffer 84dcd0538fSMark Fasheh * heads. 85dcd0538fSMark Fasheh */ 86dcd0538fSMark Fasheh static void ocfs2_reinit_path(struct ocfs2_path *path, int keep_root) 87dcd0538fSMark Fasheh { 88dcd0538fSMark Fasheh int i, start = 0, depth = 0; 89dcd0538fSMark Fasheh struct ocfs2_path_item *node; 90dcd0538fSMark Fasheh 91dcd0538fSMark Fasheh if (keep_root) 92dcd0538fSMark Fasheh start = 1; 93dcd0538fSMark Fasheh 94dcd0538fSMark Fasheh for(i = start; i < path_num_items(path); i++) { 95dcd0538fSMark Fasheh node = &path->p_node[i]; 96dcd0538fSMark Fasheh 97dcd0538fSMark Fasheh brelse(node->bh); 98dcd0538fSMark Fasheh node->bh = NULL; 99dcd0538fSMark Fasheh node->el = NULL; 100dcd0538fSMark Fasheh } 101dcd0538fSMark Fasheh 102dcd0538fSMark Fasheh /* 103dcd0538fSMark Fasheh * Tree depth may change during truncate, or insert. If we're 104dcd0538fSMark Fasheh * keeping the root extent list, then make sure that our path 105dcd0538fSMark Fasheh * structure reflects the proper depth. 106dcd0538fSMark Fasheh */ 107dcd0538fSMark Fasheh if (keep_root) 108dcd0538fSMark Fasheh depth = le16_to_cpu(path_root_el(path)->l_tree_depth); 109dcd0538fSMark Fasheh 110dcd0538fSMark Fasheh path->p_tree_depth = depth; 111dcd0538fSMark Fasheh } 112dcd0538fSMark Fasheh 113dcd0538fSMark Fasheh static void ocfs2_free_path(struct ocfs2_path *path) 114dcd0538fSMark Fasheh { 115dcd0538fSMark Fasheh if (path) { 116dcd0538fSMark Fasheh ocfs2_reinit_path(path, 0); 117dcd0538fSMark Fasheh kfree(path); 118dcd0538fSMark Fasheh } 119dcd0538fSMark Fasheh } 120dcd0538fSMark Fasheh 121dcd0538fSMark Fasheh /* 122dcd0538fSMark Fasheh * Make the *dest path the same as src and re-initialize src path to 123dcd0538fSMark Fasheh * have a root only. 124dcd0538fSMark Fasheh */ 125dcd0538fSMark Fasheh static void ocfs2_mv_path(struct ocfs2_path *dest, struct ocfs2_path *src) 126dcd0538fSMark Fasheh { 127dcd0538fSMark Fasheh int i; 128dcd0538fSMark Fasheh 129dcd0538fSMark Fasheh BUG_ON(path_root_bh(dest) != path_root_bh(src)); 130dcd0538fSMark Fasheh 131dcd0538fSMark Fasheh for(i = 1; i < OCFS2_MAX_PATH_DEPTH; i++) { 132dcd0538fSMark Fasheh brelse(dest->p_node[i].bh); 133dcd0538fSMark Fasheh 134dcd0538fSMark Fasheh dest->p_node[i].bh = src->p_node[i].bh; 135dcd0538fSMark Fasheh dest->p_node[i].el = src->p_node[i].el; 136dcd0538fSMark Fasheh 137dcd0538fSMark Fasheh src->p_node[i].bh = NULL; 138dcd0538fSMark Fasheh src->p_node[i].el = NULL; 139dcd0538fSMark Fasheh } 140dcd0538fSMark Fasheh } 141dcd0538fSMark Fasheh 142dcd0538fSMark Fasheh /* 143dcd0538fSMark Fasheh * Insert an extent block at given index. 144dcd0538fSMark Fasheh * 145dcd0538fSMark Fasheh * This will not take an additional reference on eb_bh. 146dcd0538fSMark Fasheh */ 147dcd0538fSMark Fasheh static inline void ocfs2_path_insert_eb(struct ocfs2_path *path, int index, 148dcd0538fSMark Fasheh struct buffer_head *eb_bh) 149dcd0538fSMark Fasheh { 150dcd0538fSMark Fasheh struct ocfs2_extent_block *eb = (struct ocfs2_extent_block *)eb_bh->b_data; 151dcd0538fSMark Fasheh 152dcd0538fSMark Fasheh /* 153dcd0538fSMark Fasheh * Right now, no root bh is an extent block, so this helps 154dcd0538fSMark Fasheh * catch code errors with dinode trees. The assertion can be 155dcd0538fSMark Fasheh * safely removed if we ever need to insert extent block 156dcd0538fSMark Fasheh * structures at the root. 157dcd0538fSMark Fasheh */ 158dcd0538fSMark Fasheh BUG_ON(index == 0); 159dcd0538fSMark Fasheh 160dcd0538fSMark Fasheh path->p_node[index].bh = eb_bh; 161dcd0538fSMark Fasheh path->p_node[index].el = &eb->h_list; 162dcd0538fSMark Fasheh } 163dcd0538fSMark Fasheh 164dcd0538fSMark Fasheh static struct ocfs2_path *ocfs2_new_path(struct buffer_head *root_bh, 165dcd0538fSMark Fasheh struct ocfs2_extent_list *root_el) 166dcd0538fSMark Fasheh { 167dcd0538fSMark Fasheh struct ocfs2_path *path; 168dcd0538fSMark Fasheh 169dcd0538fSMark Fasheh BUG_ON(le16_to_cpu(root_el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH); 170dcd0538fSMark Fasheh 171dcd0538fSMark Fasheh path = kzalloc(sizeof(*path), GFP_NOFS); 172dcd0538fSMark Fasheh if (path) { 173dcd0538fSMark Fasheh path->p_tree_depth = le16_to_cpu(root_el->l_tree_depth); 174dcd0538fSMark Fasheh get_bh(root_bh); 175dcd0538fSMark Fasheh path_root_bh(path) = root_bh; 176dcd0538fSMark Fasheh path_root_el(path) = root_el; 177dcd0538fSMark Fasheh } 178dcd0538fSMark Fasheh 179dcd0538fSMark Fasheh return path; 180dcd0538fSMark Fasheh } 181dcd0538fSMark Fasheh 182dcd0538fSMark Fasheh /* 183dcd0538fSMark Fasheh * Allocate and initialize a new path based on a disk inode tree. 184dcd0538fSMark Fasheh */ 185dcd0538fSMark Fasheh static struct ocfs2_path *ocfs2_new_inode_path(struct buffer_head *di_bh) 186dcd0538fSMark Fasheh { 187dcd0538fSMark Fasheh struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 188dcd0538fSMark Fasheh struct ocfs2_extent_list *el = &di->id2.i_list; 189dcd0538fSMark Fasheh 190dcd0538fSMark Fasheh return ocfs2_new_path(di_bh, el); 191dcd0538fSMark Fasheh } 192dcd0538fSMark Fasheh 193dcd0538fSMark Fasheh /* 194dcd0538fSMark Fasheh * Convenience function to journal all components in a path. 195dcd0538fSMark Fasheh */ 196dcd0538fSMark Fasheh static int ocfs2_journal_access_path(struct inode *inode, handle_t *handle, 197dcd0538fSMark Fasheh struct ocfs2_path *path) 198dcd0538fSMark Fasheh { 199dcd0538fSMark Fasheh int i, ret = 0; 200dcd0538fSMark Fasheh 201dcd0538fSMark Fasheh if (!path) 202dcd0538fSMark Fasheh goto out; 203dcd0538fSMark Fasheh 204dcd0538fSMark Fasheh for(i = 0; i < path_num_items(path); i++) { 205dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, path->p_node[i].bh, 206dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 207dcd0538fSMark Fasheh if (ret < 0) { 208dcd0538fSMark Fasheh mlog_errno(ret); 209dcd0538fSMark Fasheh goto out; 210dcd0538fSMark Fasheh } 211dcd0538fSMark Fasheh } 212dcd0538fSMark Fasheh 213dcd0538fSMark Fasheh out: 214dcd0538fSMark Fasheh return ret; 215dcd0538fSMark Fasheh } 216dcd0538fSMark Fasheh 217dcd0538fSMark Fasheh enum ocfs2_contig_type { 218dcd0538fSMark Fasheh CONTIG_NONE = 0, 219dcd0538fSMark Fasheh CONTIG_LEFT, 220dcd0538fSMark Fasheh CONTIG_RIGHT 221dcd0538fSMark Fasheh }; 222dcd0538fSMark Fasheh 223e48edee2SMark Fasheh 224e48edee2SMark Fasheh /* 225e48edee2SMark Fasheh * NOTE: ocfs2_block_extent_contig(), ocfs2_extents_adjacent() and 226e48edee2SMark Fasheh * ocfs2_extent_contig only work properly against leaf nodes! 227e48edee2SMark Fasheh */ 228dcd0538fSMark Fasheh static int ocfs2_block_extent_contig(struct super_block *sb, 229ccd979bdSMark Fasheh struct ocfs2_extent_rec *ext, 230ccd979bdSMark Fasheh u64 blkno) 231ccd979bdSMark Fasheh { 232e48edee2SMark Fasheh u64 blk_end = le64_to_cpu(ext->e_blkno); 233e48edee2SMark Fasheh 234e48edee2SMark Fasheh blk_end += ocfs2_clusters_to_blocks(sb, 235e48edee2SMark Fasheh le16_to_cpu(ext->e_leaf_clusters)); 236e48edee2SMark Fasheh 237e48edee2SMark Fasheh return blkno == blk_end; 238ccd979bdSMark Fasheh } 239ccd979bdSMark Fasheh 240dcd0538fSMark Fasheh static int ocfs2_extents_adjacent(struct ocfs2_extent_rec *left, 241dcd0538fSMark Fasheh struct ocfs2_extent_rec *right) 242dcd0538fSMark Fasheh { 243e48edee2SMark Fasheh u32 left_range; 244e48edee2SMark Fasheh 245e48edee2SMark Fasheh left_range = le32_to_cpu(left->e_cpos) + 246e48edee2SMark Fasheh le16_to_cpu(left->e_leaf_clusters); 247e48edee2SMark Fasheh 248e48edee2SMark Fasheh return (left_range == le32_to_cpu(right->e_cpos)); 249dcd0538fSMark Fasheh } 250dcd0538fSMark Fasheh 251dcd0538fSMark Fasheh static enum ocfs2_contig_type 252dcd0538fSMark Fasheh ocfs2_extent_contig(struct inode *inode, 253dcd0538fSMark Fasheh struct ocfs2_extent_rec *ext, 254dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 255dcd0538fSMark Fasheh { 256dcd0538fSMark Fasheh u64 blkno = le64_to_cpu(insert_rec->e_blkno); 257dcd0538fSMark Fasheh 258dcd0538fSMark Fasheh if (ocfs2_extents_adjacent(ext, insert_rec) && 259dcd0538fSMark Fasheh ocfs2_block_extent_contig(inode->i_sb, ext, blkno)) 260dcd0538fSMark Fasheh return CONTIG_RIGHT; 261dcd0538fSMark Fasheh 262dcd0538fSMark Fasheh blkno = le64_to_cpu(ext->e_blkno); 263dcd0538fSMark Fasheh if (ocfs2_extents_adjacent(insert_rec, ext) && 264dcd0538fSMark Fasheh ocfs2_block_extent_contig(inode->i_sb, insert_rec, blkno)) 265dcd0538fSMark Fasheh return CONTIG_LEFT; 266dcd0538fSMark Fasheh 267dcd0538fSMark Fasheh return CONTIG_NONE; 268dcd0538fSMark Fasheh } 269dcd0538fSMark Fasheh 270dcd0538fSMark Fasheh /* 271dcd0538fSMark Fasheh * NOTE: We can have pretty much any combination of contiguousness and 272dcd0538fSMark Fasheh * appending. 273dcd0538fSMark Fasheh * 274dcd0538fSMark Fasheh * The usefulness of APPEND_TAIL is more in that it lets us know that 275dcd0538fSMark Fasheh * we'll have to update the path to that leaf. 276dcd0538fSMark Fasheh */ 277dcd0538fSMark Fasheh enum ocfs2_append_type { 278dcd0538fSMark Fasheh APPEND_NONE = 0, 279dcd0538fSMark Fasheh APPEND_TAIL, 280dcd0538fSMark Fasheh }; 281dcd0538fSMark Fasheh 282dcd0538fSMark Fasheh struct ocfs2_insert_type { 283dcd0538fSMark Fasheh enum ocfs2_append_type ins_appending; 284dcd0538fSMark Fasheh enum ocfs2_contig_type ins_contig; 285dcd0538fSMark Fasheh int ins_contig_index; 286dcd0538fSMark Fasheh int ins_free_records; 287dcd0538fSMark Fasheh int ins_tree_depth; 288dcd0538fSMark Fasheh }; 289dcd0538fSMark Fasheh 290ccd979bdSMark Fasheh /* 291ccd979bdSMark Fasheh * How many free extents have we got before we need more meta data? 292ccd979bdSMark Fasheh */ 293ccd979bdSMark Fasheh int ocfs2_num_free_extents(struct ocfs2_super *osb, 294ccd979bdSMark Fasheh struct inode *inode, 295ccd979bdSMark Fasheh struct ocfs2_dinode *fe) 296ccd979bdSMark Fasheh { 297ccd979bdSMark Fasheh int retval; 298ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 299ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 300ccd979bdSMark Fasheh struct buffer_head *eb_bh = NULL; 301ccd979bdSMark Fasheh 302ccd979bdSMark Fasheh mlog_entry_void(); 303ccd979bdSMark Fasheh 304ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(fe)) { 305ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(inode->i_sb, fe); 306ccd979bdSMark Fasheh retval = -EIO; 307ccd979bdSMark Fasheh goto bail; 308ccd979bdSMark Fasheh } 309ccd979bdSMark Fasheh 310ccd979bdSMark Fasheh if (fe->i_last_eb_blk) { 311ccd979bdSMark Fasheh retval = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 312ccd979bdSMark Fasheh &eb_bh, OCFS2_BH_CACHED, inode); 313ccd979bdSMark Fasheh if (retval < 0) { 314ccd979bdSMark Fasheh mlog_errno(retval); 315ccd979bdSMark Fasheh goto bail; 316ccd979bdSMark Fasheh } 317ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) eb_bh->b_data; 318ccd979bdSMark Fasheh el = &eb->h_list; 319ccd979bdSMark Fasheh } else 320ccd979bdSMark Fasheh el = &fe->id2.i_list; 321ccd979bdSMark Fasheh 322ccd979bdSMark Fasheh BUG_ON(el->l_tree_depth != 0); 323ccd979bdSMark Fasheh 324ccd979bdSMark Fasheh retval = le16_to_cpu(el->l_count) - le16_to_cpu(el->l_next_free_rec); 325ccd979bdSMark Fasheh bail: 326ccd979bdSMark Fasheh if (eb_bh) 327ccd979bdSMark Fasheh brelse(eb_bh); 328ccd979bdSMark Fasheh 329ccd979bdSMark Fasheh mlog_exit(retval); 330ccd979bdSMark Fasheh return retval; 331ccd979bdSMark Fasheh } 332ccd979bdSMark Fasheh 333ccd979bdSMark Fasheh /* expects array to already be allocated 334ccd979bdSMark Fasheh * 335ccd979bdSMark Fasheh * sets h_signature, h_blkno, h_suballoc_bit, h_suballoc_slot, and 336ccd979bdSMark Fasheh * l_count for you 337ccd979bdSMark Fasheh */ 338ccd979bdSMark Fasheh static int ocfs2_create_new_meta_bhs(struct ocfs2_super *osb, 3391fabe148SMark Fasheh handle_t *handle, 340ccd979bdSMark Fasheh struct inode *inode, 341ccd979bdSMark Fasheh int wanted, 342ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac, 343ccd979bdSMark Fasheh struct buffer_head *bhs[]) 344ccd979bdSMark Fasheh { 345ccd979bdSMark Fasheh int count, status, i; 346ccd979bdSMark Fasheh u16 suballoc_bit_start; 347ccd979bdSMark Fasheh u32 num_got; 348ccd979bdSMark Fasheh u64 first_blkno; 349ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 350ccd979bdSMark Fasheh 351ccd979bdSMark Fasheh mlog_entry_void(); 352ccd979bdSMark Fasheh 353ccd979bdSMark Fasheh count = 0; 354ccd979bdSMark Fasheh while (count < wanted) { 355ccd979bdSMark Fasheh status = ocfs2_claim_metadata(osb, 356ccd979bdSMark Fasheh handle, 357ccd979bdSMark Fasheh meta_ac, 358ccd979bdSMark Fasheh wanted - count, 359ccd979bdSMark Fasheh &suballoc_bit_start, 360ccd979bdSMark Fasheh &num_got, 361ccd979bdSMark Fasheh &first_blkno); 362ccd979bdSMark Fasheh if (status < 0) { 363ccd979bdSMark Fasheh mlog_errno(status); 364ccd979bdSMark Fasheh goto bail; 365ccd979bdSMark Fasheh } 366ccd979bdSMark Fasheh 367ccd979bdSMark Fasheh for(i = count; i < (num_got + count); i++) { 368ccd979bdSMark Fasheh bhs[i] = sb_getblk(osb->sb, first_blkno); 369ccd979bdSMark Fasheh if (bhs[i] == NULL) { 370ccd979bdSMark Fasheh status = -EIO; 371ccd979bdSMark Fasheh mlog_errno(status); 372ccd979bdSMark Fasheh goto bail; 373ccd979bdSMark Fasheh } 374ccd979bdSMark Fasheh ocfs2_set_new_buffer_uptodate(inode, bhs[i]); 375ccd979bdSMark Fasheh 376ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, bhs[i], 377ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_CREATE); 378ccd979bdSMark Fasheh if (status < 0) { 379ccd979bdSMark Fasheh mlog_errno(status); 380ccd979bdSMark Fasheh goto bail; 381ccd979bdSMark Fasheh } 382ccd979bdSMark Fasheh 383ccd979bdSMark Fasheh memset(bhs[i]->b_data, 0, osb->sb->s_blocksize); 384ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bhs[i]->b_data; 385ccd979bdSMark Fasheh /* Ok, setup the minimal stuff here. */ 386ccd979bdSMark Fasheh strcpy(eb->h_signature, OCFS2_EXTENT_BLOCK_SIGNATURE); 387ccd979bdSMark Fasheh eb->h_blkno = cpu_to_le64(first_blkno); 388ccd979bdSMark Fasheh eb->h_fs_generation = cpu_to_le32(osb->fs_generation); 389ccd979bdSMark Fasheh eb->h_suballoc_slot = cpu_to_le16(osb->slot_num); 390ccd979bdSMark Fasheh eb->h_suballoc_bit = cpu_to_le16(suballoc_bit_start); 391ccd979bdSMark Fasheh eb->h_list.l_count = 392ccd979bdSMark Fasheh cpu_to_le16(ocfs2_extent_recs_per_eb(osb->sb)); 393ccd979bdSMark Fasheh 394ccd979bdSMark Fasheh suballoc_bit_start++; 395ccd979bdSMark Fasheh first_blkno++; 396ccd979bdSMark Fasheh 397ccd979bdSMark Fasheh /* We'll also be dirtied by the caller, so 398ccd979bdSMark Fasheh * this isn't absolutely necessary. */ 399ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, bhs[i]); 400ccd979bdSMark Fasheh if (status < 0) { 401ccd979bdSMark Fasheh mlog_errno(status); 402ccd979bdSMark Fasheh goto bail; 403ccd979bdSMark Fasheh } 404ccd979bdSMark Fasheh } 405ccd979bdSMark Fasheh 406ccd979bdSMark Fasheh count += num_got; 407ccd979bdSMark Fasheh } 408ccd979bdSMark Fasheh 409ccd979bdSMark Fasheh status = 0; 410ccd979bdSMark Fasheh bail: 411ccd979bdSMark Fasheh if (status < 0) { 412ccd979bdSMark Fasheh for(i = 0; i < wanted; i++) { 413ccd979bdSMark Fasheh if (bhs[i]) 414ccd979bdSMark Fasheh brelse(bhs[i]); 415ccd979bdSMark Fasheh bhs[i] = NULL; 416ccd979bdSMark Fasheh } 417ccd979bdSMark Fasheh } 418ccd979bdSMark Fasheh mlog_exit(status); 419ccd979bdSMark Fasheh return status; 420ccd979bdSMark Fasheh } 421ccd979bdSMark Fasheh 422ccd979bdSMark Fasheh /* 423dcd0538fSMark Fasheh * Helper function for ocfs2_add_branch() and ocfs2_shift_tree_depth(). 424dcd0538fSMark Fasheh * 425dcd0538fSMark Fasheh * Returns the sum of the rightmost extent rec logical offset and 426dcd0538fSMark Fasheh * cluster count. 427dcd0538fSMark Fasheh * 428dcd0538fSMark Fasheh * ocfs2_add_branch() uses this to determine what logical cluster 429dcd0538fSMark Fasheh * value should be populated into the leftmost new branch records. 430dcd0538fSMark Fasheh * 431dcd0538fSMark Fasheh * ocfs2_shift_tree_depth() uses this to determine the # clusters 432dcd0538fSMark Fasheh * value for the new topmost tree record. 433dcd0538fSMark Fasheh */ 434dcd0538fSMark Fasheh static inline u32 ocfs2_sum_rightmost_rec(struct ocfs2_extent_list *el) 435dcd0538fSMark Fasheh { 436dcd0538fSMark Fasheh int i; 437dcd0538fSMark Fasheh 438dcd0538fSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 439dcd0538fSMark Fasheh 440dcd0538fSMark Fasheh return le32_to_cpu(el->l_recs[i].e_cpos) + 441e48edee2SMark Fasheh ocfs2_rec_clusters(el, &el->l_recs[i]); 442dcd0538fSMark Fasheh } 443dcd0538fSMark Fasheh 444dcd0538fSMark Fasheh /* 445ccd979bdSMark Fasheh * Add an entire tree branch to our inode. eb_bh is the extent block 446ccd979bdSMark Fasheh * to start at, if we don't want to start the branch at the dinode 447ccd979bdSMark Fasheh * structure. 448ccd979bdSMark Fasheh * 449ccd979bdSMark Fasheh * last_eb_bh is required as we have to update it's next_leaf pointer 450ccd979bdSMark Fasheh * for the new last extent block. 451ccd979bdSMark Fasheh * 452ccd979bdSMark Fasheh * the new branch will be 'empty' in the sense that every block will 453e48edee2SMark Fasheh * contain a single record with cluster count == 0. 454ccd979bdSMark Fasheh */ 455ccd979bdSMark Fasheh static int ocfs2_add_branch(struct ocfs2_super *osb, 4561fabe148SMark Fasheh handle_t *handle, 457ccd979bdSMark Fasheh struct inode *inode, 458ccd979bdSMark Fasheh struct buffer_head *fe_bh, 459ccd979bdSMark Fasheh struct buffer_head *eb_bh, 460ccd979bdSMark Fasheh struct buffer_head *last_eb_bh, 461ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac) 462ccd979bdSMark Fasheh { 463ccd979bdSMark Fasheh int status, new_blocks, i; 464ccd979bdSMark Fasheh u64 next_blkno, new_last_eb_blk; 465ccd979bdSMark Fasheh struct buffer_head *bh; 466ccd979bdSMark Fasheh struct buffer_head **new_eb_bhs = NULL; 467ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 468ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 469ccd979bdSMark Fasheh struct ocfs2_extent_list *eb_el; 470ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 471dcd0538fSMark Fasheh u32 new_cpos; 472ccd979bdSMark Fasheh 473ccd979bdSMark Fasheh mlog_entry_void(); 474ccd979bdSMark Fasheh 475ccd979bdSMark Fasheh BUG_ON(!last_eb_bh); 476ccd979bdSMark Fasheh 477ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 478ccd979bdSMark Fasheh 479ccd979bdSMark Fasheh if (eb_bh) { 480ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) eb_bh->b_data; 481ccd979bdSMark Fasheh el = &eb->h_list; 482ccd979bdSMark Fasheh } else 483ccd979bdSMark Fasheh el = &fe->id2.i_list; 484ccd979bdSMark Fasheh 485ccd979bdSMark Fasheh /* we never add a branch to a leaf. */ 486ccd979bdSMark Fasheh BUG_ON(!el->l_tree_depth); 487ccd979bdSMark Fasheh 488ccd979bdSMark Fasheh new_blocks = le16_to_cpu(el->l_tree_depth); 489ccd979bdSMark Fasheh 490ccd979bdSMark Fasheh /* allocate the number of new eb blocks we need */ 491ccd979bdSMark Fasheh new_eb_bhs = kcalloc(new_blocks, sizeof(struct buffer_head *), 492ccd979bdSMark Fasheh GFP_KERNEL); 493ccd979bdSMark Fasheh if (!new_eb_bhs) { 494ccd979bdSMark Fasheh status = -ENOMEM; 495ccd979bdSMark Fasheh mlog_errno(status); 496ccd979bdSMark Fasheh goto bail; 497ccd979bdSMark Fasheh } 498ccd979bdSMark Fasheh 499ccd979bdSMark Fasheh status = ocfs2_create_new_meta_bhs(osb, handle, inode, new_blocks, 500ccd979bdSMark Fasheh meta_ac, new_eb_bhs); 501ccd979bdSMark Fasheh if (status < 0) { 502ccd979bdSMark Fasheh mlog_errno(status); 503ccd979bdSMark Fasheh goto bail; 504ccd979bdSMark Fasheh } 505ccd979bdSMark Fasheh 506dcd0538fSMark Fasheh eb = (struct ocfs2_extent_block *)last_eb_bh->b_data; 507dcd0538fSMark Fasheh new_cpos = ocfs2_sum_rightmost_rec(&eb->h_list); 508dcd0538fSMark Fasheh 509ccd979bdSMark Fasheh /* Note: new_eb_bhs[new_blocks - 1] is the guy which will be 510ccd979bdSMark Fasheh * linked with the rest of the tree. 511ccd979bdSMark Fasheh * conversly, new_eb_bhs[0] is the new bottommost leaf. 512ccd979bdSMark Fasheh * 513ccd979bdSMark Fasheh * when we leave the loop, new_last_eb_blk will point to the 514ccd979bdSMark Fasheh * newest leaf, and next_blkno will point to the topmost extent 515ccd979bdSMark Fasheh * block. */ 516ccd979bdSMark Fasheh next_blkno = new_last_eb_blk = 0; 517ccd979bdSMark Fasheh for(i = 0; i < new_blocks; i++) { 518ccd979bdSMark Fasheh bh = new_eb_bhs[i]; 519ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 520ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 521ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 522ccd979bdSMark Fasheh status = -EIO; 523ccd979bdSMark Fasheh goto bail; 524ccd979bdSMark Fasheh } 525ccd979bdSMark Fasheh eb_el = &eb->h_list; 526ccd979bdSMark Fasheh 527ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, bh, 528ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_CREATE); 529ccd979bdSMark Fasheh if (status < 0) { 530ccd979bdSMark Fasheh mlog_errno(status); 531ccd979bdSMark Fasheh goto bail; 532ccd979bdSMark Fasheh } 533ccd979bdSMark Fasheh 534ccd979bdSMark Fasheh eb->h_next_leaf_blk = 0; 535ccd979bdSMark Fasheh eb_el->l_tree_depth = cpu_to_le16(i); 536ccd979bdSMark Fasheh eb_el->l_next_free_rec = cpu_to_le16(1); 537dcd0538fSMark Fasheh /* 538dcd0538fSMark Fasheh * This actually counts as an empty extent as 539dcd0538fSMark Fasheh * c_clusters == 0 540dcd0538fSMark Fasheh */ 541dcd0538fSMark Fasheh eb_el->l_recs[0].e_cpos = cpu_to_le32(new_cpos); 542ccd979bdSMark Fasheh eb_el->l_recs[0].e_blkno = cpu_to_le64(next_blkno); 543e48edee2SMark Fasheh /* 544e48edee2SMark Fasheh * eb_el isn't always an interior node, but even leaf 545e48edee2SMark Fasheh * nodes want a zero'd flags and reserved field so 546e48edee2SMark Fasheh * this gets the whole 32 bits regardless of use. 547e48edee2SMark Fasheh */ 548e48edee2SMark Fasheh eb_el->l_recs[0].e_int_clusters = cpu_to_le32(0); 549ccd979bdSMark Fasheh if (!eb_el->l_tree_depth) 550ccd979bdSMark Fasheh new_last_eb_blk = le64_to_cpu(eb->h_blkno); 551ccd979bdSMark Fasheh 552ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, bh); 553ccd979bdSMark Fasheh if (status < 0) { 554ccd979bdSMark Fasheh mlog_errno(status); 555ccd979bdSMark Fasheh goto bail; 556ccd979bdSMark Fasheh } 557ccd979bdSMark Fasheh 558ccd979bdSMark Fasheh next_blkno = le64_to_cpu(eb->h_blkno); 559ccd979bdSMark Fasheh } 560ccd979bdSMark Fasheh 561ccd979bdSMark Fasheh /* This is a bit hairy. We want to update up to three blocks 562ccd979bdSMark Fasheh * here without leaving any of them in an inconsistent state 563ccd979bdSMark Fasheh * in case of error. We don't have to worry about 564ccd979bdSMark Fasheh * journal_dirty erroring as it won't unless we've aborted the 565ccd979bdSMark Fasheh * handle (in which case we would never be here) so reserving 566ccd979bdSMark Fasheh * the write with journal_access is all we need to do. */ 567ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, last_eb_bh, 568ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 569ccd979bdSMark Fasheh if (status < 0) { 570ccd979bdSMark Fasheh mlog_errno(status); 571ccd979bdSMark Fasheh goto bail; 572ccd979bdSMark Fasheh } 573ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, fe_bh, 574ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 575ccd979bdSMark Fasheh if (status < 0) { 576ccd979bdSMark Fasheh mlog_errno(status); 577ccd979bdSMark Fasheh goto bail; 578ccd979bdSMark Fasheh } 579ccd979bdSMark Fasheh if (eb_bh) { 580ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, eb_bh, 581ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 582ccd979bdSMark Fasheh if (status < 0) { 583ccd979bdSMark Fasheh mlog_errno(status); 584ccd979bdSMark Fasheh goto bail; 585ccd979bdSMark Fasheh } 586ccd979bdSMark Fasheh } 587ccd979bdSMark Fasheh 588ccd979bdSMark Fasheh /* Link the new branch into the rest of the tree (el will 589ccd979bdSMark Fasheh * either be on the fe, or the extent block passed in. */ 590ccd979bdSMark Fasheh i = le16_to_cpu(el->l_next_free_rec); 591ccd979bdSMark Fasheh el->l_recs[i].e_blkno = cpu_to_le64(next_blkno); 592dcd0538fSMark Fasheh el->l_recs[i].e_cpos = cpu_to_le32(new_cpos); 593e48edee2SMark Fasheh el->l_recs[i].e_int_clusters = 0; 594ccd979bdSMark Fasheh le16_add_cpu(&el->l_next_free_rec, 1); 595ccd979bdSMark Fasheh 596ccd979bdSMark Fasheh /* fe needs a new last extent block pointer, as does the 597ccd979bdSMark Fasheh * next_leaf on the previously last-extent-block. */ 598ccd979bdSMark Fasheh fe->i_last_eb_blk = cpu_to_le64(new_last_eb_blk); 599ccd979bdSMark Fasheh 600ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 601ccd979bdSMark Fasheh eb->h_next_leaf_blk = cpu_to_le64(new_last_eb_blk); 602ccd979bdSMark Fasheh 603ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, last_eb_bh); 604ccd979bdSMark Fasheh if (status < 0) 605ccd979bdSMark Fasheh mlog_errno(status); 606ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, fe_bh); 607ccd979bdSMark Fasheh if (status < 0) 608ccd979bdSMark Fasheh mlog_errno(status); 609ccd979bdSMark Fasheh if (eb_bh) { 610ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, eb_bh); 611ccd979bdSMark Fasheh if (status < 0) 612ccd979bdSMark Fasheh mlog_errno(status); 613ccd979bdSMark Fasheh } 614ccd979bdSMark Fasheh 615ccd979bdSMark Fasheh status = 0; 616ccd979bdSMark Fasheh bail: 617ccd979bdSMark Fasheh if (new_eb_bhs) { 618ccd979bdSMark Fasheh for (i = 0; i < new_blocks; i++) 619ccd979bdSMark Fasheh if (new_eb_bhs[i]) 620ccd979bdSMark Fasheh brelse(new_eb_bhs[i]); 621ccd979bdSMark Fasheh kfree(new_eb_bhs); 622ccd979bdSMark Fasheh } 623ccd979bdSMark Fasheh 624ccd979bdSMark Fasheh mlog_exit(status); 625ccd979bdSMark Fasheh return status; 626ccd979bdSMark Fasheh } 627ccd979bdSMark Fasheh 628ccd979bdSMark Fasheh /* 629ccd979bdSMark Fasheh * adds another level to the allocation tree. 630ccd979bdSMark Fasheh * returns back the new extent block so you can add a branch to it 631ccd979bdSMark Fasheh * after this call. 632ccd979bdSMark Fasheh */ 633ccd979bdSMark Fasheh static int ocfs2_shift_tree_depth(struct ocfs2_super *osb, 6341fabe148SMark Fasheh handle_t *handle, 635ccd979bdSMark Fasheh struct inode *inode, 636ccd979bdSMark Fasheh struct buffer_head *fe_bh, 637ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac, 638ccd979bdSMark Fasheh struct buffer_head **ret_new_eb_bh) 639ccd979bdSMark Fasheh { 640ccd979bdSMark Fasheh int status, i; 641dcd0538fSMark Fasheh u32 new_clusters; 642ccd979bdSMark Fasheh struct buffer_head *new_eb_bh = NULL; 643ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 644ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 645ccd979bdSMark Fasheh struct ocfs2_extent_list *fe_el; 646ccd979bdSMark Fasheh struct ocfs2_extent_list *eb_el; 647ccd979bdSMark Fasheh 648ccd979bdSMark Fasheh mlog_entry_void(); 649ccd979bdSMark Fasheh 650ccd979bdSMark Fasheh status = ocfs2_create_new_meta_bhs(osb, handle, inode, 1, meta_ac, 651ccd979bdSMark Fasheh &new_eb_bh); 652ccd979bdSMark Fasheh if (status < 0) { 653ccd979bdSMark Fasheh mlog_errno(status); 654ccd979bdSMark Fasheh goto bail; 655ccd979bdSMark Fasheh } 656ccd979bdSMark Fasheh 657ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) new_eb_bh->b_data; 658ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 659ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 660ccd979bdSMark Fasheh status = -EIO; 661ccd979bdSMark Fasheh goto bail; 662ccd979bdSMark Fasheh } 663ccd979bdSMark Fasheh 664ccd979bdSMark Fasheh eb_el = &eb->h_list; 665ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 666ccd979bdSMark Fasheh fe_el = &fe->id2.i_list; 667ccd979bdSMark Fasheh 668ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, new_eb_bh, 669ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_CREATE); 670ccd979bdSMark Fasheh if (status < 0) { 671ccd979bdSMark Fasheh mlog_errno(status); 672ccd979bdSMark Fasheh goto bail; 673ccd979bdSMark Fasheh } 674ccd979bdSMark Fasheh 675ccd979bdSMark Fasheh /* copy the fe data into the new extent block */ 676ccd979bdSMark Fasheh eb_el->l_tree_depth = fe_el->l_tree_depth; 677ccd979bdSMark Fasheh eb_el->l_next_free_rec = fe_el->l_next_free_rec; 678e48edee2SMark Fasheh for(i = 0; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 679e48edee2SMark Fasheh eb_el->l_recs[i] = fe_el->l_recs[i]; 680ccd979bdSMark Fasheh 681ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, new_eb_bh); 682ccd979bdSMark Fasheh if (status < 0) { 683ccd979bdSMark Fasheh mlog_errno(status); 684ccd979bdSMark Fasheh goto bail; 685ccd979bdSMark Fasheh } 686ccd979bdSMark Fasheh 687ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, inode, fe_bh, 688ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 689ccd979bdSMark Fasheh if (status < 0) { 690ccd979bdSMark Fasheh mlog_errno(status); 691ccd979bdSMark Fasheh goto bail; 692ccd979bdSMark Fasheh } 693ccd979bdSMark Fasheh 694dcd0538fSMark Fasheh new_clusters = ocfs2_sum_rightmost_rec(eb_el); 695dcd0538fSMark Fasheh 696ccd979bdSMark Fasheh /* update fe now */ 697ccd979bdSMark Fasheh le16_add_cpu(&fe_el->l_tree_depth, 1); 698ccd979bdSMark Fasheh fe_el->l_recs[0].e_cpos = 0; 699ccd979bdSMark Fasheh fe_el->l_recs[0].e_blkno = eb->h_blkno; 700e48edee2SMark Fasheh fe_el->l_recs[0].e_int_clusters = cpu_to_le32(new_clusters); 701e48edee2SMark Fasheh for(i = 1; i < le16_to_cpu(fe_el->l_next_free_rec); i++) 702e48edee2SMark Fasheh memset(&fe_el->l_recs[i], 0, sizeof(struct ocfs2_extent_rec)); 703ccd979bdSMark Fasheh fe_el->l_next_free_rec = cpu_to_le16(1); 704ccd979bdSMark Fasheh 705ccd979bdSMark Fasheh /* If this is our 1st tree depth shift, then last_eb_blk 706ccd979bdSMark Fasheh * becomes the allocated extent block */ 707ccd979bdSMark Fasheh if (fe_el->l_tree_depth == cpu_to_le16(1)) 708ccd979bdSMark Fasheh fe->i_last_eb_blk = eb->h_blkno; 709ccd979bdSMark Fasheh 710ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, fe_bh); 711ccd979bdSMark Fasheh if (status < 0) { 712ccd979bdSMark Fasheh mlog_errno(status); 713ccd979bdSMark Fasheh goto bail; 714ccd979bdSMark Fasheh } 715ccd979bdSMark Fasheh 716ccd979bdSMark Fasheh *ret_new_eb_bh = new_eb_bh; 717ccd979bdSMark Fasheh new_eb_bh = NULL; 718ccd979bdSMark Fasheh status = 0; 719ccd979bdSMark Fasheh bail: 720ccd979bdSMark Fasheh if (new_eb_bh) 721ccd979bdSMark Fasheh brelse(new_eb_bh); 722ccd979bdSMark Fasheh 723ccd979bdSMark Fasheh mlog_exit(status); 724ccd979bdSMark Fasheh return status; 725ccd979bdSMark Fasheh } 726ccd979bdSMark Fasheh 727ccd979bdSMark Fasheh /* 728ccd979bdSMark Fasheh * Should only be called when there is no space left in any of the 729ccd979bdSMark Fasheh * leaf nodes. What we want to do is find the lowest tree depth 730ccd979bdSMark Fasheh * non-leaf extent block with room for new records. There are three 731ccd979bdSMark Fasheh * valid results of this search: 732ccd979bdSMark Fasheh * 733ccd979bdSMark Fasheh * 1) a lowest extent block is found, then we pass it back in 734ccd979bdSMark Fasheh * *lowest_eb_bh and return '0' 735ccd979bdSMark Fasheh * 736ccd979bdSMark Fasheh * 2) the search fails to find anything, but the dinode has room. We 737ccd979bdSMark Fasheh * pass NULL back in *lowest_eb_bh, but still return '0' 738ccd979bdSMark Fasheh * 739ccd979bdSMark Fasheh * 3) the search fails to find anything AND the dinode is full, in 740ccd979bdSMark Fasheh * which case we return > 0 741ccd979bdSMark Fasheh * 742ccd979bdSMark Fasheh * return status < 0 indicates an error. 743ccd979bdSMark Fasheh */ 744ccd979bdSMark Fasheh static int ocfs2_find_branch_target(struct ocfs2_super *osb, 745ccd979bdSMark Fasheh struct inode *inode, 746ccd979bdSMark Fasheh struct buffer_head *fe_bh, 747ccd979bdSMark Fasheh struct buffer_head **target_bh) 748ccd979bdSMark Fasheh { 749ccd979bdSMark Fasheh int status = 0, i; 750ccd979bdSMark Fasheh u64 blkno; 751ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 752ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 753ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 754ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 755ccd979bdSMark Fasheh struct buffer_head *lowest_bh = NULL; 756ccd979bdSMark Fasheh 757ccd979bdSMark Fasheh mlog_entry_void(); 758ccd979bdSMark Fasheh 759ccd979bdSMark Fasheh *target_bh = NULL; 760ccd979bdSMark Fasheh 761ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 762ccd979bdSMark Fasheh el = &fe->id2.i_list; 763ccd979bdSMark Fasheh 764ccd979bdSMark Fasheh while(le16_to_cpu(el->l_tree_depth) > 1) { 765ccd979bdSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) { 766b0697053SMark Fasheh ocfs2_error(inode->i_sb, "Dinode %llu has empty " 767ccd979bdSMark Fasheh "extent list (next_free_rec == 0)", 768b0697053SMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno); 769ccd979bdSMark Fasheh status = -EIO; 770ccd979bdSMark Fasheh goto bail; 771ccd979bdSMark Fasheh } 772ccd979bdSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 773ccd979bdSMark Fasheh blkno = le64_to_cpu(el->l_recs[i].e_blkno); 774ccd979bdSMark Fasheh if (!blkno) { 775b0697053SMark Fasheh ocfs2_error(inode->i_sb, "Dinode %llu has extent " 776ccd979bdSMark Fasheh "list where extent # %d has no physical " 777ccd979bdSMark Fasheh "block start", 778b0697053SMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, i); 779ccd979bdSMark Fasheh status = -EIO; 780ccd979bdSMark Fasheh goto bail; 781ccd979bdSMark Fasheh } 782ccd979bdSMark Fasheh 783ccd979bdSMark Fasheh if (bh) { 784ccd979bdSMark Fasheh brelse(bh); 785ccd979bdSMark Fasheh bh = NULL; 786ccd979bdSMark Fasheh } 787ccd979bdSMark Fasheh 788ccd979bdSMark Fasheh status = ocfs2_read_block(osb, blkno, &bh, OCFS2_BH_CACHED, 789ccd979bdSMark Fasheh inode); 790ccd979bdSMark Fasheh if (status < 0) { 791ccd979bdSMark Fasheh mlog_errno(status); 792ccd979bdSMark Fasheh goto bail; 793ccd979bdSMark Fasheh } 794ccd979bdSMark Fasheh 795ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 796ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 797ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 798ccd979bdSMark Fasheh status = -EIO; 799ccd979bdSMark Fasheh goto bail; 800ccd979bdSMark Fasheh } 801ccd979bdSMark Fasheh el = &eb->h_list; 802ccd979bdSMark Fasheh 803ccd979bdSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) < 804ccd979bdSMark Fasheh le16_to_cpu(el->l_count)) { 805ccd979bdSMark Fasheh if (lowest_bh) 806ccd979bdSMark Fasheh brelse(lowest_bh); 807ccd979bdSMark Fasheh lowest_bh = bh; 808ccd979bdSMark Fasheh get_bh(lowest_bh); 809ccd979bdSMark Fasheh } 810ccd979bdSMark Fasheh } 811ccd979bdSMark Fasheh 812ccd979bdSMark Fasheh /* If we didn't find one and the fe doesn't have any room, 813ccd979bdSMark Fasheh * then return '1' */ 814ccd979bdSMark Fasheh if (!lowest_bh 815ccd979bdSMark Fasheh && (fe->id2.i_list.l_next_free_rec == fe->id2.i_list.l_count)) 816ccd979bdSMark Fasheh status = 1; 817ccd979bdSMark Fasheh 818ccd979bdSMark Fasheh *target_bh = lowest_bh; 819ccd979bdSMark Fasheh bail: 820ccd979bdSMark Fasheh if (bh) 821ccd979bdSMark Fasheh brelse(bh); 822ccd979bdSMark Fasheh 823ccd979bdSMark Fasheh mlog_exit(status); 824ccd979bdSMark Fasheh return status; 825ccd979bdSMark Fasheh } 826ccd979bdSMark Fasheh 827e48edee2SMark Fasheh /* 828*c3afcbb3SMark Fasheh * Grow a b-tree so that it has more records. 829*c3afcbb3SMark Fasheh * 830*c3afcbb3SMark Fasheh * We might shift the tree depth in which case existing paths should 831*c3afcbb3SMark Fasheh * be considered invalid. 832*c3afcbb3SMark Fasheh * 833*c3afcbb3SMark Fasheh * Tree depth after the grow is returned via *final_depth. 834*c3afcbb3SMark Fasheh */ 835*c3afcbb3SMark Fasheh static int ocfs2_grow_tree(struct inode *inode, handle_t *handle, 836*c3afcbb3SMark Fasheh struct buffer_head *di_bh, int *final_depth, 837*c3afcbb3SMark Fasheh struct buffer_head *last_eb_bh, 838*c3afcbb3SMark Fasheh struct ocfs2_alloc_context *meta_ac) 839*c3afcbb3SMark Fasheh { 840*c3afcbb3SMark Fasheh int ret, shift; 841*c3afcbb3SMark Fasheh struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 842*c3afcbb3SMark Fasheh int depth = le16_to_cpu(di->id2.i_list.l_tree_depth); 843*c3afcbb3SMark Fasheh struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); 844*c3afcbb3SMark Fasheh struct buffer_head *bh = NULL; 845*c3afcbb3SMark Fasheh 846*c3afcbb3SMark Fasheh BUG_ON(meta_ac == NULL); 847*c3afcbb3SMark Fasheh 848*c3afcbb3SMark Fasheh shift = ocfs2_find_branch_target(osb, inode, di_bh, &bh); 849*c3afcbb3SMark Fasheh if (shift < 0) { 850*c3afcbb3SMark Fasheh ret = shift; 851*c3afcbb3SMark Fasheh mlog_errno(ret); 852*c3afcbb3SMark Fasheh goto out; 853*c3afcbb3SMark Fasheh } 854*c3afcbb3SMark Fasheh 855*c3afcbb3SMark Fasheh /* We traveled all the way to the bottom of the allocation tree 856*c3afcbb3SMark Fasheh * and didn't find room for any more extents - we need to add 857*c3afcbb3SMark Fasheh * another tree level */ 858*c3afcbb3SMark Fasheh if (shift) { 859*c3afcbb3SMark Fasheh BUG_ON(bh); 860*c3afcbb3SMark Fasheh mlog(0, "need to shift tree depth (current = %d)\n", depth); 861*c3afcbb3SMark Fasheh 862*c3afcbb3SMark Fasheh /* ocfs2_shift_tree_depth will return us a buffer with 863*c3afcbb3SMark Fasheh * the new extent block (so we can pass that to 864*c3afcbb3SMark Fasheh * ocfs2_add_branch). */ 865*c3afcbb3SMark Fasheh ret = ocfs2_shift_tree_depth(osb, handle, inode, di_bh, 866*c3afcbb3SMark Fasheh meta_ac, &bh); 867*c3afcbb3SMark Fasheh if (ret < 0) { 868*c3afcbb3SMark Fasheh mlog_errno(ret); 869*c3afcbb3SMark Fasheh goto out; 870*c3afcbb3SMark Fasheh } 871*c3afcbb3SMark Fasheh depth++; 872*c3afcbb3SMark Fasheh /* Special case: we have room now if we shifted from 873*c3afcbb3SMark Fasheh * tree_depth 0 */ 874*c3afcbb3SMark Fasheh if (depth == 1) 875*c3afcbb3SMark Fasheh goto out; 876*c3afcbb3SMark Fasheh } 877*c3afcbb3SMark Fasheh 878*c3afcbb3SMark Fasheh /* call ocfs2_add_branch to add the final part of the tree with 879*c3afcbb3SMark Fasheh * the new data. */ 880*c3afcbb3SMark Fasheh mlog(0, "add branch. bh = %p\n", bh); 881*c3afcbb3SMark Fasheh ret = ocfs2_add_branch(osb, handle, inode, di_bh, bh, last_eb_bh, 882*c3afcbb3SMark Fasheh meta_ac); 883*c3afcbb3SMark Fasheh if (ret < 0) { 884*c3afcbb3SMark Fasheh mlog_errno(ret); 885*c3afcbb3SMark Fasheh goto out; 886*c3afcbb3SMark Fasheh } 887*c3afcbb3SMark Fasheh 888*c3afcbb3SMark Fasheh out: 889*c3afcbb3SMark Fasheh if (final_depth) 890*c3afcbb3SMark Fasheh *final_depth = depth; 891*c3afcbb3SMark Fasheh brelse(bh); 892*c3afcbb3SMark Fasheh return ret; 893*c3afcbb3SMark Fasheh } 894*c3afcbb3SMark Fasheh 895*c3afcbb3SMark Fasheh /* 896e48edee2SMark Fasheh * This is only valid for leaf nodes, which are the only ones that can 897e48edee2SMark Fasheh * have empty extents anyway. 898e48edee2SMark Fasheh */ 899dcd0538fSMark Fasheh static inline int ocfs2_is_empty_extent(struct ocfs2_extent_rec *rec) 900dcd0538fSMark Fasheh { 901e48edee2SMark Fasheh return !rec->e_leaf_clusters; 902dcd0538fSMark Fasheh } 903dcd0538fSMark Fasheh 904dcd0538fSMark Fasheh /* 905dcd0538fSMark Fasheh * This function will discard the rightmost extent record. 906dcd0538fSMark Fasheh */ 907dcd0538fSMark Fasheh static void ocfs2_shift_records_right(struct ocfs2_extent_list *el) 908dcd0538fSMark Fasheh { 909dcd0538fSMark Fasheh int next_free = le16_to_cpu(el->l_next_free_rec); 910dcd0538fSMark Fasheh int count = le16_to_cpu(el->l_count); 911dcd0538fSMark Fasheh unsigned int num_bytes; 912dcd0538fSMark Fasheh 913dcd0538fSMark Fasheh BUG_ON(!next_free); 914dcd0538fSMark Fasheh /* This will cause us to go off the end of our extent list. */ 915dcd0538fSMark Fasheh BUG_ON(next_free >= count); 916dcd0538fSMark Fasheh 917dcd0538fSMark Fasheh num_bytes = sizeof(struct ocfs2_extent_rec) * next_free; 918dcd0538fSMark Fasheh 919dcd0538fSMark Fasheh memmove(&el->l_recs[1], &el->l_recs[0], num_bytes); 920dcd0538fSMark Fasheh } 921dcd0538fSMark Fasheh 922dcd0538fSMark Fasheh static void ocfs2_rotate_leaf(struct ocfs2_extent_list *el, 923dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 924dcd0538fSMark Fasheh { 925dcd0538fSMark Fasheh int i, insert_index, next_free, has_empty, num_bytes; 926dcd0538fSMark Fasheh u32 insert_cpos = le32_to_cpu(insert_rec->e_cpos); 927dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 928dcd0538fSMark Fasheh 929dcd0538fSMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 930dcd0538fSMark Fasheh has_empty = ocfs2_is_empty_extent(&el->l_recs[0]); 931dcd0538fSMark Fasheh 932dcd0538fSMark Fasheh BUG_ON(!next_free); 933dcd0538fSMark Fasheh 934dcd0538fSMark Fasheh /* The tree code before us didn't allow enough room in the leaf. */ 935dcd0538fSMark Fasheh if (el->l_next_free_rec == el->l_count && !has_empty) 936dcd0538fSMark Fasheh BUG(); 937dcd0538fSMark Fasheh 938dcd0538fSMark Fasheh /* 939dcd0538fSMark Fasheh * The easiest way to approach this is to just remove the 940dcd0538fSMark Fasheh * empty extent and temporarily decrement next_free. 941dcd0538fSMark Fasheh */ 942dcd0538fSMark Fasheh if (has_empty) { 943dcd0538fSMark Fasheh /* 944dcd0538fSMark Fasheh * If next_free was 1 (only an empty extent), this 945dcd0538fSMark Fasheh * loop won't execute, which is fine. We still want 946dcd0538fSMark Fasheh * the decrement above to happen. 947dcd0538fSMark Fasheh */ 948dcd0538fSMark Fasheh for(i = 0; i < (next_free - 1); i++) 949dcd0538fSMark Fasheh el->l_recs[i] = el->l_recs[i+1]; 950dcd0538fSMark Fasheh 951dcd0538fSMark Fasheh next_free--; 952dcd0538fSMark Fasheh } 953dcd0538fSMark Fasheh 954dcd0538fSMark Fasheh /* 955dcd0538fSMark Fasheh * Figure out what the new record index should be. 956dcd0538fSMark Fasheh */ 957dcd0538fSMark Fasheh for(i = 0; i < next_free; i++) { 958dcd0538fSMark Fasheh rec = &el->l_recs[i]; 959dcd0538fSMark Fasheh 960dcd0538fSMark Fasheh if (insert_cpos < le32_to_cpu(rec->e_cpos)) 961dcd0538fSMark Fasheh break; 962dcd0538fSMark Fasheh } 963dcd0538fSMark Fasheh insert_index = i; 964dcd0538fSMark Fasheh 965dcd0538fSMark Fasheh mlog(0, "ins %u: index %d, has_empty %d, next_free %d, count %d\n", 966dcd0538fSMark Fasheh insert_cpos, insert_index, has_empty, next_free, le16_to_cpu(el->l_count)); 967dcd0538fSMark Fasheh 968dcd0538fSMark Fasheh BUG_ON(insert_index < 0); 969dcd0538fSMark Fasheh BUG_ON(insert_index >= le16_to_cpu(el->l_count)); 970dcd0538fSMark Fasheh BUG_ON(insert_index > next_free); 971dcd0538fSMark Fasheh 972dcd0538fSMark Fasheh /* 973dcd0538fSMark Fasheh * No need to memmove if we're just adding to the tail. 974dcd0538fSMark Fasheh */ 975dcd0538fSMark Fasheh if (insert_index != next_free) { 976dcd0538fSMark Fasheh BUG_ON(next_free >= le16_to_cpu(el->l_count)); 977dcd0538fSMark Fasheh 978dcd0538fSMark Fasheh num_bytes = next_free - insert_index; 979dcd0538fSMark Fasheh num_bytes *= sizeof(struct ocfs2_extent_rec); 980dcd0538fSMark Fasheh memmove(&el->l_recs[insert_index + 1], 981dcd0538fSMark Fasheh &el->l_recs[insert_index], 982dcd0538fSMark Fasheh num_bytes); 983dcd0538fSMark Fasheh } 984dcd0538fSMark Fasheh 985dcd0538fSMark Fasheh /* 986dcd0538fSMark Fasheh * Either we had an empty extent, and need to re-increment or 987dcd0538fSMark Fasheh * there was no empty extent on a non full rightmost leaf node, 988dcd0538fSMark Fasheh * in which case we still need to increment. 989dcd0538fSMark Fasheh */ 990dcd0538fSMark Fasheh next_free++; 991dcd0538fSMark Fasheh el->l_next_free_rec = cpu_to_le16(next_free); 992dcd0538fSMark Fasheh /* 993dcd0538fSMark Fasheh * Make sure none of the math above just messed up our tree. 994dcd0538fSMark Fasheh */ 995dcd0538fSMark Fasheh BUG_ON(le16_to_cpu(el->l_next_free_rec) > le16_to_cpu(el->l_count)); 996dcd0538fSMark Fasheh 997dcd0538fSMark Fasheh el->l_recs[insert_index] = *insert_rec; 998dcd0538fSMark Fasheh 999dcd0538fSMark Fasheh } 1000dcd0538fSMark Fasheh 1001dcd0538fSMark Fasheh /* 1002dcd0538fSMark Fasheh * Create an empty extent record . 1003dcd0538fSMark Fasheh * 1004dcd0538fSMark Fasheh * l_next_free_rec may be updated. 1005dcd0538fSMark Fasheh * 1006dcd0538fSMark Fasheh * If an empty extent already exists do nothing. 1007dcd0538fSMark Fasheh */ 1008dcd0538fSMark Fasheh static void ocfs2_create_empty_extent(struct ocfs2_extent_list *el) 1009dcd0538fSMark Fasheh { 1010dcd0538fSMark Fasheh int next_free = le16_to_cpu(el->l_next_free_rec); 1011dcd0538fSMark Fasheh 1012e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 1013e48edee2SMark Fasheh 1014dcd0538fSMark Fasheh if (next_free == 0) 1015dcd0538fSMark Fasheh goto set_and_inc; 1016dcd0538fSMark Fasheh 1017dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) 1018dcd0538fSMark Fasheh return; 1019dcd0538fSMark Fasheh 1020dcd0538fSMark Fasheh mlog_bug_on_msg(el->l_count == el->l_next_free_rec, 1021dcd0538fSMark Fasheh "Asked to create an empty extent in a full list:\n" 1022dcd0538fSMark Fasheh "count = %u, tree depth = %u", 1023dcd0538fSMark Fasheh le16_to_cpu(el->l_count), 1024dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth)); 1025dcd0538fSMark Fasheh 1026dcd0538fSMark Fasheh ocfs2_shift_records_right(el); 1027dcd0538fSMark Fasheh 1028dcd0538fSMark Fasheh set_and_inc: 1029dcd0538fSMark Fasheh le16_add_cpu(&el->l_next_free_rec, 1); 1030dcd0538fSMark Fasheh memset(&el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 1031dcd0538fSMark Fasheh } 1032dcd0538fSMark Fasheh 1033dcd0538fSMark Fasheh /* 1034dcd0538fSMark Fasheh * For a rotation which involves two leaf nodes, the "root node" is 1035dcd0538fSMark Fasheh * the lowest level tree node which contains a path to both leafs. This 1036dcd0538fSMark Fasheh * resulting set of information can be used to form a complete "subtree" 1037dcd0538fSMark Fasheh * 1038dcd0538fSMark Fasheh * This function is passed two full paths from the dinode down to a 1039dcd0538fSMark Fasheh * pair of adjacent leaves. It's task is to figure out which path 1040dcd0538fSMark Fasheh * index contains the subtree root - this can be the root index itself 1041dcd0538fSMark Fasheh * in a worst-case rotation. 1042dcd0538fSMark Fasheh * 1043dcd0538fSMark Fasheh * The array index of the subtree root is passed back. 1044dcd0538fSMark Fasheh */ 1045dcd0538fSMark Fasheh static int ocfs2_find_subtree_root(struct inode *inode, 1046dcd0538fSMark Fasheh struct ocfs2_path *left, 1047dcd0538fSMark Fasheh struct ocfs2_path *right) 1048dcd0538fSMark Fasheh { 1049dcd0538fSMark Fasheh int i = 0; 1050dcd0538fSMark Fasheh 1051dcd0538fSMark Fasheh /* 1052dcd0538fSMark Fasheh * Check that the caller passed in two paths from the same tree. 1053dcd0538fSMark Fasheh */ 1054dcd0538fSMark Fasheh BUG_ON(path_root_bh(left) != path_root_bh(right)); 1055dcd0538fSMark Fasheh 1056dcd0538fSMark Fasheh do { 1057dcd0538fSMark Fasheh i++; 1058dcd0538fSMark Fasheh 1059dcd0538fSMark Fasheh /* 1060dcd0538fSMark Fasheh * The caller didn't pass two adjacent paths. 1061dcd0538fSMark Fasheh */ 1062dcd0538fSMark Fasheh mlog_bug_on_msg(i > left->p_tree_depth, 1063dcd0538fSMark Fasheh "Inode %lu, left depth %u, right depth %u\n" 1064dcd0538fSMark Fasheh "left leaf blk %llu, right leaf blk %llu\n", 1065dcd0538fSMark Fasheh inode->i_ino, left->p_tree_depth, 1066dcd0538fSMark Fasheh right->p_tree_depth, 1067dcd0538fSMark Fasheh (unsigned long long)path_leaf_bh(left)->b_blocknr, 1068dcd0538fSMark Fasheh (unsigned long long)path_leaf_bh(right)->b_blocknr); 1069dcd0538fSMark Fasheh } while (left->p_node[i].bh->b_blocknr == 1070dcd0538fSMark Fasheh right->p_node[i].bh->b_blocknr); 1071dcd0538fSMark Fasheh 1072dcd0538fSMark Fasheh return i - 1; 1073dcd0538fSMark Fasheh } 1074dcd0538fSMark Fasheh 1075dcd0538fSMark Fasheh typedef void (path_insert_t)(void *, struct buffer_head *); 1076dcd0538fSMark Fasheh 1077dcd0538fSMark Fasheh /* 1078dcd0538fSMark Fasheh * Traverse a btree path in search of cpos, starting at root_el. 1079dcd0538fSMark Fasheh * 1080dcd0538fSMark Fasheh * This code can be called with a cpos larger than the tree, in which 1081dcd0538fSMark Fasheh * case it will return the rightmost path. 1082dcd0538fSMark Fasheh */ 1083dcd0538fSMark Fasheh static int __ocfs2_find_path(struct inode *inode, 1084dcd0538fSMark Fasheh struct ocfs2_extent_list *root_el, u32 cpos, 1085dcd0538fSMark Fasheh path_insert_t *func, void *data) 1086dcd0538fSMark Fasheh { 1087dcd0538fSMark Fasheh int i, ret = 0; 1088dcd0538fSMark Fasheh u32 range; 1089dcd0538fSMark Fasheh u64 blkno; 1090dcd0538fSMark Fasheh struct buffer_head *bh = NULL; 1091dcd0538fSMark Fasheh struct ocfs2_extent_block *eb; 1092dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 1093dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 1094dcd0538fSMark Fasheh struct ocfs2_inode_info *oi = OCFS2_I(inode); 1095dcd0538fSMark Fasheh 1096dcd0538fSMark Fasheh el = root_el; 1097dcd0538fSMark Fasheh while (el->l_tree_depth) { 1098dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) { 1099dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1100dcd0538fSMark Fasheh "Inode %llu has empty extent list at " 1101dcd0538fSMark Fasheh "depth %u\n", 1102dcd0538fSMark Fasheh (unsigned long long)oi->ip_blkno, 1103dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth)); 1104dcd0538fSMark Fasheh ret = -EROFS; 1105dcd0538fSMark Fasheh goto out; 1106dcd0538fSMark Fasheh 1107dcd0538fSMark Fasheh } 1108dcd0538fSMark Fasheh 1109dcd0538fSMark Fasheh for(i = 0; i < le16_to_cpu(el->l_next_free_rec) - 1; i++) { 1110dcd0538fSMark Fasheh rec = &el->l_recs[i]; 1111dcd0538fSMark Fasheh 1112dcd0538fSMark Fasheh /* 1113dcd0538fSMark Fasheh * In the case that cpos is off the allocation 1114dcd0538fSMark Fasheh * tree, this should just wind up returning the 1115dcd0538fSMark Fasheh * rightmost record. 1116dcd0538fSMark Fasheh */ 1117dcd0538fSMark Fasheh range = le32_to_cpu(rec->e_cpos) + 1118e48edee2SMark Fasheh ocfs2_rec_clusters(el, rec); 1119dcd0538fSMark Fasheh if (cpos >= le32_to_cpu(rec->e_cpos) && cpos < range) 1120dcd0538fSMark Fasheh break; 1121dcd0538fSMark Fasheh } 1122dcd0538fSMark Fasheh 1123dcd0538fSMark Fasheh blkno = le64_to_cpu(el->l_recs[i].e_blkno); 1124dcd0538fSMark Fasheh if (blkno == 0) { 1125dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1126dcd0538fSMark Fasheh "Inode %llu has bad blkno in extent list " 1127dcd0538fSMark Fasheh "at depth %u (index %d)\n", 1128dcd0538fSMark Fasheh (unsigned long long)oi->ip_blkno, 1129dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth), i); 1130dcd0538fSMark Fasheh ret = -EROFS; 1131dcd0538fSMark Fasheh goto out; 1132dcd0538fSMark Fasheh } 1133dcd0538fSMark Fasheh 1134dcd0538fSMark Fasheh brelse(bh); 1135dcd0538fSMark Fasheh bh = NULL; 1136dcd0538fSMark Fasheh ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), blkno, 1137dcd0538fSMark Fasheh &bh, OCFS2_BH_CACHED, inode); 1138dcd0538fSMark Fasheh if (ret) { 1139dcd0538fSMark Fasheh mlog_errno(ret); 1140dcd0538fSMark Fasheh goto out; 1141dcd0538fSMark Fasheh } 1142dcd0538fSMark Fasheh 1143dcd0538fSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 1144dcd0538fSMark Fasheh el = &eb->h_list; 1145dcd0538fSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 1146dcd0538fSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 1147dcd0538fSMark Fasheh ret = -EIO; 1148dcd0538fSMark Fasheh goto out; 1149dcd0538fSMark Fasheh } 1150dcd0538fSMark Fasheh 1151dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) > 1152dcd0538fSMark Fasheh le16_to_cpu(el->l_count)) { 1153dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1154dcd0538fSMark Fasheh "Inode %llu has bad count in extent list " 1155dcd0538fSMark Fasheh "at block %llu (next free=%u, count=%u)\n", 1156dcd0538fSMark Fasheh (unsigned long long)oi->ip_blkno, 1157dcd0538fSMark Fasheh (unsigned long long)bh->b_blocknr, 1158dcd0538fSMark Fasheh le16_to_cpu(el->l_next_free_rec), 1159dcd0538fSMark Fasheh le16_to_cpu(el->l_count)); 1160dcd0538fSMark Fasheh ret = -EROFS; 1161dcd0538fSMark Fasheh goto out; 1162dcd0538fSMark Fasheh } 1163dcd0538fSMark Fasheh 1164dcd0538fSMark Fasheh if (func) 1165dcd0538fSMark Fasheh func(data, bh); 1166dcd0538fSMark Fasheh } 1167dcd0538fSMark Fasheh 1168dcd0538fSMark Fasheh out: 1169dcd0538fSMark Fasheh /* 1170dcd0538fSMark Fasheh * Catch any trailing bh that the loop didn't handle. 1171dcd0538fSMark Fasheh */ 1172dcd0538fSMark Fasheh brelse(bh); 1173dcd0538fSMark Fasheh 1174dcd0538fSMark Fasheh return ret; 1175dcd0538fSMark Fasheh } 1176dcd0538fSMark Fasheh 1177dcd0538fSMark Fasheh /* 1178dcd0538fSMark Fasheh * Given an initialized path (that is, it has a valid root extent 1179dcd0538fSMark Fasheh * list), this function will traverse the btree in search of the path 1180dcd0538fSMark Fasheh * which would contain cpos. 1181dcd0538fSMark Fasheh * 1182dcd0538fSMark Fasheh * The path traveled is recorded in the path structure. 1183dcd0538fSMark Fasheh * 1184dcd0538fSMark Fasheh * Note that this will not do any comparisons on leaf node extent 1185dcd0538fSMark Fasheh * records, so it will work fine in the case that we just added a tree 1186dcd0538fSMark Fasheh * branch. 1187dcd0538fSMark Fasheh */ 1188dcd0538fSMark Fasheh struct find_path_data { 1189dcd0538fSMark Fasheh int index; 1190dcd0538fSMark Fasheh struct ocfs2_path *path; 1191dcd0538fSMark Fasheh }; 1192dcd0538fSMark Fasheh static void find_path_ins(void *data, struct buffer_head *bh) 1193dcd0538fSMark Fasheh { 1194dcd0538fSMark Fasheh struct find_path_data *fp = data; 1195dcd0538fSMark Fasheh 1196dcd0538fSMark Fasheh get_bh(bh); 1197dcd0538fSMark Fasheh ocfs2_path_insert_eb(fp->path, fp->index, bh); 1198dcd0538fSMark Fasheh fp->index++; 1199dcd0538fSMark Fasheh } 1200dcd0538fSMark Fasheh static int ocfs2_find_path(struct inode *inode, struct ocfs2_path *path, 1201dcd0538fSMark Fasheh u32 cpos) 1202dcd0538fSMark Fasheh { 1203dcd0538fSMark Fasheh struct find_path_data data; 1204dcd0538fSMark Fasheh 1205dcd0538fSMark Fasheh data.index = 1; 1206dcd0538fSMark Fasheh data.path = path; 1207dcd0538fSMark Fasheh return __ocfs2_find_path(inode, path_root_el(path), cpos, 1208dcd0538fSMark Fasheh find_path_ins, &data); 1209dcd0538fSMark Fasheh } 1210dcd0538fSMark Fasheh 1211dcd0538fSMark Fasheh static void find_leaf_ins(void *data, struct buffer_head *bh) 1212dcd0538fSMark Fasheh { 1213dcd0538fSMark Fasheh struct ocfs2_extent_block *eb =(struct ocfs2_extent_block *)bh->b_data; 1214dcd0538fSMark Fasheh struct ocfs2_extent_list *el = &eb->h_list; 1215dcd0538fSMark Fasheh struct buffer_head **ret = data; 1216dcd0538fSMark Fasheh 1217dcd0538fSMark Fasheh /* We want to retain only the leaf block. */ 1218dcd0538fSMark Fasheh if (le16_to_cpu(el->l_tree_depth) == 0) { 1219dcd0538fSMark Fasheh get_bh(bh); 1220dcd0538fSMark Fasheh *ret = bh; 1221dcd0538fSMark Fasheh } 1222dcd0538fSMark Fasheh } 1223dcd0538fSMark Fasheh /* 1224dcd0538fSMark Fasheh * Find the leaf block in the tree which would contain cpos. No 1225dcd0538fSMark Fasheh * checking of the actual leaf is done. 1226dcd0538fSMark Fasheh * 1227dcd0538fSMark Fasheh * Some paths want to call this instead of allocating a path structure 1228dcd0538fSMark Fasheh * and calling ocfs2_find_path(). 1229dcd0538fSMark Fasheh * 1230dcd0538fSMark Fasheh * This function doesn't handle non btree extent lists. 1231dcd0538fSMark Fasheh */ 1232363041a5SMark Fasheh int ocfs2_find_leaf(struct inode *inode, struct ocfs2_extent_list *root_el, 1233363041a5SMark Fasheh u32 cpos, struct buffer_head **leaf_bh) 1234dcd0538fSMark Fasheh { 1235dcd0538fSMark Fasheh int ret; 1236dcd0538fSMark Fasheh struct buffer_head *bh = NULL; 1237dcd0538fSMark Fasheh 1238dcd0538fSMark Fasheh ret = __ocfs2_find_path(inode, root_el, cpos, find_leaf_ins, &bh); 1239dcd0538fSMark Fasheh if (ret) { 1240dcd0538fSMark Fasheh mlog_errno(ret); 1241dcd0538fSMark Fasheh goto out; 1242dcd0538fSMark Fasheh } 1243dcd0538fSMark Fasheh 1244dcd0538fSMark Fasheh *leaf_bh = bh; 1245dcd0538fSMark Fasheh out: 1246dcd0538fSMark Fasheh return ret; 1247dcd0538fSMark Fasheh } 1248dcd0538fSMark Fasheh 1249dcd0538fSMark Fasheh /* 1250dcd0538fSMark Fasheh * Adjust the adjacent records (left_rec, right_rec) involved in a rotation. 1251dcd0538fSMark Fasheh * 1252dcd0538fSMark Fasheh * Basically, we've moved stuff around at the bottom of the tree and 1253dcd0538fSMark Fasheh * we need to fix up the extent records above the changes to reflect 1254dcd0538fSMark Fasheh * the new changes. 1255dcd0538fSMark Fasheh * 1256dcd0538fSMark Fasheh * left_rec: the record on the left. 1257dcd0538fSMark Fasheh * left_child_el: is the child list pointed to by left_rec 1258dcd0538fSMark Fasheh * right_rec: the record to the right of left_rec 1259dcd0538fSMark Fasheh * right_child_el: is the child list pointed to by right_rec 1260dcd0538fSMark Fasheh * 1261dcd0538fSMark Fasheh * By definition, this only works on interior nodes. 1262dcd0538fSMark Fasheh */ 1263dcd0538fSMark Fasheh static void ocfs2_adjust_adjacent_records(struct ocfs2_extent_rec *left_rec, 1264dcd0538fSMark Fasheh struct ocfs2_extent_list *left_child_el, 1265dcd0538fSMark Fasheh struct ocfs2_extent_rec *right_rec, 1266dcd0538fSMark Fasheh struct ocfs2_extent_list *right_child_el) 1267dcd0538fSMark Fasheh { 1268dcd0538fSMark Fasheh u32 left_clusters, right_end; 1269dcd0538fSMark Fasheh 1270dcd0538fSMark Fasheh /* 1271dcd0538fSMark Fasheh * Interior nodes never have holes. Their cpos is the cpos of 1272dcd0538fSMark Fasheh * the leftmost record in their child list. Their cluster 1273dcd0538fSMark Fasheh * count covers the full theoretical range of their child list 1274dcd0538fSMark Fasheh * - the range between their cpos and the cpos of the record 1275dcd0538fSMark Fasheh * immediately to their right. 1276dcd0538fSMark Fasheh */ 1277dcd0538fSMark Fasheh left_clusters = le32_to_cpu(right_child_el->l_recs[0].e_cpos); 1278dcd0538fSMark Fasheh left_clusters -= le32_to_cpu(left_rec->e_cpos); 1279e48edee2SMark Fasheh left_rec->e_int_clusters = cpu_to_le32(left_clusters); 1280dcd0538fSMark Fasheh 1281dcd0538fSMark Fasheh /* 1282dcd0538fSMark Fasheh * Calculate the rightmost cluster count boundary before 1283e48edee2SMark Fasheh * moving cpos - we will need to adjust clusters after 1284dcd0538fSMark Fasheh * updating e_cpos to keep the same highest cluster count. 1285dcd0538fSMark Fasheh */ 1286dcd0538fSMark Fasheh right_end = le32_to_cpu(right_rec->e_cpos); 1287e48edee2SMark Fasheh right_end += le32_to_cpu(right_rec->e_int_clusters); 1288dcd0538fSMark Fasheh 1289dcd0538fSMark Fasheh right_rec->e_cpos = left_rec->e_cpos; 1290dcd0538fSMark Fasheh le32_add_cpu(&right_rec->e_cpos, left_clusters); 1291dcd0538fSMark Fasheh 1292dcd0538fSMark Fasheh right_end -= le32_to_cpu(right_rec->e_cpos); 1293e48edee2SMark Fasheh right_rec->e_int_clusters = cpu_to_le32(right_end); 1294dcd0538fSMark Fasheh } 1295dcd0538fSMark Fasheh 1296dcd0538fSMark Fasheh /* 1297dcd0538fSMark Fasheh * Adjust the adjacent root node records involved in a 1298dcd0538fSMark Fasheh * rotation. left_el_blkno is passed in as a key so that we can easily 1299dcd0538fSMark Fasheh * find it's index in the root list. 1300dcd0538fSMark Fasheh */ 1301dcd0538fSMark Fasheh static void ocfs2_adjust_root_records(struct ocfs2_extent_list *root_el, 1302dcd0538fSMark Fasheh struct ocfs2_extent_list *left_el, 1303dcd0538fSMark Fasheh struct ocfs2_extent_list *right_el, 1304dcd0538fSMark Fasheh u64 left_el_blkno) 1305dcd0538fSMark Fasheh { 1306dcd0538fSMark Fasheh int i; 1307dcd0538fSMark Fasheh 1308dcd0538fSMark Fasheh BUG_ON(le16_to_cpu(root_el->l_tree_depth) <= 1309dcd0538fSMark Fasheh le16_to_cpu(left_el->l_tree_depth)); 1310dcd0538fSMark Fasheh 1311dcd0538fSMark Fasheh for(i = 0; i < le16_to_cpu(root_el->l_next_free_rec) - 1; i++) { 1312dcd0538fSMark Fasheh if (le64_to_cpu(root_el->l_recs[i].e_blkno) == left_el_blkno) 1313dcd0538fSMark Fasheh break; 1314dcd0538fSMark Fasheh } 1315dcd0538fSMark Fasheh 1316dcd0538fSMark Fasheh /* 1317dcd0538fSMark Fasheh * The path walking code should have never returned a root and 1318dcd0538fSMark Fasheh * two paths which are not adjacent. 1319dcd0538fSMark Fasheh */ 1320dcd0538fSMark Fasheh BUG_ON(i >= (le16_to_cpu(root_el->l_next_free_rec) - 1)); 1321dcd0538fSMark Fasheh 1322dcd0538fSMark Fasheh ocfs2_adjust_adjacent_records(&root_el->l_recs[i], left_el, 1323dcd0538fSMark Fasheh &root_el->l_recs[i + 1], right_el); 1324dcd0538fSMark Fasheh } 1325dcd0538fSMark Fasheh 1326dcd0538fSMark Fasheh /* 1327dcd0538fSMark Fasheh * We've changed a leaf block (in right_path) and need to reflect that 1328dcd0538fSMark Fasheh * change back up the subtree. 1329dcd0538fSMark Fasheh * 1330dcd0538fSMark Fasheh * This happens in multiple places: 1331dcd0538fSMark Fasheh * - When we've moved an extent record from the left path leaf to the right 1332dcd0538fSMark Fasheh * path leaf to make room for an empty extent in the left path leaf. 1333dcd0538fSMark Fasheh * - When our insert into the right path leaf is at the leftmost edge 1334dcd0538fSMark Fasheh * and requires an update of the path immediately to it's left. This 1335dcd0538fSMark Fasheh * can occur at the end of some types of rotation and appending inserts. 1336dcd0538fSMark Fasheh */ 1337dcd0538fSMark Fasheh static void ocfs2_complete_edge_insert(struct inode *inode, handle_t *handle, 1338dcd0538fSMark Fasheh struct ocfs2_path *left_path, 1339dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1340dcd0538fSMark Fasheh int subtree_index) 1341dcd0538fSMark Fasheh { 1342dcd0538fSMark Fasheh int ret, i, idx; 1343dcd0538fSMark Fasheh struct ocfs2_extent_list *el, *left_el, *right_el; 1344dcd0538fSMark Fasheh struct ocfs2_extent_rec *left_rec, *right_rec; 1345dcd0538fSMark Fasheh struct buffer_head *root_bh = left_path->p_node[subtree_index].bh; 1346dcd0538fSMark Fasheh 1347dcd0538fSMark Fasheh /* 1348dcd0538fSMark Fasheh * Update the counts and position values within all the 1349dcd0538fSMark Fasheh * interior nodes to reflect the leaf rotation we just did. 1350dcd0538fSMark Fasheh * 1351dcd0538fSMark Fasheh * The root node is handled below the loop. 1352dcd0538fSMark Fasheh * 1353dcd0538fSMark Fasheh * We begin the loop with right_el and left_el pointing to the 1354dcd0538fSMark Fasheh * leaf lists and work our way up. 1355dcd0538fSMark Fasheh * 1356dcd0538fSMark Fasheh * NOTE: within this loop, left_el and right_el always refer 1357dcd0538fSMark Fasheh * to the *child* lists. 1358dcd0538fSMark Fasheh */ 1359dcd0538fSMark Fasheh left_el = path_leaf_el(left_path); 1360dcd0538fSMark Fasheh right_el = path_leaf_el(right_path); 1361dcd0538fSMark Fasheh for(i = left_path->p_tree_depth - 1; i > subtree_index; i--) { 1362dcd0538fSMark Fasheh mlog(0, "Adjust records at index %u\n", i); 1363dcd0538fSMark Fasheh 1364dcd0538fSMark Fasheh /* 1365dcd0538fSMark Fasheh * One nice property of knowing that all of these 1366dcd0538fSMark Fasheh * nodes are below the root is that we only deal with 1367dcd0538fSMark Fasheh * the leftmost right node record and the rightmost 1368dcd0538fSMark Fasheh * left node record. 1369dcd0538fSMark Fasheh */ 1370dcd0538fSMark Fasheh el = left_path->p_node[i].el; 1371dcd0538fSMark Fasheh idx = le16_to_cpu(left_el->l_next_free_rec) - 1; 1372dcd0538fSMark Fasheh left_rec = &el->l_recs[idx]; 1373dcd0538fSMark Fasheh 1374dcd0538fSMark Fasheh el = right_path->p_node[i].el; 1375dcd0538fSMark Fasheh right_rec = &el->l_recs[0]; 1376dcd0538fSMark Fasheh 1377dcd0538fSMark Fasheh ocfs2_adjust_adjacent_records(left_rec, left_el, right_rec, 1378dcd0538fSMark Fasheh right_el); 1379dcd0538fSMark Fasheh 1380dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, left_path->p_node[i].bh); 1381dcd0538fSMark Fasheh if (ret) 1382dcd0538fSMark Fasheh mlog_errno(ret); 1383dcd0538fSMark Fasheh 1384dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, right_path->p_node[i].bh); 1385dcd0538fSMark Fasheh if (ret) 1386dcd0538fSMark Fasheh mlog_errno(ret); 1387dcd0538fSMark Fasheh 1388dcd0538fSMark Fasheh /* 1389dcd0538fSMark Fasheh * Setup our list pointers now so that the current 1390dcd0538fSMark Fasheh * parents become children in the next iteration. 1391dcd0538fSMark Fasheh */ 1392dcd0538fSMark Fasheh left_el = left_path->p_node[i].el; 1393dcd0538fSMark Fasheh right_el = right_path->p_node[i].el; 1394dcd0538fSMark Fasheh } 1395dcd0538fSMark Fasheh 1396dcd0538fSMark Fasheh /* 1397dcd0538fSMark Fasheh * At the root node, adjust the two adjacent records which 1398dcd0538fSMark Fasheh * begin our path to the leaves. 1399dcd0538fSMark Fasheh */ 1400dcd0538fSMark Fasheh 1401dcd0538fSMark Fasheh el = left_path->p_node[subtree_index].el; 1402dcd0538fSMark Fasheh left_el = left_path->p_node[subtree_index + 1].el; 1403dcd0538fSMark Fasheh right_el = right_path->p_node[subtree_index + 1].el; 1404dcd0538fSMark Fasheh 1405dcd0538fSMark Fasheh ocfs2_adjust_root_records(el, left_el, right_el, 1406dcd0538fSMark Fasheh left_path->p_node[subtree_index + 1].bh->b_blocknr); 1407dcd0538fSMark Fasheh 1408dcd0538fSMark Fasheh root_bh = left_path->p_node[subtree_index].bh; 1409dcd0538fSMark Fasheh 1410dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, root_bh); 1411dcd0538fSMark Fasheh if (ret) 1412dcd0538fSMark Fasheh mlog_errno(ret); 1413dcd0538fSMark Fasheh } 1414dcd0538fSMark Fasheh 1415dcd0538fSMark Fasheh static int ocfs2_rotate_subtree_right(struct inode *inode, 1416dcd0538fSMark Fasheh handle_t *handle, 1417dcd0538fSMark Fasheh struct ocfs2_path *left_path, 1418dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1419dcd0538fSMark Fasheh int subtree_index) 1420dcd0538fSMark Fasheh { 1421dcd0538fSMark Fasheh int ret, i; 1422dcd0538fSMark Fasheh struct buffer_head *right_leaf_bh; 1423dcd0538fSMark Fasheh struct buffer_head *left_leaf_bh = NULL; 1424dcd0538fSMark Fasheh struct buffer_head *root_bh; 1425dcd0538fSMark Fasheh struct ocfs2_extent_list *right_el, *left_el; 1426dcd0538fSMark Fasheh struct ocfs2_extent_rec move_rec; 1427dcd0538fSMark Fasheh 1428dcd0538fSMark Fasheh left_leaf_bh = path_leaf_bh(left_path); 1429dcd0538fSMark Fasheh left_el = path_leaf_el(left_path); 1430dcd0538fSMark Fasheh 1431dcd0538fSMark Fasheh if (left_el->l_next_free_rec != left_el->l_count) { 1432dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1433dcd0538fSMark Fasheh "Inode %llu has non-full interior leaf node %llu" 1434dcd0538fSMark Fasheh "(next free = %u)", 1435dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, 1436dcd0538fSMark Fasheh (unsigned long long)left_leaf_bh->b_blocknr, 1437dcd0538fSMark Fasheh le16_to_cpu(left_el->l_next_free_rec)); 1438dcd0538fSMark Fasheh return -EROFS; 1439dcd0538fSMark Fasheh } 1440dcd0538fSMark Fasheh 1441dcd0538fSMark Fasheh /* 1442dcd0538fSMark Fasheh * This extent block may already have an empty record, so we 1443dcd0538fSMark Fasheh * return early if so. 1444dcd0538fSMark Fasheh */ 1445dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&left_el->l_recs[0])) 1446dcd0538fSMark Fasheh return 0; 1447dcd0538fSMark Fasheh 1448dcd0538fSMark Fasheh root_bh = left_path->p_node[subtree_index].bh; 1449dcd0538fSMark Fasheh BUG_ON(root_bh != right_path->p_node[subtree_index].bh); 1450dcd0538fSMark Fasheh 1451dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, root_bh, 1452dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 1453dcd0538fSMark Fasheh if (ret) { 1454dcd0538fSMark Fasheh mlog_errno(ret); 1455dcd0538fSMark Fasheh goto out; 1456dcd0538fSMark Fasheh } 1457dcd0538fSMark Fasheh 1458dcd0538fSMark Fasheh for(i = subtree_index + 1; i < path_num_items(right_path); i++) { 1459dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, 1460dcd0538fSMark Fasheh right_path->p_node[i].bh, 1461dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 1462dcd0538fSMark Fasheh if (ret) { 1463dcd0538fSMark Fasheh mlog_errno(ret); 1464dcd0538fSMark Fasheh goto out; 1465dcd0538fSMark Fasheh } 1466dcd0538fSMark Fasheh 1467dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, 1468dcd0538fSMark Fasheh left_path->p_node[i].bh, 1469dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 1470dcd0538fSMark Fasheh if (ret) { 1471dcd0538fSMark Fasheh mlog_errno(ret); 1472dcd0538fSMark Fasheh goto out; 1473dcd0538fSMark Fasheh } 1474dcd0538fSMark Fasheh } 1475dcd0538fSMark Fasheh 1476dcd0538fSMark Fasheh right_leaf_bh = path_leaf_bh(right_path); 1477dcd0538fSMark Fasheh right_el = path_leaf_el(right_path); 1478dcd0538fSMark Fasheh 1479dcd0538fSMark Fasheh /* This is a code error, not a disk corruption. */ 1480dcd0538fSMark Fasheh mlog_bug_on_msg(!right_el->l_next_free_rec, "Inode %llu: Rotate fails " 1481dcd0538fSMark Fasheh "because rightmost leaf block %llu is empty\n", 1482dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, 1483dcd0538fSMark Fasheh (unsigned long long)right_leaf_bh->b_blocknr); 1484dcd0538fSMark Fasheh 1485dcd0538fSMark Fasheh ocfs2_create_empty_extent(right_el); 1486dcd0538fSMark Fasheh 1487dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, right_leaf_bh); 1488dcd0538fSMark Fasheh if (ret) { 1489dcd0538fSMark Fasheh mlog_errno(ret); 1490dcd0538fSMark Fasheh goto out; 1491dcd0538fSMark Fasheh } 1492dcd0538fSMark Fasheh 1493dcd0538fSMark Fasheh /* Do the copy now. */ 1494dcd0538fSMark Fasheh i = le16_to_cpu(left_el->l_next_free_rec) - 1; 1495dcd0538fSMark Fasheh move_rec = left_el->l_recs[i]; 1496dcd0538fSMark Fasheh right_el->l_recs[0] = move_rec; 1497dcd0538fSMark Fasheh 1498dcd0538fSMark Fasheh /* 1499dcd0538fSMark Fasheh * Clear out the record we just copied and shift everything 1500dcd0538fSMark Fasheh * over, leaving an empty extent in the left leaf. 1501dcd0538fSMark Fasheh * 1502dcd0538fSMark Fasheh * We temporarily subtract from next_free_rec so that the 1503dcd0538fSMark Fasheh * shift will lose the tail record (which is now defunct). 1504dcd0538fSMark Fasheh */ 1505dcd0538fSMark Fasheh le16_add_cpu(&left_el->l_next_free_rec, -1); 1506dcd0538fSMark Fasheh ocfs2_shift_records_right(left_el); 1507dcd0538fSMark Fasheh memset(&left_el->l_recs[0], 0, sizeof(struct ocfs2_extent_rec)); 1508dcd0538fSMark Fasheh le16_add_cpu(&left_el->l_next_free_rec, 1); 1509dcd0538fSMark Fasheh 1510dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, left_leaf_bh); 1511dcd0538fSMark Fasheh if (ret) { 1512dcd0538fSMark Fasheh mlog_errno(ret); 1513dcd0538fSMark Fasheh goto out; 1514dcd0538fSMark Fasheh } 1515dcd0538fSMark Fasheh 1516dcd0538fSMark Fasheh ocfs2_complete_edge_insert(inode, handle, left_path, right_path, 1517dcd0538fSMark Fasheh subtree_index); 1518dcd0538fSMark Fasheh 1519dcd0538fSMark Fasheh out: 1520dcd0538fSMark Fasheh return ret; 1521dcd0538fSMark Fasheh } 1522dcd0538fSMark Fasheh 1523dcd0538fSMark Fasheh /* 1524dcd0538fSMark Fasheh * Given a full path, determine what cpos value would return us a path 1525dcd0538fSMark Fasheh * containing the leaf immediately to the left of the current one. 1526dcd0538fSMark Fasheh * 1527dcd0538fSMark Fasheh * Will return zero if the path passed in is already the leftmost path. 1528dcd0538fSMark Fasheh */ 1529dcd0538fSMark Fasheh static int ocfs2_find_cpos_for_left_leaf(struct super_block *sb, 1530dcd0538fSMark Fasheh struct ocfs2_path *path, u32 *cpos) 1531dcd0538fSMark Fasheh { 1532dcd0538fSMark Fasheh int i, j, ret = 0; 1533dcd0538fSMark Fasheh u64 blkno; 1534dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 1535dcd0538fSMark Fasheh 1536e48edee2SMark Fasheh BUG_ON(path->p_tree_depth == 0); 1537e48edee2SMark Fasheh 1538dcd0538fSMark Fasheh *cpos = 0; 1539dcd0538fSMark Fasheh 1540dcd0538fSMark Fasheh blkno = path_leaf_bh(path)->b_blocknr; 1541dcd0538fSMark Fasheh 1542dcd0538fSMark Fasheh /* Start at the tree node just above the leaf and work our way up. */ 1543dcd0538fSMark Fasheh i = path->p_tree_depth - 1; 1544dcd0538fSMark Fasheh while (i >= 0) { 1545dcd0538fSMark Fasheh el = path->p_node[i].el; 1546dcd0538fSMark Fasheh 1547dcd0538fSMark Fasheh /* 1548dcd0538fSMark Fasheh * Find the extent record just before the one in our 1549dcd0538fSMark Fasheh * path. 1550dcd0538fSMark Fasheh */ 1551dcd0538fSMark Fasheh for(j = 0; j < le16_to_cpu(el->l_next_free_rec); j++) { 1552dcd0538fSMark Fasheh if (le64_to_cpu(el->l_recs[j].e_blkno) == blkno) { 1553dcd0538fSMark Fasheh if (j == 0) { 1554dcd0538fSMark Fasheh if (i == 0) { 1555dcd0538fSMark Fasheh /* 1556dcd0538fSMark Fasheh * We've determined that the 1557dcd0538fSMark Fasheh * path specified is already 1558dcd0538fSMark Fasheh * the leftmost one - return a 1559dcd0538fSMark Fasheh * cpos of zero. 1560dcd0538fSMark Fasheh */ 1561dcd0538fSMark Fasheh goto out; 1562dcd0538fSMark Fasheh } 1563dcd0538fSMark Fasheh /* 1564dcd0538fSMark Fasheh * The leftmost record points to our 1565dcd0538fSMark Fasheh * leaf - we need to travel up the 1566dcd0538fSMark Fasheh * tree one level. 1567dcd0538fSMark Fasheh */ 1568dcd0538fSMark Fasheh goto next_node; 1569dcd0538fSMark Fasheh } 1570dcd0538fSMark Fasheh 1571dcd0538fSMark Fasheh *cpos = le32_to_cpu(el->l_recs[j - 1].e_cpos); 1572e48edee2SMark Fasheh *cpos = *cpos + ocfs2_rec_clusters(el, 1573e48edee2SMark Fasheh &el->l_recs[j - 1]); 1574e48edee2SMark Fasheh *cpos = *cpos - 1; 1575dcd0538fSMark Fasheh goto out; 1576dcd0538fSMark Fasheh } 1577dcd0538fSMark Fasheh } 1578dcd0538fSMark Fasheh 1579dcd0538fSMark Fasheh /* 1580dcd0538fSMark Fasheh * If we got here, we never found a valid node where 1581dcd0538fSMark Fasheh * the tree indicated one should be. 1582dcd0538fSMark Fasheh */ 1583dcd0538fSMark Fasheh ocfs2_error(sb, 1584dcd0538fSMark Fasheh "Invalid extent tree at extent block %llu\n", 1585dcd0538fSMark Fasheh (unsigned long long)blkno); 1586dcd0538fSMark Fasheh ret = -EROFS; 1587dcd0538fSMark Fasheh goto out; 1588dcd0538fSMark Fasheh 1589dcd0538fSMark Fasheh next_node: 1590dcd0538fSMark Fasheh blkno = path->p_node[i].bh->b_blocknr; 1591dcd0538fSMark Fasheh i--; 1592dcd0538fSMark Fasheh } 1593dcd0538fSMark Fasheh 1594dcd0538fSMark Fasheh out: 1595dcd0538fSMark Fasheh return ret; 1596dcd0538fSMark Fasheh } 1597dcd0538fSMark Fasheh 1598dcd0538fSMark Fasheh static int ocfs2_extend_rotate_transaction(handle_t *handle, int subtree_depth, 1599dcd0538fSMark Fasheh struct ocfs2_path *path) 1600dcd0538fSMark Fasheh { 1601dcd0538fSMark Fasheh int credits = (path->p_tree_depth - subtree_depth) * 2 + 1; 1602dcd0538fSMark Fasheh 1603dcd0538fSMark Fasheh if (handle->h_buffer_credits < credits) 1604dcd0538fSMark Fasheh return ocfs2_extend_trans(handle, credits); 1605dcd0538fSMark Fasheh 1606dcd0538fSMark Fasheh return 0; 1607dcd0538fSMark Fasheh } 1608dcd0538fSMark Fasheh 1609dcd0538fSMark Fasheh /* 1610dcd0538fSMark Fasheh * Trap the case where we're inserting into the theoretical range past 1611dcd0538fSMark Fasheh * the _actual_ left leaf range. Otherwise, we'll rotate a record 1612dcd0538fSMark Fasheh * whose cpos is less than ours into the right leaf. 1613dcd0538fSMark Fasheh * 1614dcd0538fSMark Fasheh * It's only necessary to look at the rightmost record of the left 1615dcd0538fSMark Fasheh * leaf because the logic that calls us should ensure that the 1616dcd0538fSMark Fasheh * theoretical ranges in the path components above the leaves are 1617dcd0538fSMark Fasheh * correct. 1618dcd0538fSMark Fasheh */ 1619dcd0538fSMark Fasheh static int ocfs2_rotate_requires_path_adjustment(struct ocfs2_path *left_path, 1620dcd0538fSMark Fasheh u32 insert_cpos) 1621dcd0538fSMark Fasheh { 1622dcd0538fSMark Fasheh struct ocfs2_extent_list *left_el; 1623dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 1624dcd0538fSMark Fasheh int next_free; 1625dcd0538fSMark Fasheh 1626dcd0538fSMark Fasheh left_el = path_leaf_el(left_path); 1627dcd0538fSMark Fasheh next_free = le16_to_cpu(left_el->l_next_free_rec); 1628dcd0538fSMark Fasheh rec = &left_el->l_recs[next_free - 1]; 1629dcd0538fSMark Fasheh 1630dcd0538fSMark Fasheh if (insert_cpos > le32_to_cpu(rec->e_cpos)) 1631dcd0538fSMark Fasheh return 1; 1632dcd0538fSMark Fasheh return 0; 1633dcd0538fSMark Fasheh } 1634dcd0538fSMark Fasheh 1635dcd0538fSMark Fasheh /* 1636dcd0538fSMark Fasheh * Rotate all the records in a btree right one record, starting at insert_cpos. 1637dcd0538fSMark Fasheh * 1638dcd0538fSMark Fasheh * The path to the rightmost leaf should be passed in. 1639dcd0538fSMark Fasheh * 1640dcd0538fSMark Fasheh * The array is assumed to be large enough to hold an entire path (tree depth). 1641dcd0538fSMark Fasheh * 1642dcd0538fSMark Fasheh * Upon succesful return from this function: 1643dcd0538fSMark Fasheh * 1644dcd0538fSMark Fasheh * - The 'right_path' array will contain a path to the leaf block 1645dcd0538fSMark Fasheh * whose range contains e_cpos. 1646dcd0538fSMark Fasheh * - That leaf block will have a single empty extent in list index 0. 1647dcd0538fSMark Fasheh * - In the case that the rotation requires a post-insert update, 1648dcd0538fSMark Fasheh * *ret_left_path will contain a valid path which can be passed to 1649dcd0538fSMark Fasheh * ocfs2_insert_path(). 1650dcd0538fSMark Fasheh */ 1651dcd0538fSMark Fasheh static int ocfs2_rotate_tree_right(struct inode *inode, 1652dcd0538fSMark Fasheh handle_t *handle, 1653dcd0538fSMark Fasheh u32 insert_cpos, 1654dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1655dcd0538fSMark Fasheh struct ocfs2_path **ret_left_path) 1656dcd0538fSMark Fasheh { 1657dcd0538fSMark Fasheh int ret, start; 1658dcd0538fSMark Fasheh u32 cpos; 1659dcd0538fSMark Fasheh struct ocfs2_path *left_path = NULL; 1660dcd0538fSMark Fasheh 1661dcd0538fSMark Fasheh *ret_left_path = NULL; 1662dcd0538fSMark Fasheh 1663dcd0538fSMark Fasheh left_path = ocfs2_new_path(path_root_bh(right_path), 1664dcd0538fSMark Fasheh path_root_el(right_path)); 1665dcd0538fSMark Fasheh if (!left_path) { 1666dcd0538fSMark Fasheh ret = -ENOMEM; 1667dcd0538fSMark Fasheh mlog_errno(ret); 1668dcd0538fSMark Fasheh goto out; 1669dcd0538fSMark Fasheh } 1670dcd0538fSMark Fasheh 1671dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, &cpos); 1672dcd0538fSMark Fasheh if (ret) { 1673dcd0538fSMark Fasheh mlog_errno(ret); 1674dcd0538fSMark Fasheh goto out; 1675dcd0538fSMark Fasheh } 1676dcd0538fSMark Fasheh 1677dcd0538fSMark Fasheh mlog(0, "Insert: %u, first left path cpos: %u\n", insert_cpos, cpos); 1678dcd0538fSMark Fasheh 1679dcd0538fSMark Fasheh /* 1680dcd0538fSMark Fasheh * What we want to do here is: 1681dcd0538fSMark Fasheh * 1682dcd0538fSMark Fasheh * 1) Start with the rightmost path. 1683dcd0538fSMark Fasheh * 1684dcd0538fSMark Fasheh * 2) Determine a path to the leaf block directly to the left 1685dcd0538fSMark Fasheh * of that leaf. 1686dcd0538fSMark Fasheh * 1687dcd0538fSMark Fasheh * 3) Determine the 'subtree root' - the lowest level tree node 1688dcd0538fSMark Fasheh * which contains a path to both leaves. 1689dcd0538fSMark Fasheh * 1690dcd0538fSMark Fasheh * 4) Rotate the subtree. 1691dcd0538fSMark Fasheh * 1692dcd0538fSMark Fasheh * 5) Find the next subtree by considering the left path to be 1693dcd0538fSMark Fasheh * the new right path. 1694dcd0538fSMark Fasheh * 1695dcd0538fSMark Fasheh * The check at the top of this while loop also accepts 1696dcd0538fSMark Fasheh * insert_cpos == cpos because cpos is only a _theoretical_ 1697dcd0538fSMark Fasheh * value to get us the left path - insert_cpos might very well 1698dcd0538fSMark Fasheh * be filling that hole. 1699dcd0538fSMark Fasheh * 1700dcd0538fSMark Fasheh * Stop at a cpos of '0' because we either started at the 1701dcd0538fSMark Fasheh * leftmost branch (i.e., a tree with one branch and a 1702dcd0538fSMark Fasheh * rotation inside of it), or we've gone as far as we can in 1703dcd0538fSMark Fasheh * rotating subtrees. 1704dcd0538fSMark Fasheh */ 1705dcd0538fSMark Fasheh while (cpos && insert_cpos <= cpos) { 1706dcd0538fSMark Fasheh mlog(0, "Rotating a tree: ins. cpos: %u, left path cpos: %u\n", 1707dcd0538fSMark Fasheh insert_cpos, cpos); 1708dcd0538fSMark Fasheh 1709dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, left_path, cpos); 1710dcd0538fSMark Fasheh if (ret) { 1711dcd0538fSMark Fasheh mlog_errno(ret); 1712dcd0538fSMark Fasheh goto out; 1713dcd0538fSMark Fasheh } 1714dcd0538fSMark Fasheh 1715dcd0538fSMark Fasheh mlog_bug_on_msg(path_leaf_bh(left_path) == 1716dcd0538fSMark Fasheh path_leaf_bh(right_path), 1717dcd0538fSMark Fasheh "Inode %lu: error during insert of %u " 1718dcd0538fSMark Fasheh "(left path cpos %u) results in two identical " 1719dcd0538fSMark Fasheh "paths ending at %llu\n", 1720dcd0538fSMark Fasheh inode->i_ino, insert_cpos, cpos, 1721dcd0538fSMark Fasheh (unsigned long long) 1722dcd0538fSMark Fasheh path_leaf_bh(left_path)->b_blocknr); 1723dcd0538fSMark Fasheh 1724dcd0538fSMark Fasheh if (ocfs2_rotate_requires_path_adjustment(left_path, 1725dcd0538fSMark Fasheh insert_cpos)) { 1726dcd0538fSMark Fasheh mlog(0, "Path adjustment required\n"); 1727dcd0538fSMark Fasheh 1728dcd0538fSMark Fasheh /* 1729dcd0538fSMark Fasheh * We've rotated the tree as much as we 1730dcd0538fSMark Fasheh * should. The rest is up to 1731dcd0538fSMark Fasheh * ocfs2_insert_path() to complete, after the 1732dcd0538fSMark Fasheh * record insertion. We indicate this 1733dcd0538fSMark Fasheh * situation by returning the left path. 1734dcd0538fSMark Fasheh * 1735dcd0538fSMark Fasheh * The reason we don't adjust the records here 1736dcd0538fSMark Fasheh * before the record insert is that an error 1737dcd0538fSMark Fasheh * later might break the rule where a parent 1738dcd0538fSMark Fasheh * record e_cpos will reflect the actual 1739dcd0538fSMark Fasheh * e_cpos of the 1st nonempty record of the 1740dcd0538fSMark Fasheh * child list. 1741dcd0538fSMark Fasheh */ 1742dcd0538fSMark Fasheh *ret_left_path = left_path; 1743dcd0538fSMark Fasheh goto out_ret_path; 1744dcd0538fSMark Fasheh } 1745dcd0538fSMark Fasheh 1746dcd0538fSMark Fasheh start = ocfs2_find_subtree_root(inode, left_path, right_path); 1747dcd0538fSMark Fasheh 1748dcd0538fSMark Fasheh mlog(0, "Subtree root at index %d (blk %llu, depth %d)\n", 1749dcd0538fSMark Fasheh start, 1750dcd0538fSMark Fasheh (unsigned long long) right_path->p_node[start].bh->b_blocknr, 1751dcd0538fSMark Fasheh right_path->p_tree_depth); 1752dcd0538fSMark Fasheh 1753dcd0538fSMark Fasheh ret = ocfs2_extend_rotate_transaction(handle, start, 1754dcd0538fSMark Fasheh right_path); 1755dcd0538fSMark Fasheh if (ret) { 1756dcd0538fSMark Fasheh mlog_errno(ret); 1757dcd0538fSMark Fasheh goto out; 1758dcd0538fSMark Fasheh } 1759dcd0538fSMark Fasheh 1760dcd0538fSMark Fasheh ret = ocfs2_rotate_subtree_right(inode, handle, left_path, 1761dcd0538fSMark Fasheh right_path, start); 1762dcd0538fSMark Fasheh if (ret) { 1763dcd0538fSMark Fasheh mlog_errno(ret); 1764dcd0538fSMark Fasheh goto out; 1765dcd0538fSMark Fasheh } 1766dcd0538fSMark Fasheh 1767dcd0538fSMark Fasheh /* 1768dcd0538fSMark Fasheh * There is no need to re-read the next right path 1769dcd0538fSMark Fasheh * as we know that it'll be our current left 1770dcd0538fSMark Fasheh * path. Optimize by copying values instead. 1771dcd0538fSMark Fasheh */ 1772dcd0538fSMark Fasheh ocfs2_mv_path(right_path, left_path); 1773dcd0538fSMark Fasheh 1774dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 1775dcd0538fSMark Fasheh &cpos); 1776dcd0538fSMark Fasheh if (ret) { 1777dcd0538fSMark Fasheh mlog_errno(ret); 1778dcd0538fSMark Fasheh goto out; 1779dcd0538fSMark Fasheh } 1780dcd0538fSMark Fasheh } 1781dcd0538fSMark Fasheh 1782dcd0538fSMark Fasheh out: 1783dcd0538fSMark Fasheh ocfs2_free_path(left_path); 1784dcd0538fSMark Fasheh 1785dcd0538fSMark Fasheh out_ret_path: 1786dcd0538fSMark Fasheh return ret; 1787dcd0538fSMark Fasheh } 1788dcd0538fSMark Fasheh 1789dcd0538fSMark Fasheh /* 1790dcd0538fSMark Fasheh * Do the final bits of extent record insertion at the target leaf 1791dcd0538fSMark Fasheh * list. If this leaf is part of an allocation tree, it is assumed 1792dcd0538fSMark Fasheh * that the tree above has been prepared. 1793dcd0538fSMark Fasheh */ 1794dcd0538fSMark Fasheh static void ocfs2_insert_at_leaf(struct ocfs2_extent_rec *insert_rec, 1795dcd0538fSMark Fasheh struct ocfs2_extent_list *el, 1796dcd0538fSMark Fasheh struct ocfs2_insert_type *insert, 1797dcd0538fSMark Fasheh struct inode *inode) 1798dcd0538fSMark Fasheh { 1799dcd0538fSMark Fasheh int i = insert->ins_contig_index; 1800dcd0538fSMark Fasheh unsigned int range; 1801dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 1802dcd0538fSMark Fasheh 1803e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 1804dcd0538fSMark Fasheh 1805dcd0538fSMark Fasheh /* 1806dcd0538fSMark Fasheh * Contiguous insert - either left or right. 1807dcd0538fSMark Fasheh */ 1808dcd0538fSMark Fasheh if (insert->ins_contig != CONTIG_NONE) { 1809dcd0538fSMark Fasheh rec = &el->l_recs[i]; 1810dcd0538fSMark Fasheh if (insert->ins_contig == CONTIG_LEFT) { 1811dcd0538fSMark Fasheh rec->e_blkno = insert_rec->e_blkno; 1812dcd0538fSMark Fasheh rec->e_cpos = insert_rec->e_cpos; 1813dcd0538fSMark Fasheh } 1814e48edee2SMark Fasheh le16_add_cpu(&rec->e_leaf_clusters, 1815e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 1816dcd0538fSMark Fasheh return; 1817dcd0538fSMark Fasheh } 1818dcd0538fSMark Fasheh 1819dcd0538fSMark Fasheh /* 1820dcd0538fSMark Fasheh * Handle insert into an empty leaf. 1821dcd0538fSMark Fasheh */ 1822dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0 || 1823dcd0538fSMark Fasheh ((le16_to_cpu(el->l_next_free_rec) == 1) && 1824dcd0538fSMark Fasheh ocfs2_is_empty_extent(&el->l_recs[0]))) { 1825dcd0538fSMark Fasheh el->l_recs[0] = *insert_rec; 1826dcd0538fSMark Fasheh el->l_next_free_rec = cpu_to_le16(1); 1827dcd0538fSMark Fasheh return; 1828dcd0538fSMark Fasheh } 1829dcd0538fSMark Fasheh 1830dcd0538fSMark Fasheh /* 1831dcd0538fSMark Fasheh * Appending insert. 1832dcd0538fSMark Fasheh */ 1833dcd0538fSMark Fasheh if (insert->ins_appending == APPEND_TAIL) { 1834dcd0538fSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 1835dcd0538fSMark Fasheh rec = &el->l_recs[i]; 1836e48edee2SMark Fasheh range = le32_to_cpu(rec->e_cpos) 1837e48edee2SMark Fasheh + le16_to_cpu(rec->e_leaf_clusters); 1838dcd0538fSMark Fasheh BUG_ON(le32_to_cpu(insert_rec->e_cpos) < range); 1839dcd0538fSMark Fasheh 1840dcd0538fSMark Fasheh mlog_bug_on_msg(le16_to_cpu(el->l_next_free_rec) >= 1841dcd0538fSMark Fasheh le16_to_cpu(el->l_count), 1842dcd0538fSMark Fasheh "inode %lu, depth %u, count %u, next free %u, " 1843dcd0538fSMark Fasheh "rec.cpos %u, rec.clusters %u, " 1844dcd0538fSMark Fasheh "insert.cpos %u, insert.clusters %u\n", 1845dcd0538fSMark Fasheh inode->i_ino, 1846dcd0538fSMark Fasheh le16_to_cpu(el->l_tree_depth), 1847dcd0538fSMark Fasheh le16_to_cpu(el->l_count), 1848dcd0538fSMark Fasheh le16_to_cpu(el->l_next_free_rec), 1849dcd0538fSMark Fasheh le32_to_cpu(el->l_recs[i].e_cpos), 1850e48edee2SMark Fasheh le16_to_cpu(el->l_recs[i].e_leaf_clusters), 1851dcd0538fSMark Fasheh le32_to_cpu(insert_rec->e_cpos), 1852e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 1853dcd0538fSMark Fasheh i++; 1854dcd0538fSMark Fasheh el->l_recs[i] = *insert_rec; 1855dcd0538fSMark Fasheh le16_add_cpu(&el->l_next_free_rec, 1); 1856dcd0538fSMark Fasheh return; 1857dcd0538fSMark Fasheh } 1858dcd0538fSMark Fasheh 1859dcd0538fSMark Fasheh /* 1860dcd0538fSMark Fasheh * Ok, we have to rotate. 1861dcd0538fSMark Fasheh * 1862dcd0538fSMark Fasheh * At this point, it is safe to assume that inserting into an 1863dcd0538fSMark Fasheh * empty leaf and appending to a leaf have both been handled 1864dcd0538fSMark Fasheh * above. 1865dcd0538fSMark Fasheh * 1866dcd0538fSMark Fasheh * This leaf needs to have space, either by the empty 1st 1867dcd0538fSMark Fasheh * extent record, or by virtue of an l_next_rec < l_count. 1868dcd0538fSMark Fasheh */ 1869dcd0538fSMark Fasheh ocfs2_rotate_leaf(el, insert_rec); 1870dcd0538fSMark Fasheh } 1871dcd0538fSMark Fasheh 1872dcd0538fSMark Fasheh static inline void ocfs2_update_dinode_clusters(struct inode *inode, 1873dcd0538fSMark Fasheh struct ocfs2_dinode *di, 1874dcd0538fSMark Fasheh u32 clusters) 1875dcd0538fSMark Fasheh { 1876dcd0538fSMark Fasheh le32_add_cpu(&di->i_clusters, clusters); 1877dcd0538fSMark Fasheh spin_lock(&OCFS2_I(inode)->ip_lock); 1878dcd0538fSMark Fasheh OCFS2_I(inode)->ip_clusters = le32_to_cpu(di->i_clusters); 1879dcd0538fSMark Fasheh spin_unlock(&OCFS2_I(inode)->ip_lock); 1880dcd0538fSMark Fasheh } 1881dcd0538fSMark Fasheh 1882dcd0538fSMark Fasheh static int ocfs2_append_rec_to_path(struct inode *inode, handle_t *handle, 1883dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 1884dcd0538fSMark Fasheh struct ocfs2_path *right_path, 1885dcd0538fSMark Fasheh struct ocfs2_path **ret_left_path) 1886dcd0538fSMark Fasheh { 1887dcd0538fSMark Fasheh int ret, i, next_free; 1888dcd0538fSMark Fasheh struct buffer_head *bh; 1889dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 1890dcd0538fSMark Fasheh struct ocfs2_path *left_path = NULL; 1891dcd0538fSMark Fasheh 1892dcd0538fSMark Fasheh *ret_left_path = NULL; 1893dcd0538fSMark Fasheh 1894dcd0538fSMark Fasheh /* 1895e48edee2SMark Fasheh * This shouldn't happen for non-trees. The extent rec cluster 1896e48edee2SMark Fasheh * count manipulation below only works for interior nodes. 1897e48edee2SMark Fasheh */ 1898e48edee2SMark Fasheh BUG_ON(right_path->p_tree_depth == 0); 1899e48edee2SMark Fasheh 1900e48edee2SMark Fasheh /* 1901dcd0538fSMark Fasheh * If our appending insert is at the leftmost edge of a leaf, 1902dcd0538fSMark Fasheh * then we might need to update the rightmost records of the 1903dcd0538fSMark Fasheh * neighboring path. 1904dcd0538fSMark Fasheh */ 1905dcd0538fSMark Fasheh el = path_leaf_el(right_path); 1906dcd0538fSMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 1907dcd0538fSMark Fasheh if (next_free == 0 || 1908dcd0538fSMark Fasheh (next_free == 1 && ocfs2_is_empty_extent(&el->l_recs[0]))) { 1909dcd0538fSMark Fasheh u32 left_cpos; 1910dcd0538fSMark Fasheh 1911dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, right_path, 1912dcd0538fSMark Fasheh &left_cpos); 1913dcd0538fSMark Fasheh if (ret) { 1914dcd0538fSMark Fasheh mlog_errno(ret); 1915dcd0538fSMark Fasheh goto out; 1916dcd0538fSMark Fasheh } 1917dcd0538fSMark Fasheh 1918dcd0538fSMark Fasheh mlog(0, "Append may need a left path update. cpos: %u, " 1919dcd0538fSMark Fasheh "left_cpos: %u\n", le32_to_cpu(insert_rec->e_cpos), 1920dcd0538fSMark Fasheh left_cpos); 1921dcd0538fSMark Fasheh 1922dcd0538fSMark Fasheh /* 1923dcd0538fSMark Fasheh * No need to worry if the append is already in the 1924dcd0538fSMark Fasheh * leftmost leaf. 1925dcd0538fSMark Fasheh */ 1926dcd0538fSMark Fasheh if (left_cpos) { 1927dcd0538fSMark Fasheh left_path = ocfs2_new_path(path_root_bh(right_path), 1928dcd0538fSMark Fasheh path_root_el(right_path)); 1929dcd0538fSMark Fasheh if (!left_path) { 1930dcd0538fSMark Fasheh ret = -ENOMEM; 1931dcd0538fSMark Fasheh mlog_errno(ret); 1932dcd0538fSMark Fasheh goto out; 1933dcd0538fSMark Fasheh } 1934dcd0538fSMark Fasheh 1935dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, left_path, left_cpos); 1936dcd0538fSMark Fasheh if (ret) { 1937dcd0538fSMark Fasheh mlog_errno(ret); 1938dcd0538fSMark Fasheh goto out; 1939dcd0538fSMark Fasheh } 1940dcd0538fSMark Fasheh 1941dcd0538fSMark Fasheh /* 1942dcd0538fSMark Fasheh * ocfs2_insert_path() will pass the left_path to the 1943dcd0538fSMark Fasheh * journal for us. 1944dcd0538fSMark Fasheh */ 1945dcd0538fSMark Fasheh } 1946dcd0538fSMark Fasheh } 1947dcd0538fSMark Fasheh 1948dcd0538fSMark Fasheh ret = ocfs2_journal_access_path(inode, handle, right_path); 1949dcd0538fSMark Fasheh if (ret) { 1950dcd0538fSMark Fasheh mlog_errno(ret); 1951dcd0538fSMark Fasheh goto out; 1952dcd0538fSMark Fasheh } 1953dcd0538fSMark Fasheh 1954dcd0538fSMark Fasheh el = path_root_el(right_path); 1955dcd0538fSMark Fasheh bh = path_root_bh(right_path); 1956dcd0538fSMark Fasheh i = 0; 1957dcd0538fSMark Fasheh while (1) { 1958e48edee2SMark Fasheh struct ocfs2_extent_rec *rec; 1959e48edee2SMark Fasheh 1960dcd0538fSMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 1961dcd0538fSMark Fasheh if (next_free == 0) { 1962dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 1963dcd0538fSMark Fasheh "Dinode %llu has a bad extent list", 1964dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno); 1965dcd0538fSMark Fasheh ret = -EIO; 1966dcd0538fSMark Fasheh goto out; 1967dcd0538fSMark Fasheh } 1968dcd0538fSMark Fasheh 1969e48edee2SMark Fasheh rec = &el->l_recs[next_free - 1]; 1970e48edee2SMark Fasheh 1971e48edee2SMark Fasheh rec->e_int_clusters = insert_rec->e_cpos; 1972e48edee2SMark Fasheh le32_add_cpu(&rec->e_int_clusters, 1973e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 1974e48edee2SMark Fasheh le32_add_cpu(&rec->e_int_clusters, 1975e48edee2SMark Fasheh -le32_to_cpu(rec->e_cpos)); 1976dcd0538fSMark Fasheh 1977dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, bh); 1978dcd0538fSMark Fasheh if (ret) 1979dcd0538fSMark Fasheh mlog_errno(ret); 1980dcd0538fSMark Fasheh 1981e48edee2SMark Fasheh /* Don't touch the leaf node */ 1982dcd0538fSMark Fasheh if (++i >= right_path->p_tree_depth) 1983dcd0538fSMark Fasheh break; 1984dcd0538fSMark Fasheh 1985dcd0538fSMark Fasheh bh = right_path->p_node[i].bh; 1986dcd0538fSMark Fasheh el = right_path->p_node[i].el; 1987dcd0538fSMark Fasheh } 1988dcd0538fSMark Fasheh 1989dcd0538fSMark Fasheh *ret_left_path = left_path; 1990dcd0538fSMark Fasheh ret = 0; 1991dcd0538fSMark Fasheh out: 1992dcd0538fSMark Fasheh if (ret != 0) 1993dcd0538fSMark Fasheh ocfs2_free_path(left_path); 1994dcd0538fSMark Fasheh 1995dcd0538fSMark Fasheh return ret; 1996dcd0538fSMark Fasheh } 1997dcd0538fSMark Fasheh 1998dcd0538fSMark Fasheh /* 1999dcd0538fSMark Fasheh * This function only does inserts on an allocation b-tree. For dinode 2000dcd0538fSMark Fasheh * lists, ocfs2_insert_at_leaf() is called directly. 2001dcd0538fSMark Fasheh * 2002dcd0538fSMark Fasheh * right_path is the path we want to do the actual insert 2003dcd0538fSMark Fasheh * in. left_path should only be passed in if we need to update that 2004dcd0538fSMark Fasheh * portion of the tree after an edge insert. 2005dcd0538fSMark Fasheh */ 2006dcd0538fSMark Fasheh static int ocfs2_insert_path(struct inode *inode, 2007dcd0538fSMark Fasheh handle_t *handle, 2008dcd0538fSMark Fasheh struct ocfs2_path *left_path, 2009dcd0538fSMark Fasheh struct ocfs2_path *right_path, 2010dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 2011dcd0538fSMark Fasheh struct ocfs2_insert_type *insert) 2012dcd0538fSMark Fasheh { 2013dcd0538fSMark Fasheh int ret, subtree_index; 2014dcd0538fSMark Fasheh struct buffer_head *leaf_bh = path_leaf_bh(right_path); 2015dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 2016dcd0538fSMark Fasheh 2017dcd0538fSMark Fasheh /* 2018dcd0538fSMark Fasheh * Pass both paths to the journal. The majority of inserts 2019dcd0538fSMark Fasheh * will be touching all components anyway. 2020dcd0538fSMark Fasheh */ 2021dcd0538fSMark Fasheh ret = ocfs2_journal_access_path(inode, handle, right_path); 2022dcd0538fSMark Fasheh if (ret < 0) { 2023dcd0538fSMark Fasheh mlog_errno(ret); 2024dcd0538fSMark Fasheh goto out; 2025dcd0538fSMark Fasheh } 2026dcd0538fSMark Fasheh 2027dcd0538fSMark Fasheh if (left_path) { 2028dcd0538fSMark Fasheh int credits = handle->h_buffer_credits; 2029dcd0538fSMark Fasheh 2030dcd0538fSMark Fasheh /* 2031dcd0538fSMark Fasheh * There's a chance that left_path got passed back to 2032dcd0538fSMark Fasheh * us without being accounted for in the 2033dcd0538fSMark Fasheh * journal. Extend our transaction here to be sure we 2034dcd0538fSMark Fasheh * can change those blocks. 2035dcd0538fSMark Fasheh */ 2036dcd0538fSMark Fasheh credits += left_path->p_tree_depth; 2037dcd0538fSMark Fasheh 2038dcd0538fSMark Fasheh ret = ocfs2_extend_trans(handle, credits); 2039dcd0538fSMark Fasheh if (ret < 0) { 2040dcd0538fSMark Fasheh mlog_errno(ret); 2041dcd0538fSMark Fasheh goto out; 2042dcd0538fSMark Fasheh } 2043dcd0538fSMark Fasheh 2044dcd0538fSMark Fasheh ret = ocfs2_journal_access_path(inode, handle, left_path); 2045dcd0538fSMark Fasheh if (ret < 0) { 2046dcd0538fSMark Fasheh mlog_errno(ret); 2047dcd0538fSMark Fasheh goto out; 2048dcd0538fSMark Fasheh } 2049dcd0538fSMark Fasheh } 2050dcd0538fSMark Fasheh 2051dcd0538fSMark Fasheh el = path_leaf_el(right_path); 2052dcd0538fSMark Fasheh 2053dcd0538fSMark Fasheh ocfs2_insert_at_leaf(insert_rec, el, insert, inode); 2054dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, leaf_bh); 2055dcd0538fSMark Fasheh if (ret) 2056dcd0538fSMark Fasheh mlog_errno(ret); 2057dcd0538fSMark Fasheh 2058dcd0538fSMark Fasheh if (left_path) { 2059dcd0538fSMark Fasheh /* 2060dcd0538fSMark Fasheh * The rotate code has indicated that we need to fix 2061dcd0538fSMark Fasheh * up portions of the tree after the insert. 2062dcd0538fSMark Fasheh * 2063dcd0538fSMark Fasheh * XXX: Should we extend the transaction here? 2064dcd0538fSMark Fasheh */ 2065dcd0538fSMark Fasheh subtree_index = ocfs2_find_subtree_root(inode, left_path, 2066dcd0538fSMark Fasheh right_path); 2067dcd0538fSMark Fasheh ocfs2_complete_edge_insert(inode, handle, left_path, 2068dcd0538fSMark Fasheh right_path, subtree_index); 2069dcd0538fSMark Fasheh } 2070dcd0538fSMark Fasheh 2071dcd0538fSMark Fasheh ret = 0; 2072dcd0538fSMark Fasheh out: 2073dcd0538fSMark Fasheh return ret; 2074dcd0538fSMark Fasheh } 2075dcd0538fSMark Fasheh 2076dcd0538fSMark Fasheh static int ocfs2_do_insert_extent(struct inode *inode, 2077dcd0538fSMark Fasheh handle_t *handle, 2078dcd0538fSMark Fasheh struct buffer_head *di_bh, 2079dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 2080dcd0538fSMark Fasheh struct ocfs2_insert_type *type) 2081dcd0538fSMark Fasheh { 2082dcd0538fSMark Fasheh int ret, rotate = 0; 2083dcd0538fSMark Fasheh u32 cpos; 2084dcd0538fSMark Fasheh struct ocfs2_path *right_path = NULL; 2085dcd0538fSMark Fasheh struct ocfs2_path *left_path = NULL; 2086dcd0538fSMark Fasheh struct ocfs2_dinode *di; 2087dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 2088dcd0538fSMark Fasheh 2089dcd0538fSMark Fasheh di = (struct ocfs2_dinode *) di_bh->b_data; 2090dcd0538fSMark Fasheh el = &di->id2.i_list; 2091dcd0538fSMark Fasheh 2092dcd0538fSMark Fasheh ret = ocfs2_journal_access(handle, inode, di_bh, 2093dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 2094dcd0538fSMark Fasheh if (ret) { 2095dcd0538fSMark Fasheh mlog_errno(ret); 2096dcd0538fSMark Fasheh goto out; 2097dcd0538fSMark Fasheh } 2098dcd0538fSMark Fasheh 2099dcd0538fSMark Fasheh if (le16_to_cpu(el->l_tree_depth) == 0) { 2100dcd0538fSMark Fasheh ocfs2_insert_at_leaf(insert_rec, el, type, inode); 2101dcd0538fSMark Fasheh goto out_update_clusters; 2102dcd0538fSMark Fasheh } 2103dcd0538fSMark Fasheh 2104dcd0538fSMark Fasheh right_path = ocfs2_new_inode_path(di_bh); 2105dcd0538fSMark Fasheh if (!right_path) { 2106dcd0538fSMark Fasheh ret = -ENOMEM; 2107dcd0538fSMark Fasheh mlog_errno(ret); 2108dcd0538fSMark Fasheh goto out; 2109dcd0538fSMark Fasheh } 2110dcd0538fSMark Fasheh 2111dcd0538fSMark Fasheh /* 2112dcd0538fSMark Fasheh * Determine the path to start with. Rotations need the 2113dcd0538fSMark Fasheh * rightmost path, everything else can go directly to the 2114dcd0538fSMark Fasheh * target leaf. 2115dcd0538fSMark Fasheh */ 2116dcd0538fSMark Fasheh cpos = le32_to_cpu(insert_rec->e_cpos); 2117dcd0538fSMark Fasheh if (type->ins_appending == APPEND_NONE && 2118dcd0538fSMark Fasheh type->ins_contig == CONTIG_NONE) { 2119dcd0538fSMark Fasheh rotate = 1; 2120dcd0538fSMark Fasheh cpos = UINT_MAX; 2121dcd0538fSMark Fasheh } 2122dcd0538fSMark Fasheh 2123dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, right_path, cpos); 2124dcd0538fSMark Fasheh if (ret) { 2125dcd0538fSMark Fasheh mlog_errno(ret); 2126dcd0538fSMark Fasheh goto out; 2127dcd0538fSMark Fasheh } 2128dcd0538fSMark Fasheh 2129dcd0538fSMark Fasheh /* 2130dcd0538fSMark Fasheh * Rotations and appends need special treatment - they modify 2131dcd0538fSMark Fasheh * parts of the tree's above them. 2132dcd0538fSMark Fasheh * 2133dcd0538fSMark Fasheh * Both might pass back a path immediate to the left of the 2134dcd0538fSMark Fasheh * one being inserted to. This will be cause 2135dcd0538fSMark Fasheh * ocfs2_insert_path() to modify the rightmost records of 2136dcd0538fSMark Fasheh * left_path to account for an edge insert. 2137dcd0538fSMark Fasheh * 2138dcd0538fSMark Fasheh * XXX: When modifying this code, keep in mind that an insert 2139dcd0538fSMark Fasheh * can wind up skipping both of these two special cases... 2140dcd0538fSMark Fasheh */ 2141dcd0538fSMark Fasheh if (rotate) { 2142dcd0538fSMark Fasheh ret = ocfs2_rotate_tree_right(inode, handle, 2143dcd0538fSMark Fasheh le32_to_cpu(insert_rec->e_cpos), 2144dcd0538fSMark Fasheh right_path, &left_path); 2145dcd0538fSMark Fasheh if (ret) { 2146dcd0538fSMark Fasheh mlog_errno(ret); 2147dcd0538fSMark Fasheh goto out; 2148dcd0538fSMark Fasheh } 2149dcd0538fSMark Fasheh } else if (type->ins_appending == APPEND_TAIL 2150dcd0538fSMark Fasheh && type->ins_contig != CONTIG_LEFT) { 2151dcd0538fSMark Fasheh ret = ocfs2_append_rec_to_path(inode, handle, insert_rec, 2152dcd0538fSMark Fasheh right_path, &left_path); 2153dcd0538fSMark Fasheh if (ret) { 2154dcd0538fSMark Fasheh mlog_errno(ret); 2155dcd0538fSMark Fasheh goto out; 2156dcd0538fSMark Fasheh } 2157dcd0538fSMark Fasheh } 2158dcd0538fSMark Fasheh 2159dcd0538fSMark Fasheh ret = ocfs2_insert_path(inode, handle, left_path, right_path, 2160dcd0538fSMark Fasheh insert_rec, type); 2161dcd0538fSMark Fasheh if (ret) { 2162dcd0538fSMark Fasheh mlog_errno(ret); 2163dcd0538fSMark Fasheh goto out; 2164dcd0538fSMark Fasheh } 2165dcd0538fSMark Fasheh 2166dcd0538fSMark Fasheh out_update_clusters: 2167dcd0538fSMark Fasheh ocfs2_update_dinode_clusters(inode, di, 2168e48edee2SMark Fasheh le16_to_cpu(insert_rec->e_leaf_clusters)); 2169dcd0538fSMark Fasheh 2170dcd0538fSMark Fasheh ret = ocfs2_journal_dirty(handle, di_bh); 2171dcd0538fSMark Fasheh if (ret) 2172dcd0538fSMark Fasheh mlog_errno(ret); 2173dcd0538fSMark Fasheh 2174dcd0538fSMark Fasheh out: 2175dcd0538fSMark Fasheh ocfs2_free_path(left_path); 2176dcd0538fSMark Fasheh ocfs2_free_path(right_path); 2177dcd0538fSMark Fasheh 2178dcd0538fSMark Fasheh return ret; 2179dcd0538fSMark Fasheh } 2180dcd0538fSMark Fasheh 2181dcd0538fSMark Fasheh static void ocfs2_figure_contig_type(struct inode *inode, 2182dcd0538fSMark Fasheh struct ocfs2_insert_type *insert, 2183dcd0538fSMark Fasheh struct ocfs2_extent_list *el, 2184dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 2185dcd0538fSMark Fasheh { 2186dcd0538fSMark Fasheh int i; 2187dcd0538fSMark Fasheh enum ocfs2_contig_type contig_type = CONTIG_NONE; 2188dcd0538fSMark Fasheh 2189e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2190e48edee2SMark Fasheh 2191dcd0538fSMark Fasheh for(i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) { 2192dcd0538fSMark Fasheh contig_type = ocfs2_extent_contig(inode, &el->l_recs[i], 2193dcd0538fSMark Fasheh insert_rec); 2194dcd0538fSMark Fasheh if (contig_type != CONTIG_NONE) { 2195dcd0538fSMark Fasheh insert->ins_contig_index = i; 2196dcd0538fSMark Fasheh break; 2197dcd0538fSMark Fasheh } 2198dcd0538fSMark Fasheh } 2199dcd0538fSMark Fasheh insert->ins_contig = contig_type; 2200dcd0538fSMark Fasheh } 2201dcd0538fSMark Fasheh 2202dcd0538fSMark Fasheh /* 2203dcd0538fSMark Fasheh * This should only be called against the righmost leaf extent list. 2204dcd0538fSMark Fasheh * 2205dcd0538fSMark Fasheh * ocfs2_figure_appending_type() will figure out whether we'll have to 2206dcd0538fSMark Fasheh * insert at the tail of the rightmost leaf. 2207dcd0538fSMark Fasheh * 2208dcd0538fSMark Fasheh * This should also work against the dinode list for tree's with 0 2209dcd0538fSMark Fasheh * depth. If we consider the dinode list to be the rightmost leaf node 2210dcd0538fSMark Fasheh * then the logic here makes sense. 2211dcd0538fSMark Fasheh */ 2212dcd0538fSMark Fasheh static void ocfs2_figure_appending_type(struct ocfs2_insert_type *insert, 2213dcd0538fSMark Fasheh struct ocfs2_extent_list *el, 2214dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec) 2215dcd0538fSMark Fasheh { 2216dcd0538fSMark Fasheh int i; 2217dcd0538fSMark Fasheh u32 cpos = le32_to_cpu(insert_rec->e_cpos); 2218dcd0538fSMark Fasheh struct ocfs2_extent_rec *rec; 2219dcd0538fSMark Fasheh 2220dcd0538fSMark Fasheh insert->ins_appending = APPEND_NONE; 2221dcd0538fSMark Fasheh 2222e48edee2SMark Fasheh BUG_ON(le16_to_cpu(el->l_tree_depth) != 0); 2223dcd0538fSMark Fasheh 2224dcd0538fSMark Fasheh if (!el->l_next_free_rec) 2225dcd0538fSMark Fasheh goto set_tail_append; 2226dcd0538fSMark Fasheh 2227dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) { 2228dcd0538fSMark Fasheh /* Were all records empty? */ 2229dcd0538fSMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 1) 2230dcd0538fSMark Fasheh goto set_tail_append; 2231dcd0538fSMark Fasheh } 2232dcd0538fSMark Fasheh 2233dcd0538fSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 2234dcd0538fSMark Fasheh rec = &el->l_recs[i]; 2235dcd0538fSMark Fasheh 2236e48edee2SMark Fasheh if (cpos >= 2237e48edee2SMark Fasheh (le32_to_cpu(rec->e_cpos) + le16_to_cpu(rec->e_leaf_clusters))) 2238dcd0538fSMark Fasheh goto set_tail_append; 2239dcd0538fSMark Fasheh 2240dcd0538fSMark Fasheh return; 2241dcd0538fSMark Fasheh 2242dcd0538fSMark Fasheh set_tail_append: 2243dcd0538fSMark Fasheh insert->ins_appending = APPEND_TAIL; 2244dcd0538fSMark Fasheh } 2245dcd0538fSMark Fasheh 2246dcd0538fSMark Fasheh /* 2247dcd0538fSMark Fasheh * Helper function called at the begining of an insert. 2248dcd0538fSMark Fasheh * 2249dcd0538fSMark Fasheh * This computes a few things that are commonly used in the process of 2250dcd0538fSMark Fasheh * inserting into the btree: 2251dcd0538fSMark Fasheh * - Whether the new extent is contiguous with an existing one. 2252dcd0538fSMark Fasheh * - The current tree depth. 2253dcd0538fSMark Fasheh * - Whether the insert is an appending one. 2254dcd0538fSMark Fasheh * - The total # of free records in the tree. 2255dcd0538fSMark Fasheh * 2256dcd0538fSMark Fasheh * All of the information is stored on the ocfs2_insert_type 2257dcd0538fSMark Fasheh * structure. 2258dcd0538fSMark Fasheh */ 2259dcd0538fSMark Fasheh static int ocfs2_figure_insert_type(struct inode *inode, 2260dcd0538fSMark Fasheh struct buffer_head *di_bh, 2261dcd0538fSMark Fasheh struct buffer_head **last_eb_bh, 2262dcd0538fSMark Fasheh struct ocfs2_extent_rec *insert_rec, 2263dcd0538fSMark Fasheh struct ocfs2_insert_type *insert) 2264dcd0538fSMark Fasheh { 2265dcd0538fSMark Fasheh int ret; 2266dcd0538fSMark Fasheh struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data; 2267dcd0538fSMark Fasheh struct ocfs2_extent_block *eb; 2268dcd0538fSMark Fasheh struct ocfs2_extent_list *el; 2269dcd0538fSMark Fasheh struct ocfs2_path *path = NULL; 2270dcd0538fSMark Fasheh struct buffer_head *bh = NULL; 2271dcd0538fSMark Fasheh 2272dcd0538fSMark Fasheh el = &di->id2.i_list; 2273dcd0538fSMark Fasheh insert->ins_tree_depth = le16_to_cpu(el->l_tree_depth); 2274dcd0538fSMark Fasheh 2275dcd0538fSMark Fasheh if (el->l_tree_depth) { 2276dcd0538fSMark Fasheh /* 2277dcd0538fSMark Fasheh * If we have tree depth, we read in the 2278dcd0538fSMark Fasheh * rightmost extent block ahead of time as 2279dcd0538fSMark Fasheh * ocfs2_figure_insert_type() and ocfs2_add_branch() 2280dcd0538fSMark Fasheh * may want it later. 2281dcd0538fSMark Fasheh */ 2282dcd0538fSMark Fasheh ret = ocfs2_read_block(OCFS2_SB(inode->i_sb), 2283dcd0538fSMark Fasheh le64_to_cpu(di->i_last_eb_blk), &bh, 2284dcd0538fSMark Fasheh OCFS2_BH_CACHED, inode); 2285dcd0538fSMark Fasheh if (ret) { 2286dcd0538fSMark Fasheh mlog_exit(ret); 2287dcd0538fSMark Fasheh goto out; 2288dcd0538fSMark Fasheh } 2289dcd0538fSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 2290dcd0538fSMark Fasheh el = &eb->h_list; 2291dcd0538fSMark Fasheh } 2292dcd0538fSMark Fasheh 2293dcd0538fSMark Fasheh /* 2294dcd0538fSMark Fasheh * Unless we have a contiguous insert, we'll need to know if 2295dcd0538fSMark Fasheh * there is room left in our allocation tree for another 2296dcd0538fSMark Fasheh * extent record. 2297dcd0538fSMark Fasheh * 2298dcd0538fSMark Fasheh * XXX: This test is simplistic, we can search for empty 2299dcd0538fSMark Fasheh * extent records too. 2300dcd0538fSMark Fasheh */ 2301dcd0538fSMark Fasheh insert->ins_free_records = le16_to_cpu(el->l_count) - 2302dcd0538fSMark Fasheh le16_to_cpu(el->l_next_free_rec); 2303dcd0538fSMark Fasheh 2304dcd0538fSMark Fasheh if (!insert->ins_tree_depth) { 2305dcd0538fSMark Fasheh ocfs2_figure_contig_type(inode, insert, el, insert_rec); 2306dcd0538fSMark Fasheh ocfs2_figure_appending_type(insert, el, insert_rec); 2307dcd0538fSMark Fasheh return 0; 2308dcd0538fSMark Fasheh } 2309dcd0538fSMark Fasheh 2310dcd0538fSMark Fasheh path = ocfs2_new_inode_path(di_bh); 2311dcd0538fSMark Fasheh if (!path) { 2312dcd0538fSMark Fasheh ret = -ENOMEM; 2313dcd0538fSMark Fasheh mlog_errno(ret); 2314dcd0538fSMark Fasheh goto out; 2315dcd0538fSMark Fasheh } 2316dcd0538fSMark Fasheh 2317dcd0538fSMark Fasheh /* 2318dcd0538fSMark Fasheh * In the case that we're inserting past what the tree 2319dcd0538fSMark Fasheh * currently accounts for, ocfs2_find_path() will return for 2320dcd0538fSMark Fasheh * us the rightmost tree path. This is accounted for below in 2321dcd0538fSMark Fasheh * the appending code. 2322dcd0538fSMark Fasheh */ 2323dcd0538fSMark Fasheh ret = ocfs2_find_path(inode, path, le32_to_cpu(insert_rec->e_cpos)); 2324dcd0538fSMark Fasheh if (ret) { 2325dcd0538fSMark Fasheh mlog_errno(ret); 2326dcd0538fSMark Fasheh goto out; 2327dcd0538fSMark Fasheh } 2328dcd0538fSMark Fasheh 2329dcd0538fSMark Fasheh el = path_leaf_el(path); 2330dcd0538fSMark Fasheh 2331dcd0538fSMark Fasheh /* 2332dcd0538fSMark Fasheh * Now that we have the path, there's two things we want to determine: 2333dcd0538fSMark Fasheh * 1) Contiguousness (also set contig_index if this is so) 2334dcd0538fSMark Fasheh * 2335dcd0538fSMark Fasheh * 2) Are we doing an append? We can trivially break this up 2336dcd0538fSMark Fasheh * into two types of appends: simple record append, or a 2337dcd0538fSMark Fasheh * rotate inside the tail leaf. 2338dcd0538fSMark Fasheh */ 2339dcd0538fSMark Fasheh ocfs2_figure_contig_type(inode, insert, el, insert_rec); 2340dcd0538fSMark Fasheh 2341dcd0538fSMark Fasheh /* 2342dcd0538fSMark Fasheh * The insert code isn't quite ready to deal with all cases of 2343dcd0538fSMark Fasheh * left contiguousness. Specifically, if it's an insert into 2344dcd0538fSMark Fasheh * the 1st record in a leaf, it will require the adjustment of 2345e48edee2SMark Fasheh * cluster count on the last record of the path directly to it's 2346dcd0538fSMark Fasheh * left. For now, just catch that case and fool the layers 2347dcd0538fSMark Fasheh * above us. This works just fine for tree_depth == 0, which 2348dcd0538fSMark Fasheh * is why we allow that above. 2349dcd0538fSMark Fasheh */ 2350dcd0538fSMark Fasheh if (insert->ins_contig == CONTIG_LEFT && 2351dcd0538fSMark Fasheh insert->ins_contig_index == 0) 2352dcd0538fSMark Fasheh insert->ins_contig = CONTIG_NONE; 2353dcd0538fSMark Fasheh 2354dcd0538fSMark Fasheh /* 2355dcd0538fSMark Fasheh * Ok, so we can simply compare against last_eb to figure out 2356dcd0538fSMark Fasheh * whether the path doesn't exist. This will only happen in 2357dcd0538fSMark Fasheh * the case that we're doing a tail append, so maybe we can 2358dcd0538fSMark Fasheh * take advantage of that information somehow. 2359dcd0538fSMark Fasheh */ 2360dcd0538fSMark Fasheh if (le64_to_cpu(di->i_last_eb_blk) == path_leaf_bh(path)->b_blocknr) { 2361dcd0538fSMark Fasheh /* 2362dcd0538fSMark Fasheh * Ok, ocfs2_find_path() returned us the rightmost 2363dcd0538fSMark Fasheh * tree path. This might be an appending insert. There are 2364dcd0538fSMark Fasheh * two cases: 2365dcd0538fSMark Fasheh * 1) We're doing a true append at the tail: 2366dcd0538fSMark Fasheh * -This might even be off the end of the leaf 2367dcd0538fSMark Fasheh * 2) We're "appending" by rotating in the tail 2368dcd0538fSMark Fasheh */ 2369dcd0538fSMark Fasheh ocfs2_figure_appending_type(insert, el, insert_rec); 2370dcd0538fSMark Fasheh } 2371dcd0538fSMark Fasheh 2372dcd0538fSMark Fasheh out: 2373dcd0538fSMark Fasheh ocfs2_free_path(path); 2374dcd0538fSMark Fasheh 2375dcd0538fSMark Fasheh if (ret == 0) 2376dcd0538fSMark Fasheh *last_eb_bh = bh; 2377dcd0538fSMark Fasheh else 2378dcd0538fSMark Fasheh brelse(bh); 2379dcd0538fSMark Fasheh return ret; 2380dcd0538fSMark Fasheh } 2381dcd0538fSMark Fasheh 2382dcd0538fSMark Fasheh /* 2383dcd0538fSMark Fasheh * Insert an extent into an inode btree. 2384dcd0538fSMark Fasheh * 2385dcd0538fSMark Fasheh * The caller needs to update fe->i_clusters 2386dcd0538fSMark Fasheh */ 2387ccd979bdSMark Fasheh int ocfs2_insert_extent(struct ocfs2_super *osb, 23881fabe148SMark Fasheh handle_t *handle, 2389ccd979bdSMark Fasheh struct inode *inode, 2390ccd979bdSMark Fasheh struct buffer_head *fe_bh, 2391dcd0538fSMark Fasheh u32 cpos, 2392ccd979bdSMark Fasheh u64 start_blk, 2393ccd979bdSMark Fasheh u32 new_clusters, 2394ccd979bdSMark Fasheh struct ocfs2_alloc_context *meta_ac) 2395ccd979bdSMark Fasheh { 2396*c3afcbb3SMark Fasheh int status; 2397ccd979bdSMark Fasheh struct buffer_head *last_eb_bh = NULL; 2398ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 2399dcd0538fSMark Fasheh struct ocfs2_insert_type insert = {0, }; 2400dcd0538fSMark Fasheh struct ocfs2_extent_rec rec; 2401ccd979bdSMark Fasheh 2402dcd0538fSMark Fasheh mlog(0, "add %u clusters at position %u to inode %llu\n", 2403dcd0538fSMark Fasheh new_clusters, cpos, (unsigned long long)OCFS2_I(inode)->ip_blkno); 2404ccd979bdSMark Fasheh 2405dcd0538fSMark Fasheh mlog_bug_on_msg(!ocfs2_sparse_alloc(osb) && 2406dcd0538fSMark Fasheh (OCFS2_I(inode)->ip_clusters != cpos), 2407dcd0538fSMark Fasheh "Device %s, asking for sparse allocation: inode %llu, " 2408dcd0538fSMark Fasheh "cpos %u, clusters %u\n", 2409dcd0538fSMark Fasheh osb->dev_str, 2410dcd0538fSMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, cpos, 2411dcd0538fSMark Fasheh OCFS2_I(inode)->ip_clusters); 2412ccd979bdSMark Fasheh 2413e48edee2SMark Fasheh memset(&rec, 0, sizeof(rec)); 2414dcd0538fSMark Fasheh rec.e_cpos = cpu_to_le32(cpos); 2415dcd0538fSMark Fasheh rec.e_blkno = cpu_to_le64(start_blk); 2416e48edee2SMark Fasheh rec.e_leaf_clusters = cpu_to_le16(new_clusters); 2417ccd979bdSMark Fasheh 2418dcd0538fSMark Fasheh status = ocfs2_figure_insert_type(inode, fe_bh, &last_eb_bh, &rec, 2419dcd0538fSMark Fasheh &insert); 2420ccd979bdSMark Fasheh if (status < 0) { 2421dcd0538fSMark Fasheh mlog_errno(status); 2422ccd979bdSMark Fasheh goto bail; 2423ccd979bdSMark Fasheh } 2424ccd979bdSMark Fasheh 2425dcd0538fSMark Fasheh mlog(0, "Insert.appending: %u, Insert.Contig: %u, " 2426dcd0538fSMark Fasheh "Insert.contig_index: %d, Insert.free_records: %d, " 2427dcd0538fSMark Fasheh "Insert.tree_depth: %d\n", 2428dcd0538fSMark Fasheh insert.ins_appending, insert.ins_contig, insert.ins_contig_index, 2429dcd0538fSMark Fasheh insert.ins_free_records, insert.ins_tree_depth); 2430dcd0538fSMark Fasheh 2431*c3afcbb3SMark Fasheh if (insert.ins_contig == CONTIG_NONE && insert.ins_free_records == 0) { 2432*c3afcbb3SMark Fasheh status = ocfs2_grow_tree(inode, handle, fe_bh, 2433*c3afcbb3SMark Fasheh &insert.ins_tree_depth, last_eb_bh, 2434ccd979bdSMark Fasheh meta_ac); 2435*c3afcbb3SMark Fasheh if (status) { 2436ccd979bdSMark Fasheh mlog_errno(status); 2437ccd979bdSMark Fasheh goto bail; 2438ccd979bdSMark Fasheh } 2439*c3afcbb3SMark Fasheh } 2440ccd979bdSMark Fasheh 2441dcd0538fSMark Fasheh /* Finally, we can add clusters. This might rotate the tree for us. */ 2442dcd0538fSMark Fasheh status = ocfs2_do_insert_extent(inode, handle, fe_bh, &rec, &insert); 2443ccd979bdSMark Fasheh if (status < 0) 2444ccd979bdSMark Fasheh mlog_errno(status); 244583418978SMark Fasheh else 244683418978SMark Fasheh ocfs2_extent_map_insert_rec(inode, &rec); 2447ccd979bdSMark Fasheh 2448ccd979bdSMark Fasheh bail: 2449ccd979bdSMark Fasheh if (bh) 2450ccd979bdSMark Fasheh brelse(bh); 2451ccd979bdSMark Fasheh 2452ccd979bdSMark Fasheh if (last_eb_bh) 2453ccd979bdSMark Fasheh brelse(last_eb_bh); 2454ccd979bdSMark Fasheh 2455ccd979bdSMark Fasheh mlog_exit(status); 2456ccd979bdSMark Fasheh return status; 2457ccd979bdSMark Fasheh } 2458ccd979bdSMark Fasheh 2459ccd979bdSMark Fasheh static inline int ocfs2_truncate_log_needs_flush(struct ocfs2_super *osb) 2460ccd979bdSMark Fasheh { 2461ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2462ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2463ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2464ccd979bdSMark Fasheh 2465ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2466ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2467ccd979bdSMark Fasheh 2468ccd979bdSMark Fasheh mlog_bug_on_msg(le16_to_cpu(tl->tl_used) > le16_to_cpu(tl->tl_count), 2469ccd979bdSMark Fasheh "slot %d, invalid truncate log parameters: used = " 2470ccd979bdSMark Fasheh "%u, count = %u\n", osb->slot_num, 2471ccd979bdSMark Fasheh le16_to_cpu(tl->tl_used), le16_to_cpu(tl->tl_count)); 2472ccd979bdSMark Fasheh return le16_to_cpu(tl->tl_used) == le16_to_cpu(tl->tl_count); 2473ccd979bdSMark Fasheh } 2474ccd979bdSMark Fasheh 2475ccd979bdSMark Fasheh static int ocfs2_truncate_log_can_coalesce(struct ocfs2_truncate_log *tl, 2476ccd979bdSMark Fasheh unsigned int new_start) 2477ccd979bdSMark Fasheh { 2478ccd979bdSMark Fasheh unsigned int tail_index; 2479ccd979bdSMark Fasheh unsigned int current_tail; 2480ccd979bdSMark Fasheh 2481ccd979bdSMark Fasheh /* No records, nothing to coalesce */ 2482ccd979bdSMark Fasheh if (!le16_to_cpu(tl->tl_used)) 2483ccd979bdSMark Fasheh return 0; 2484ccd979bdSMark Fasheh 2485ccd979bdSMark Fasheh tail_index = le16_to_cpu(tl->tl_used) - 1; 2486ccd979bdSMark Fasheh current_tail = le32_to_cpu(tl->tl_recs[tail_index].t_start); 2487ccd979bdSMark Fasheh current_tail += le32_to_cpu(tl->tl_recs[tail_index].t_clusters); 2488ccd979bdSMark Fasheh 2489ccd979bdSMark Fasheh return current_tail == new_start; 2490ccd979bdSMark Fasheh } 2491ccd979bdSMark Fasheh 2492ccd979bdSMark Fasheh static int ocfs2_truncate_log_append(struct ocfs2_super *osb, 24931fabe148SMark Fasheh handle_t *handle, 2494ccd979bdSMark Fasheh u64 start_blk, 2495ccd979bdSMark Fasheh unsigned int num_clusters) 2496ccd979bdSMark Fasheh { 2497ccd979bdSMark Fasheh int status, index; 2498ccd979bdSMark Fasheh unsigned int start_cluster, tl_count; 2499ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2500ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2501ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2502ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2503ccd979bdSMark Fasheh 2504b0697053SMark Fasheh mlog_entry("start_blk = %llu, num_clusters = %u\n", 2505b0697053SMark Fasheh (unsigned long long)start_blk, num_clusters); 2506ccd979bdSMark Fasheh 25071b1dcc1bSJes Sorensen BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 2508ccd979bdSMark Fasheh 2509ccd979bdSMark Fasheh start_cluster = ocfs2_blocks_to_clusters(osb->sb, start_blk); 2510ccd979bdSMark Fasheh 2511ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2512ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2513ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(di)) { 2514ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); 2515ccd979bdSMark Fasheh status = -EIO; 2516ccd979bdSMark Fasheh goto bail; 2517ccd979bdSMark Fasheh } 2518ccd979bdSMark Fasheh 2519ccd979bdSMark Fasheh tl_count = le16_to_cpu(tl->tl_count); 2520ccd979bdSMark Fasheh mlog_bug_on_msg(tl_count > ocfs2_truncate_recs_per_inode(osb->sb) || 2521ccd979bdSMark Fasheh tl_count == 0, 2522b0697053SMark Fasheh "Truncate record count on #%llu invalid " 2523b0697053SMark Fasheh "wanted %u, actual %u\n", 2524b0697053SMark Fasheh (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, 2525ccd979bdSMark Fasheh ocfs2_truncate_recs_per_inode(osb->sb), 2526ccd979bdSMark Fasheh le16_to_cpu(tl->tl_count)); 2527ccd979bdSMark Fasheh 2528ccd979bdSMark Fasheh /* Caller should have known to flush before calling us. */ 2529ccd979bdSMark Fasheh index = le16_to_cpu(tl->tl_used); 2530ccd979bdSMark Fasheh if (index >= tl_count) { 2531ccd979bdSMark Fasheh status = -ENOSPC; 2532ccd979bdSMark Fasheh mlog_errno(status); 2533ccd979bdSMark Fasheh goto bail; 2534ccd979bdSMark Fasheh } 2535ccd979bdSMark Fasheh 2536ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, tl_inode, tl_bh, 2537ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 2538ccd979bdSMark Fasheh if (status < 0) { 2539ccd979bdSMark Fasheh mlog_errno(status); 2540ccd979bdSMark Fasheh goto bail; 2541ccd979bdSMark Fasheh } 2542ccd979bdSMark Fasheh 2543ccd979bdSMark Fasheh mlog(0, "Log truncate of %u clusters starting at cluster %u to " 2544b0697053SMark Fasheh "%llu (index = %d)\n", num_clusters, start_cluster, 2545b0697053SMark Fasheh (unsigned long long)OCFS2_I(tl_inode)->ip_blkno, index); 2546ccd979bdSMark Fasheh 2547ccd979bdSMark Fasheh if (ocfs2_truncate_log_can_coalesce(tl, start_cluster)) { 2548ccd979bdSMark Fasheh /* 2549ccd979bdSMark Fasheh * Move index back to the record we are coalescing with. 2550ccd979bdSMark Fasheh * ocfs2_truncate_log_can_coalesce() guarantees nonzero 2551ccd979bdSMark Fasheh */ 2552ccd979bdSMark Fasheh index--; 2553ccd979bdSMark Fasheh 2554ccd979bdSMark Fasheh num_clusters += le32_to_cpu(tl->tl_recs[index].t_clusters); 2555ccd979bdSMark Fasheh mlog(0, "Coalesce with index %u (start = %u, clusters = %u)\n", 2556ccd979bdSMark Fasheh index, le32_to_cpu(tl->tl_recs[index].t_start), 2557ccd979bdSMark Fasheh num_clusters); 2558ccd979bdSMark Fasheh } else { 2559ccd979bdSMark Fasheh tl->tl_recs[index].t_start = cpu_to_le32(start_cluster); 2560ccd979bdSMark Fasheh tl->tl_used = cpu_to_le16(index + 1); 2561ccd979bdSMark Fasheh } 2562ccd979bdSMark Fasheh tl->tl_recs[index].t_clusters = cpu_to_le32(num_clusters); 2563ccd979bdSMark Fasheh 2564ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, tl_bh); 2565ccd979bdSMark Fasheh if (status < 0) { 2566ccd979bdSMark Fasheh mlog_errno(status); 2567ccd979bdSMark Fasheh goto bail; 2568ccd979bdSMark Fasheh } 2569ccd979bdSMark Fasheh 2570ccd979bdSMark Fasheh bail: 2571ccd979bdSMark Fasheh mlog_exit(status); 2572ccd979bdSMark Fasheh return status; 2573ccd979bdSMark Fasheh } 2574ccd979bdSMark Fasheh 2575ccd979bdSMark Fasheh static int ocfs2_replay_truncate_records(struct ocfs2_super *osb, 25761fabe148SMark Fasheh handle_t *handle, 2577ccd979bdSMark Fasheh struct inode *data_alloc_inode, 2578ccd979bdSMark Fasheh struct buffer_head *data_alloc_bh) 2579ccd979bdSMark Fasheh { 2580ccd979bdSMark Fasheh int status = 0; 2581ccd979bdSMark Fasheh int i; 2582ccd979bdSMark Fasheh unsigned int num_clusters; 2583ccd979bdSMark Fasheh u64 start_blk; 2584ccd979bdSMark Fasheh struct ocfs2_truncate_rec rec; 2585ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2586ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2587ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2588ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2589ccd979bdSMark Fasheh 2590ccd979bdSMark Fasheh mlog_entry_void(); 2591ccd979bdSMark Fasheh 2592ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2593ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2594ccd979bdSMark Fasheh i = le16_to_cpu(tl->tl_used) - 1; 2595ccd979bdSMark Fasheh while (i >= 0) { 2596ccd979bdSMark Fasheh /* Caller has given us at least enough credits to 2597ccd979bdSMark Fasheh * update the truncate log dinode */ 2598ccd979bdSMark Fasheh status = ocfs2_journal_access(handle, tl_inode, tl_bh, 2599ccd979bdSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 2600ccd979bdSMark Fasheh if (status < 0) { 2601ccd979bdSMark Fasheh mlog_errno(status); 2602ccd979bdSMark Fasheh goto bail; 2603ccd979bdSMark Fasheh } 2604ccd979bdSMark Fasheh 2605ccd979bdSMark Fasheh tl->tl_used = cpu_to_le16(i); 2606ccd979bdSMark Fasheh 2607ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, tl_bh); 2608ccd979bdSMark Fasheh if (status < 0) { 2609ccd979bdSMark Fasheh mlog_errno(status); 2610ccd979bdSMark Fasheh goto bail; 2611ccd979bdSMark Fasheh } 2612ccd979bdSMark Fasheh 2613ccd979bdSMark Fasheh /* TODO: Perhaps we can calculate the bulk of the 2614ccd979bdSMark Fasheh * credits up front rather than extending like 2615ccd979bdSMark Fasheh * this. */ 2616ccd979bdSMark Fasheh status = ocfs2_extend_trans(handle, 2617ccd979bdSMark Fasheh OCFS2_TRUNCATE_LOG_FLUSH_ONE_REC); 2618ccd979bdSMark Fasheh if (status < 0) { 2619ccd979bdSMark Fasheh mlog_errno(status); 2620ccd979bdSMark Fasheh goto bail; 2621ccd979bdSMark Fasheh } 2622ccd979bdSMark Fasheh 2623ccd979bdSMark Fasheh rec = tl->tl_recs[i]; 2624ccd979bdSMark Fasheh start_blk = ocfs2_clusters_to_blocks(data_alloc_inode->i_sb, 2625ccd979bdSMark Fasheh le32_to_cpu(rec.t_start)); 2626ccd979bdSMark Fasheh num_clusters = le32_to_cpu(rec.t_clusters); 2627ccd979bdSMark Fasheh 2628ccd979bdSMark Fasheh /* if start_blk is not set, we ignore the record as 2629ccd979bdSMark Fasheh * invalid. */ 2630ccd979bdSMark Fasheh if (start_blk) { 2631ccd979bdSMark Fasheh mlog(0, "free record %d, start = %u, clusters = %u\n", 2632ccd979bdSMark Fasheh i, le32_to_cpu(rec.t_start), num_clusters); 2633ccd979bdSMark Fasheh 2634ccd979bdSMark Fasheh status = ocfs2_free_clusters(handle, data_alloc_inode, 2635ccd979bdSMark Fasheh data_alloc_bh, start_blk, 2636ccd979bdSMark Fasheh num_clusters); 2637ccd979bdSMark Fasheh if (status < 0) { 2638ccd979bdSMark Fasheh mlog_errno(status); 2639ccd979bdSMark Fasheh goto bail; 2640ccd979bdSMark Fasheh } 2641ccd979bdSMark Fasheh } 2642ccd979bdSMark Fasheh i--; 2643ccd979bdSMark Fasheh } 2644ccd979bdSMark Fasheh 2645ccd979bdSMark Fasheh bail: 2646ccd979bdSMark Fasheh mlog_exit(status); 2647ccd979bdSMark Fasheh return status; 2648ccd979bdSMark Fasheh } 2649ccd979bdSMark Fasheh 26501b1dcc1bSJes Sorensen /* Expects you to already be holding tl_inode->i_mutex */ 2651ccd979bdSMark Fasheh static int __ocfs2_flush_truncate_log(struct ocfs2_super *osb) 2652ccd979bdSMark Fasheh { 2653ccd979bdSMark Fasheh int status; 2654ccd979bdSMark Fasheh unsigned int num_to_flush; 26551fabe148SMark Fasheh handle_t *handle; 2656ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2657ccd979bdSMark Fasheh struct inode *data_alloc_inode = NULL; 2658ccd979bdSMark Fasheh struct buffer_head *tl_bh = osb->osb_tl_bh; 2659ccd979bdSMark Fasheh struct buffer_head *data_alloc_bh = NULL; 2660ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2661ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2662ccd979bdSMark Fasheh 2663ccd979bdSMark Fasheh mlog_entry_void(); 2664ccd979bdSMark Fasheh 26651b1dcc1bSJes Sorensen BUG_ON(mutex_trylock(&tl_inode->i_mutex)); 2666ccd979bdSMark Fasheh 2667ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2668ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2669ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(di)) { 2670ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(osb->sb, di); 2671ccd979bdSMark Fasheh status = -EIO; 2672e08dc8b9SMark Fasheh goto out; 2673ccd979bdSMark Fasheh } 2674ccd979bdSMark Fasheh 2675ccd979bdSMark Fasheh num_to_flush = le16_to_cpu(tl->tl_used); 2676b0697053SMark Fasheh mlog(0, "Flush %u records from truncate log #%llu\n", 2677b0697053SMark Fasheh num_to_flush, (unsigned long long)OCFS2_I(tl_inode)->ip_blkno); 2678ccd979bdSMark Fasheh if (!num_to_flush) { 2679ccd979bdSMark Fasheh status = 0; 2680e08dc8b9SMark Fasheh goto out; 2681ccd979bdSMark Fasheh } 2682ccd979bdSMark Fasheh 2683ccd979bdSMark Fasheh data_alloc_inode = ocfs2_get_system_file_inode(osb, 2684ccd979bdSMark Fasheh GLOBAL_BITMAP_SYSTEM_INODE, 2685ccd979bdSMark Fasheh OCFS2_INVALID_SLOT); 2686ccd979bdSMark Fasheh if (!data_alloc_inode) { 2687ccd979bdSMark Fasheh status = -EINVAL; 2688ccd979bdSMark Fasheh mlog(ML_ERROR, "Could not get bitmap inode!\n"); 2689e08dc8b9SMark Fasheh goto out; 2690ccd979bdSMark Fasheh } 2691ccd979bdSMark Fasheh 2692e08dc8b9SMark Fasheh mutex_lock(&data_alloc_inode->i_mutex); 2693e08dc8b9SMark Fasheh 26944bcec184SMark Fasheh status = ocfs2_meta_lock(data_alloc_inode, &data_alloc_bh, 1); 2695ccd979bdSMark Fasheh if (status < 0) { 2696ccd979bdSMark Fasheh mlog_errno(status); 2697e08dc8b9SMark Fasheh goto out_mutex; 2698ccd979bdSMark Fasheh } 2699ccd979bdSMark Fasheh 270065eff9ccSMark Fasheh handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 2701ccd979bdSMark Fasheh if (IS_ERR(handle)) { 2702ccd979bdSMark Fasheh status = PTR_ERR(handle); 2703ccd979bdSMark Fasheh mlog_errno(status); 2704e08dc8b9SMark Fasheh goto out_unlock; 2705ccd979bdSMark Fasheh } 2706ccd979bdSMark Fasheh 2707ccd979bdSMark Fasheh status = ocfs2_replay_truncate_records(osb, handle, data_alloc_inode, 2708ccd979bdSMark Fasheh data_alloc_bh); 2709e08dc8b9SMark Fasheh if (status < 0) 2710ccd979bdSMark Fasheh mlog_errno(status); 2711ccd979bdSMark Fasheh 271202dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 2713ccd979bdSMark Fasheh 2714e08dc8b9SMark Fasheh out_unlock: 2715e08dc8b9SMark Fasheh brelse(data_alloc_bh); 2716e08dc8b9SMark Fasheh ocfs2_meta_unlock(data_alloc_inode, 1); 2717e08dc8b9SMark Fasheh 2718e08dc8b9SMark Fasheh out_mutex: 2719e08dc8b9SMark Fasheh mutex_unlock(&data_alloc_inode->i_mutex); 2720ccd979bdSMark Fasheh iput(data_alloc_inode); 2721ccd979bdSMark Fasheh 2722e08dc8b9SMark Fasheh out: 2723ccd979bdSMark Fasheh mlog_exit(status); 2724ccd979bdSMark Fasheh return status; 2725ccd979bdSMark Fasheh } 2726ccd979bdSMark Fasheh 2727ccd979bdSMark Fasheh int ocfs2_flush_truncate_log(struct ocfs2_super *osb) 2728ccd979bdSMark Fasheh { 2729ccd979bdSMark Fasheh int status; 2730ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2731ccd979bdSMark Fasheh 27321b1dcc1bSJes Sorensen mutex_lock(&tl_inode->i_mutex); 2733ccd979bdSMark Fasheh status = __ocfs2_flush_truncate_log(osb); 27341b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 2735ccd979bdSMark Fasheh 2736ccd979bdSMark Fasheh return status; 2737ccd979bdSMark Fasheh } 2738ccd979bdSMark Fasheh 2739c4028958SDavid Howells static void ocfs2_truncate_log_worker(struct work_struct *work) 2740ccd979bdSMark Fasheh { 2741ccd979bdSMark Fasheh int status; 2742c4028958SDavid Howells struct ocfs2_super *osb = 2743c4028958SDavid Howells container_of(work, struct ocfs2_super, 2744c4028958SDavid Howells osb_truncate_log_wq.work); 2745ccd979bdSMark Fasheh 2746ccd979bdSMark Fasheh mlog_entry_void(); 2747ccd979bdSMark Fasheh 2748ccd979bdSMark Fasheh status = ocfs2_flush_truncate_log(osb); 2749ccd979bdSMark Fasheh if (status < 0) 2750ccd979bdSMark Fasheh mlog_errno(status); 2751ccd979bdSMark Fasheh 2752ccd979bdSMark Fasheh mlog_exit(status); 2753ccd979bdSMark Fasheh } 2754ccd979bdSMark Fasheh 2755ccd979bdSMark Fasheh #define OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL (2 * HZ) 2756ccd979bdSMark Fasheh void ocfs2_schedule_truncate_log_flush(struct ocfs2_super *osb, 2757ccd979bdSMark Fasheh int cancel) 2758ccd979bdSMark Fasheh { 2759ccd979bdSMark Fasheh if (osb->osb_tl_inode) { 2760ccd979bdSMark Fasheh /* We want to push off log flushes while truncates are 2761ccd979bdSMark Fasheh * still running. */ 2762ccd979bdSMark Fasheh if (cancel) 2763ccd979bdSMark Fasheh cancel_delayed_work(&osb->osb_truncate_log_wq); 2764ccd979bdSMark Fasheh 2765ccd979bdSMark Fasheh queue_delayed_work(ocfs2_wq, &osb->osb_truncate_log_wq, 2766ccd979bdSMark Fasheh OCFS2_TRUNCATE_LOG_FLUSH_INTERVAL); 2767ccd979bdSMark Fasheh } 2768ccd979bdSMark Fasheh } 2769ccd979bdSMark Fasheh 2770ccd979bdSMark Fasheh static int ocfs2_get_truncate_log_info(struct ocfs2_super *osb, 2771ccd979bdSMark Fasheh int slot_num, 2772ccd979bdSMark Fasheh struct inode **tl_inode, 2773ccd979bdSMark Fasheh struct buffer_head **tl_bh) 2774ccd979bdSMark Fasheh { 2775ccd979bdSMark Fasheh int status; 2776ccd979bdSMark Fasheh struct inode *inode = NULL; 2777ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 2778ccd979bdSMark Fasheh 2779ccd979bdSMark Fasheh inode = ocfs2_get_system_file_inode(osb, 2780ccd979bdSMark Fasheh TRUNCATE_LOG_SYSTEM_INODE, 2781ccd979bdSMark Fasheh slot_num); 2782ccd979bdSMark Fasheh if (!inode) { 2783ccd979bdSMark Fasheh status = -EINVAL; 2784ccd979bdSMark Fasheh mlog(ML_ERROR, "Could not get load truncate log inode!\n"); 2785ccd979bdSMark Fasheh goto bail; 2786ccd979bdSMark Fasheh } 2787ccd979bdSMark Fasheh 2788ccd979bdSMark Fasheh status = ocfs2_read_block(osb, OCFS2_I(inode)->ip_blkno, &bh, 2789ccd979bdSMark Fasheh OCFS2_BH_CACHED, inode); 2790ccd979bdSMark Fasheh if (status < 0) { 2791ccd979bdSMark Fasheh iput(inode); 2792ccd979bdSMark Fasheh mlog_errno(status); 2793ccd979bdSMark Fasheh goto bail; 2794ccd979bdSMark Fasheh } 2795ccd979bdSMark Fasheh 2796ccd979bdSMark Fasheh *tl_inode = inode; 2797ccd979bdSMark Fasheh *tl_bh = bh; 2798ccd979bdSMark Fasheh bail: 2799ccd979bdSMark Fasheh mlog_exit(status); 2800ccd979bdSMark Fasheh return status; 2801ccd979bdSMark Fasheh } 2802ccd979bdSMark Fasheh 2803ccd979bdSMark Fasheh /* called during the 1st stage of node recovery. we stamp a clean 2804ccd979bdSMark Fasheh * truncate log and pass back a copy for processing later. if the 2805ccd979bdSMark Fasheh * truncate log does not require processing, a *tl_copy is set to 2806ccd979bdSMark Fasheh * NULL. */ 2807ccd979bdSMark Fasheh int ocfs2_begin_truncate_log_recovery(struct ocfs2_super *osb, 2808ccd979bdSMark Fasheh int slot_num, 2809ccd979bdSMark Fasheh struct ocfs2_dinode **tl_copy) 2810ccd979bdSMark Fasheh { 2811ccd979bdSMark Fasheh int status; 2812ccd979bdSMark Fasheh struct inode *tl_inode = NULL; 2813ccd979bdSMark Fasheh struct buffer_head *tl_bh = NULL; 2814ccd979bdSMark Fasheh struct ocfs2_dinode *di; 2815ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2816ccd979bdSMark Fasheh 2817ccd979bdSMark Fasheh *tl_copy = NULL; 2818ccd979bdSMark Fasheh 2819ccd979bdSMark Fasheh mlog(0, "recover truncate log from slot %d\n", slot_num); 2820ccd979bdSMark Fasheh 2821ccd979bdSMark Fasheh status = ocfs2_get_truncate_log_info(osb, slot_num, &tl_inode, &tl_bh); 2822ccd979bdSMark Fasheh if (status < 0) { 2823ccd979bdSMark Fasheh mlog_errno(status); 2824ccd979bdSMark Fasheh goto bail; 2825ccd979bdSMark Fasheh } 2826ccd979bdSMark Fasheh 2827ccd979bdSMark Fasheh di = (struct ocfs2_dinode *) tl_bh->b_data; 2828ccd979bdSMark Fasheh tl = &di->id2.i_dealloc; 2829ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_DINODE(di)) { 2830ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_DINODE(tl_inode->i_sb, di); 2831ccd979bdSMark Fasheh status = -EIO; 2832ccd979bdSMark Fasheh goto bail; 2833ccd979bdSMark Fasheh } 2834ccd979bdSMark Fasheh 2835ccd979bdSMark Fasheh if (le16_to_cpu(tl->tl_used)) { 2836ccd979bdSMark Fasheh mlog(0, "We'll have %u logs to recover\n", 2837ccd979bdSMark Fasheh le16_to_cpu(tl->tl_used)); 2838ccd979bdSMark Fasheh 2839ccd979bdSMark Fasheh *tl_copy = kmalloc(tl_bh->b_size, GFP_KERNEL); 2840ccd979bdSMark Fasheh if (!(*tl_copy)) { 2841ccd979bdSMark Fasheh status = -ENOMEM; 2842ccd979bdSMark Fasheh mlog_errno(status); 2843ccd979bdSMark Fasheh goto bail; 2844ccd979bdSMark Fasheh } 2845ccd979bdSMark Fasheh 2846ccd979bdSMark Fasheh /* Assuming the write-out below goes well, this copy 2847ccd979bdSMark Fasheh * will be passed back to recovery for processing. */ 2848ccd979bdSMark Fasheh memcpy(*tl_copy, tl_bh->b_data, tl_bh->b_size); 2849ccd979bdSMark Fasheh 2850ccd979bdSMark Fasheh /* All we need to do to clear the truncate log is set 2851ccd979bdSMark Fasheh * tl_used. */ 2852ccd979bdSMark Fasheh tl->tl_used = 0; 2853ccd979bdSMark Fasheh 2854ccd979bdSMark Fasheh status = ocfs2_write_block(osb, tl_bh, tl_inode); 2855ccd979bdSMark Fasheh if (status < 0) { 2856ccd979bdSMark Fasheh mlog_errno(status); 2857ccd979bdSMark Fasheh goto bail; 2858ccd979bdSMark Fasheh } 2859ccd979bdSMark Fasheh } 2860ccd979bdSMark Fasheh 2861ccd979bdSMark Fasheh bail: 2862ccd979bdSMark Fasheh if (tl_inode) 2863ccd979bdSMark Fasheh iput(tl_inode); 2864ccd979bdSMark Fasheh if (tl_bh) 2865ccd979bdSMark Fasheh brelse(tl_bh); 2866ccd979bdSMark Fasheh 2867ccd979bdSMark Fasheh if (status < 0 && (*tl_copy)) { 2868ccd979bdSMark Fasheh kfree(*tl_copy); 2869ccd979bdSMark Fasheh *tl_copy = NULL; 2870ccd979bdSMark Fasheh } 2871ccd979bdSMark Fasheh 2872ccd979bdSMark Fasheh mlog_exit(status); 2873ccd979bdSMark Fasheh return status; 2874ccd979bdSMark Fasheh } 2875ccd979bdSMark Fasheh 2876ccd979bdSMark Fasheh int ocfs2_complete_truncate_log_recovery(struct ocfs2_super *osb, 2877ccd979bdSMark Fasheh struct ocfs2_dinode *tl_copy) 2878ccd979bdSMark Fasheh { 2879ccd979bdSMark Fasheh int status = 0; 2880ccd979bdSMark Fasheh int i; 2881ccd979bdSMark Fasheh unsigned int clusters, num_recs, start_cluster; 2882ccd979bdSMark Fasheh u64 start_blk; 28831fabe148SMark Fasheh handle_t *handle; 2884ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2885ccd979bdSMark Fasheh struct ocfs2_truncate_log *tl; 2886ccd979bdSMark Fasheh 2887ccd979bdSMark Fasheh mlog_entry_void(); 2888ccd979bdSMark Fasheh 2889ccd979bdSMark Fasheh if (OCFS2_I(tl_inode)->ip_blkno == le64_to_cpu(tl_copy->i_blkno)) { 2890ccd979bdSMark Fasheh mlog(ML_ERROR, "Asked to recover my own truncate log!\n"); 2891ccd979bdSMark Fasheh return -EINVAL; 2892ccd979bdSMark Fasheh } 2893ccd979bdSMark Fasheh 2894ccd979bdSMark Fasheh tl = &tl_copy->id2.i_dealloc; 2895ccd979bdSMark Fasheh num_recs = le16_to_cpu(tl->tl_used); 2896b0697053SMark Fasheh mlog(0, "cleanup %u records from %llu\n", num_recs, 28971ca1a111SMark Fasheh (unsigned long long)le64_to_cpu(tl_copy->i_blkno)); 2898ccd979bdSMark Fasheh 28991b1dcc1bSJes Sorensen mutex_lock(&tl_inode->i_mutex); 2900ccd979bdSMark Fasheh for(i = 0; i < num_recs; i++) { 2901ccd979bdSMark Fasheh if (ocfs2_truncate_log_needs_flush(osb)) { 2902ccd979bdSMark Fasheh status = __ocfs2_flush_truncate_log(osb); 2903ccd979bdSMark Fasheh if (status < 0) { 2904ccd979bdSMark Fasheh mlog_errno(status); 2905ccd979bdSMark Fasheh goto bail_up; 2906ccd979bdSMark Fasheh } 2907ccd979bdSMark Fasheh } 2908ccd979bdSMark Fasheh 290965eff9ccSMark Fasheh handle = ocfs2_start_trans(osb, OCFS2_TRUNCATE_LOG_UPDATE); 2910ccd979bdSMark Fasheh if (IS_ERR(handle)) { 2911ccd979bdSMark Fasheh status = PTR_ERR(handle); 2912ccd979bdSMark Fasheh mlog_errno(status); 2913ccd979bdSMark Fasheh goto bail_up; 2914ccd979bdSMark Fasheh } 2915ccd979bdSMark Fasheh 2916ccd979bdSMark Fasheh clusters = le32_to_cpu(tl->tl_recs[i].t_clusters); 2917ccd979bdSMark Fasheh start_cluster = le32_to_cpu(tl->tl_recs[i].t_start); 2918ccd979bdSMark Fasheh start_blk = ocfs2_clusters_to_blocks(osb->sb, start_cluster); 2919ccd979bdSMark Fasheh 2920ccd979bdSMark Fasheh status = ocfs2_truncate_log_append(osb, handle, 2921ccd979bdSMark Fasheh start_blk, clusters); 292202dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 2923ccd979bdSMark Fasheh if (status < 0) { 2924ccd979bdSMark Fasheh mlog_errno(status); 2925ccd979bdSMark Fasheh goto bail_up; 2926ccd979bdSMark Fasheh } 2927ccd979bdSMark Fasheh } 2928ccd979bdSMark Fasheh 2929ccd979bdSMark Fasheh bail_up: 29301b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 2931ccd979bdSMark Fasheh 2932ccd979bdSMark Fasheh mlog_exit(status); 2933ccd979bdSMark Fasheh return status; 2934ccd979bdSMark Fasheh } 2935ccd979bdSMark Fasheh 2936ccd979bdSMark Fasheh void ocfs2_truncate_log_shutdown(struct ocfs2_super *osb) 2937ccd979bdSMark Fasheh { 2938ccd979bdSMark Fasheh int status; 2939ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 2940ccd979bdSMark Fasheh 2941ccd979bdSMark Fasheh mlog_entry_void(); 2942ccd979bdSMark Fasheh 2943ccd979bdSMark Fasheh if (tl_inode) { 2944ccd979bdSMark Fasheh cancel_delayed_work(&osb->osb_truncate_log_wq); 2945ccd979bdSMark Fasheh flush_workqueue(ocfs2_wq); 2946ccd979bdSMark Fasheh 2947ccd979bdSMark Fasheh status = ocfs2_flush_truncate_log(osb); 2948ccd979bdSMark Fasheh if (status < 0) 2949ccd979bdSMark Fasheh mlog_errno(status); 2950ccd979bdSMark Fasheh 2951ccd979bdSMark Fasheh brelse(osb->osb_tl_bh); 2952ccd979bdSMark Fasheh iput(osb->osb_tl_inode); 2953ccd979bdSMark Fasheh } 2954ccd979bdSMark Fasheh 2955ccd979bdSMark Fasheh mlog_exit_void(); 2956ccd979bdSMark Fasheh } 2957ccd979bdSMark Fasheh 2958ccd979bdSMark Fasheh int ocfs2_truncate_log_init(struct ocfs2_super *osb) 2959ccd979bdSMark Fasheh { 2960ccd979bdSMark Fasheh int status; 2961ccd979bdSMark Fasheh struct inode *tl_inode = NULL; 2962ccd979bdSMark Fasheh struct buffer_head *tl_bh = NULL; 2963ccd979bdSMark Fasheh 2964ccd979bdSMark Fasheh mlog_entry_void(); 2965ccd979bdSMark Fasheh 2966ccd979bdSMark Fasheh status = ocfs2_get_truncate_log_info(osb, 2967ccd979bdSMark Fasheh osb->slot_num, 2968ccd979bdSMark Fasheh &tl_inode, 2969ccd979bdSMark Fasheh &tl_bh); 2970ccd979bdSMark Fasheh if (status < 0) 2971ccd979bdSMark Fasheh mlog_errno(status); 2972ccd979bdSMark Fasheh 2973ccd979bdSMark Fasheh /* ocfs2_truncate_log_shutdown keys on the existence of 2974ccd979bdSMark Fasheh * osb->osb_tl_inode so we don't set any of the osb variables 2975ccd979bdSMark Fasheh * until we're sure all is well. */ 2976c4028958SDavid Howells INIT_DELAYED_WORK(&osb->osb_truncate_log_wq, 2977c4028958SDavid Howells ocfs2_truncate_log_worker); 2978ccd979bdSMark Fasheh osb->osb_tl_bh = tl_bh; 2979ccd979bdSMark Fasheh osb->osb_tl_inode = tl_inode; 2980ccd979bdSMark Fasheh 2981ccd979bdSMark Fasheh mlog_exit(status); 2982ccd979bdSMark Fasheh return status; 2983ccd979bdSMark Fasheh } 2984ccd979bdSMark Fasheh 29852b604351SMark Fasheh /* 29862b604351SMark Fasheh * Delayed de-allocation of suballocator blocks. 29872b604351SMark Fasheh * 29882b604351SMark Fasheh * Some sets of block de-allocations might involve multiple suballocator inodes. 29892b604351SMark Fasheh * 29902b604351SMark Fasheh * The locking for this can get extremely complicated, especially when 29912b604351SMark Fasheh * the suballocator inodes to delete from aren't known until deep 29922b604351SMark Fasheh * within an unrelated codepath. 29932b604351SMark Fasheh * 29942b604351SMark Fasheh * ocfs2_extent_block structures are a good example of this - an inode 29952b604351SMark Fasheh * btree could have been grown by any number of nodes each allocating 29962b604351SMark Fasheh * out of their own suballoc inode. 29972b604351SMark Fasheh * 29982b604351SMark Fasheh * These structures allow the delay of block de-allocation until a 29992b604351SMark Fasheh * later time, when locking of multiple cluster inodes won't cause 30002b604351SMark Fasheh * deadlock. 30012b604351SMark Fasheh */ 30022b604351SMark Fasheh 30032b604351SMark Fasheh /* 30042b604351SMark Fasheh * Describes a single block free from a suballocator 30052b604351SMark Fasheh */ 30062b604351SMark Fasheh struct ocfs2_cached_block_free { 30072b604351SMark Fasheh struct ocfs2_cached_block_free *free_next; 30082b604351SMark Fasheh u64 free_blk; 30092b604351SMark Fasheh unsigned int free_bit; 30102b604351SMark Fasheh }; 30112b604351SMark Fasheh 30122b604351SMark Fasheh struct ocfs2_per_slot_free_list { 30132b604351SMark Fasheh struct ocfs2_per_slot_free_list *f_next_suballocator; 30142b604351SMark Fasheh int f_inode_type; 30152b604351SMark Fasheh int f_slot; 30162b604351SMark Fasheh struct ocfs2_cached_block_free *f_first; 30172b604351SMark Fasheh }; 30182b604351SMark Fasheh 30192b604351SMark Fasheh static int ocfs2_free_cached_items(struct ocfs2_super *osb, 30202b604351SMark Fasheh int sysfile_type, 30212b604351SMark Fasheh int slot, 30222b604351SMark Fasheh struct ocfs2_cached_block_free *head) 30232b604351SMark Fasheh { 30242b604351SMark Fasheh int ret; 30252b604351SMark Fasheh u64 bg_blkno; 30262b604351SMark Fasheh handle_t *handle; 30272b604351SMark Fasheh struct inode *inode; 30282b604351SMark Fasheh struct buffer_head *di_bh = NULL; 30292b604351SMark Fasheh struct ocfs2_cached_block_free *tmp; 30302b604351SMark Fasheh 30312b604351SMark Fasheh inode = ocfs2_get_system_file_inode(osb, sysfile_type, slot); 30322b604351SMark Fasheh if (!inode) { 30332b604351SMark Fasheh ret = -EINVAL; 30342b604351SMark Fasheh mlog_errno(ret); 30352b604351SMark Fasheh goto out; 30362b604351SMark Fasheh } 30372b604351SMark Fasheh 30382b604351SMark Fasheh mutex_lock(&inode->i_mutex); 30392b604351SMark Fasheh 30402b604351SMark Fasheh ret = ocfs2_meta_lock(inode, &di_bh, 1); 30412b604351SMark Fasheh if (ret) { 30422b604351SMark Fasheh mlog_errno(ret); 30432b604351SMark Fasheh goto out_mutex; 30442b604351SMark Fasheh } 30452b604351SMark Fasheh 30462b604351SMark Fasheh handle = ocfs2_start_trans(osb, OCFS2_SUBALLOC_FREE); 30472b604351SMark Fasheh if (IS_ERR(handle)) { 30482b604351SMark Fasheh ret = PTR_ERR(handle); 30492b604351SMark Fasheh mlog_errno(ret); 30502b604351SMark Fasheh goto out_unlock; 30512b604351SMark Fasheh } 30522b604351SMark Fasheh 30532b604351SMark Fasheh while (head) { 30542b604351SMark Fasheh bg_blkno = ocfs2_which_suballoc_group(head->free_blk, 30552b604351SMark Fasheh head->free_bit); 30562b604351SMark Fasheh mlog(0, "Free bit: (bit %u, blkno %llu)\n", 30572b604351SMark Fasheh head->free_bit, (unsigned long long)head->free_blk); 30582b604351SMark Fasheh 30592b604351SMark Fasheh ret = ocfs2_free_suballoc_bits(handle, inode, di_bh, 30602b604351SMark Fasheh head->free_bit, bg_blkno, 1); 30612b604351SMark Fasheh if (ret) { 30622b604351SMark Fasheh mlog_errno(ret); 30632b604351SMark Fasheh goto out_journal; 30642b604351SMark Fasheh } 30652b604351SMark Fasheh 30662b604351SMark Fasheh ret = ocfs2_extend_trans(handle, OCFS2_SUBALLOC_FREE); 30672b604351SMark Fasheh if (ret) { 30682b604351SMark Fasheh mlog_errno(ret); 30692b604351SMark Fasheh goto out_journal; 30702b604351SMark Fasheh } 30712b604351SMark Fasheh 30722b604351SMark Fasheh tmp = head; 30732b604351SMark Fasheh head = head->free_next; 30742b604351SMark Fasheh kfree(tmp); 30752b604351SMark Fasheh } 30762b604351SMark Fasheh 30772b604351SMark Fasheh out_journal: 30782b604351SMark Fasheh ocfs2_commit_trans(osb, handle); 30792b604351SMark Fasheh 30802b604351SMark Fasheh out_unlock: 30812b604351SMark Fasheh ocfs2_meta_unlock(inode, 1); 30822b604351SMark Fasheh brelse(di_bh); 30832b604351SMark Fasheh out_mutex: 30842b604351SMark Fasheh mutex_unlock(&inode->i_mutex); 30852b604351SMark Fasheh iput(inode); 30862b604351SMark Fasheh out: 30872b604351SMark Fasheh while(head) { 30882b604351SMark Fasheh /* Premature exit may have left some dangling items. */ 30892b604351SMark Fasheh tmp = head; 30902b604351SMark Fasheh head = head->free_next; 30912b604351SMark Fasheh kfree(tmp); 30922b604351SMark Fasheh } 30932b604351SMark Fasheh 30942b604351SMark Fasheh return ret; 30952b604351SMark Fasheh } 30962b604351SMark Fasheh 30972b604351SMark Fasheh int ocfs2_run_deallocs(struct ocfs2_super *osb, 30982b604351SMark Fasheh struct ocfs2_cached_dealloc_ctxt *ctxt) 30992b604351SMark Fasheh { 31002b604351SMark Fasheh int ret = 0, ret2; 31012b604351SMark Fasheh struct ocfs2_per_slot_free_list *fl; 31022b604351SMark Fasheh 31032b604351SMark Fasheh if (!ctxt) 31042b604351SMark Fasheh return 0; 31052b604351SMark Fasheh 31062b604351SMark Fasheh while (ctxt->c_first_suballocator) { 31072b604351SMark Fasheh fl = ctxt->c_first_suballocator; 31082b604351SMark Fasheh 31092b604351SMark Fasheh if (fl->f_first) { 31102b604351SMark Fasheh mlog(0, "Free items: (type %u, slot %d)\n", 31112b604351SMark Fasheh fl->f_inode_type, fl->f_slot); 31122b604351SMark Fasheh ret2 = ocfs2_free_cached_items(osb, fl->f_inode_type, 31132b604351SMark Fasheh fl->f_slot, fl->f_first); 31142b604351SMark Fasheh if (ret2) 31152b604351SMark Fasheh mlog_errno(ret2); 31162b604351SMark Fasheh if (!ret) 31172b604351SMark Fasheh ret = ret2; 31182b604351SMark Fasheh } 31192b604351SMark Fasheh 31202b604351SMark Fasheh ctxt->c_first_suballocator = fl->f_next_suballocator; 31212b604351SMark Fasheh kfree(fl); 31222b604351SMark Fasheh } 31232b604351SMark Fasheh 31242b604351SMark Fasheh return ret; 31252b604351SMark Fasheh } 31262b604351SMark Fasheh 31272b604351SMark Fasheh static struct ocfs2_per_slot_free_list * 31282b604351SMark Fasheh ocfs2_find_per_slot_free_list(int type, 31292b604351SMark Fasheh int slot, 31302b604351SMark Fasheh struct ocfs2_cached_dealloc_ctxt *ctxt) 31312b604351SMark Fasheh { 31322b604351SMark Fasheh struct ocfs2_per_slot_free_list *fl = ctxt->c_first_suballocator; 31332b604351SMark Fasheh 31342b604351SMark Fasheh while (fl) { 31352b604351SMark Fasheh if (fl->f_inode_type == type && fl->f_slot == slot) 31362b604351SMark Fasheh return fl; 31372b604351SMark Fasheh 31382b604351SMark Fasheh fl = fl->f_next_suballocator; 31392b604351SMark Fasheh } 31402b604351SMark Fasheh 31412b604351SMark Fasheh fl = kmalloc(sizeof(*fl), GFP_NOFS); 31422b604351SMark Fasheh if (fl) { 31432b604351SMark Fasheh fl->f_inode_type = type; 31442b604351SMark Fasheh fl->f_slot = slot; 31452b604351SMark Fasheh fl->f_first = NULL; 31462b604351SMark Fasheh fl->f_next_suballocator = ctxt->c_first_suballocator; 31472b604351SMark Fasheh 31482b604351SMark Fasheh ctxt->c_first_suballocator = fl; 31492b604351SMark Fasheh } 31502b604351SMark Fasheh return fl; 31512b604351SMark Fasheh } 31522b604351SMark Fasheh 31532b604351SMark Fasheh static int ocfs2_cache_block_dealloc(struct ocfs2_cached_dealloc_ctxt *ctxt, 31542b604351SMark Fasheh int type, int slot, u64 blkno, 31552b604351SMark Fasheh unsigned int bit) 31562b604351SMark Fasheh { 31572b604351SMark Fasheh int ret; 31582b604351SMark Fasheh struct ocfs2_per_slot_free_list *fl; 31592b604351SMark Fasheh struct ocfs2_cached_block_free *item; 31602b604351SMark Fasheh 31612b604351SMark Fasheh fl = ocfs2_find_per_slot_free_list(type, slot, ctxt); 31622b604351SMark Fasheh if (fl == NULL) { 31632b604351SMark Fasheh ret = -ENOMEM; 31642b604351SMark Fasheh mlog_errno(ret); 31652b604351SMark Fasheh goto out; 31662b604351SMark Fasheh } 31672b604351SMark Fasheh 31682b604351SMark Fasheh item = kmalloc(sizeof(*item), GFP_NOFS); 31692b604351SMark Fasheh if (item == NULL) { 31702b604351SMark Fasheh ret = -ENOMEM; 31712b604351SMark Fasheh mlog_errno(ret); 31722b604351SMark Fasheh goto out; 31732b604351SMark Fasheh } 31742b604351SMark Fasheh 31752b604351SMark Fasheh mlog(0, "Insert: (type %d, slot %u, bit %u, blk %llu)\n", 31762b604351SMark Fasheh type, slot, bit, (unsigned long long)blkno); 31772b604351SMark Fasheh 31782b604351SMark Fasheh item->free_blk = blkno; 31792b604351SMark Fasheh item->free_bit = bit; 31802b604351SMark Fasheh item->free_next = fl->f_first; 31812b604351SMark Fasheh 31822b604351SMark Fasheh fl->f_first = item; 31832b604351SMark Fasheh 31842b604351SMark Fasheh ret = 0; 31852b604351SMark Fasheh out: 31862b604351SMark Fasheh return ret; 31872b604351SMark Fasheh } 31882b604351SMark Fasheh 318959a5e416SMark Fasheh static int ocfs2_cache_extent_block_free(struct ocfs2_cached_dealloc_ctxt *ctxt, 319059a5e416SMark Fasheh struct ocfs2_extent_block *eb) 319159a5e416SMark Fasheh { 319259a5e416SMark Fasheh return ocfs2_cache_block_dealloc(ctxt, EXTENT_ALLOC_SYSTEM_INODE, 319359a5e416SMark Fasheh le16_to_cpu(eb->h_suballoc_slot), 319459a5e416SMark Fasheh le64_to_cpu(eb->h_blkno), 319559a5e416SMark Fasheh le16_to_cpu(eb->h_suballoc_bit)); 319659a5e416SMark Fasheh } 319759a5e416SMark Fasheh 3198ccd979bdSMark Fasheh /* This function will figure out whether the currently last extent 3199ccd979bdSMark Fasheh * block will be deleted, and if it will, what the new last extent 3200ccd979bdSMark Fasheh * block will be so we can update his h_next_leaf_blk field, as well 3201ccd979bdSMark Fasheh * as the dinodes i_last_eb_blk */ 3202dcd0538fSMark Fasheh static int ocfs2_find_new_last_ext_blk(struct inode *inode, 32033a0782d0SMark Fasheh unsigned int clusters_to_del, 3204dcd0538fSMark Fasheh struct ocfs2_path *path, 3205ccd979bdSMark Fasheh struct buffer_head **new_last_eb) 3206ccd979bdSMark Fasheh { 32073a0782d0SMark Fasheh int next_free, ret = 0; 3208dcd0538fSMark Fasheh u32 cpos; 32093a0782d0SMark Fasheh struct ocfs2_extent_rec *rec; 3210ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 3211ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 3212ccd979bdSMark Fasheh struct buffer_head *bh = NULL; 3213ccd979bdSMark Fasheh 3214ccd979bdSMark Fasheh *new_last_eb = NULL; 3215ccd979bdSMark Fasheh 3216ccd979bdSMark Fasheh /* we have no tree, so of course, no last_eb. */ 3217dcd0538fSMark Fasheh if (!path->p_tree_depth) 3218dcd0538fSMark Fasheh goto out; 3219ccd979bdSMark Fasheh 3220ccd979bdSMark Fasheh /* trunc to zero special case - this makes tree_depth = 0 3221ccd979bdSMark Fasheh * regardless of what it is. */ 32223a0782d0SMark Fasheh if (OCFS2_I(inode)->ip_clusters == clusters_to_del) 3223dcd0538fSMark Fasheh goto out; 3224ccd979bdSMark Fasheh 3225dcd0538fSMark Fasheh el = path_leaf_el(path); 3226ccd979bdSMark Fasheh BUG_ON(!el->l_next_free_rec); 3227ccd979bdSMark Fasheh 32283a0782d0SMark Fasheh /* 32293a0782d0SMark Fasheh * Make sure that this extent list will actually be empty 32303a0782d0SMark Fasheh * after we clear away the data. We can shortcut out if 32313a0782d0SMark Fasheh * there's more than one non-empty extent in the 32323a0782d0SMark Fasheh * list. Otherwise, a check of the remaining extent is 32333a0782d0SMark Fasheh * necessary. 32343a0782d0SMark Fasheh */ 32353a0782d0SMark Fasheh next_free = le16_to_cpu(el->l_next_free_rec); 32363a0782d0SMark Fasheh rec = NULL; 3237dcd0538fSMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) { 32383a0782d0SMark Fasheh if (next_free > 2) 3239dcd0538fSMark Fasheh goto out; 32403a0782d0SMark Fasheh 32413a0782d0SMark Fasheh /* We may have a valid extent in index 1, check it. */ 32423a0782d0SMark Fasheh if (next_free == 2) 32433a0782d0SMark Fasheh rec = &el->l_recs[1]; 32443a0782d0SMark Fasheh 32453a0782d0SMark Fasheh /* 32463a0782d0SMark Fasheh * Fall through - no more nonempty extents, so we want 32473a0782d0SMark Fasheh * to delete this leaf. 32483a0782d0SMark Fasheh */ 32493a0782d0SMark Fasheh } else { 32503a0782d0SMark Fasheh if (next_free > 1) 3251dcd0538fSMark Fasheh goto out; 3252ccd979bdSMark Fasheh 32533a0782d0SMark Fasheh rec = &el->l_recs[0]; 32543a0782d0SMark Fasheh } 32553a0782d0SMark Fasheh 32563a0782d0SMark Fasheh if (rec) { 32573a0782d0SMark Fasheh /* 32583a0782d0SMark Fasheh * Check it we'll only be trimming off the end of this 32593a0782d0SMark Fasheh * cluster. 32603a0782d0SMark Fasheh */ 3261e48edee2SMark Fasheh if (le16_to_cpu(rec->e_leaf_clusters) > clusters_to_del) 32623a0782d0SMark Fasheh goto out; 32633a0782d0SMark Fasheh } 32643a0782d0SMark Fasheh 3265dcd0538fSMark Fasheh ret = ocfs2_find_cpos_for_left_leaf(inode->i_sb, path, &cpos); 3266dcd0538fSMark Fasheh if (ret) { 3267dcd0538fSMark Fasheh mlog_errno(ret); 3268dcd0538fSMark Fasheh goto out; 3269ccd979bdSMark Fasheh } 3270ccd979bdSMark Fasheh 3271dcd0538fSMark Fasheh ret = ocfs2_find_leaf(inode, path_root_el(path), cpos, &bh); 3272dcd0538fSMark Fasheh if (ret) { 3273dcd0538fSMark Fasheh mlog_errno(ret); 3274dcd0538fSMark Fasheh goto out; 3275ccd979bdSMark Fasheh } 3276dcd0538fSMark Fasheh 3277ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) bh->b_data; 3278ccd979bdSMark Fasheh el = &eb->h_list; 3279ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 3280ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 3281dcd0538fSMark Fasheh ret = -EROFS; 3282dcd0538fSMark Fasheh goto out; 3283ccd979bdSMark Fasheh } 3284ccd979bdSMark Fasheh 3285ccd979bdSMark Fasheh *new_last_eb = bh; 3286ccd979bdSMark Fasheh get_bh(*new_last_eb); 3287dcd0538fSMark Fasheh mlog(0, "returning block %llu, (cpos: %u)\n", 3288dcd0538fSMark Fasheh (unsigned long long)le64_to_cpu(eb->h_blkno), cpos); 3289dcd0538fSMark Fasheh out: 3290ccd979bdSMark Fasheh brelse(bh); 3291ccd979bdSMark Fasheh 3292dcd0538fSMark Fasheh return ret; 3293ccd979bdSMark Fasheh } 3294ccd979bdSMark Fasheh 32953a0782d0SMark Fasheh /* 32963a0782d0SMark Fasheh * Trim some clusters off the rightmost edge of a tree. Only called 32973a0782d0SMark Fasheh * during truncate. 32983a0782d0SMark Fasheh * 32993a0782d0SMark Fasheh * The caller needs to: 33003a0782d0SMark Fasheh * - start journaling of each path component. 33013a0782d0SMark Fasheh * - compute and fully set up any new last ext block 33023a0782d0SMark Fasheh */ 33033a0782d0SMark Fasheh static int ocfs2_trim_tree(struct inode *inode, struct ocfs2_path *path, 33043a0782d0SMark Fasheh handle_t *handle, struct ocfs2_truncate_context *tc, 33053a0782d0SMark Fasheh u32 clusters_to_del, u64 *delete_start) 33063a0782d0SMark Fasheh { 33073a0782d0SMark Fasheh int ret, i, index = path->p_tree_depth; 33083a0782d0SMark Fasheh u32 new_edge = 0; 33093a0782d0SMark Fasheh u64 deleted_eb = 0; 33103a0782d0SMark Fasheh struct buffer_head *bh; 33113a0782d0SMark Fasheh struct ocfs2_extent_list *el; 33123a0782d0SMark Fasheh struct ocfs2_extent_rec *rec; 33133a0782d0SMark Fasheh 33143a0782d0SMark Fasheh *delete_start = 0; 33153a0782d0SMark Fasheh 33163a0782d0SMark Fasheh while (index >= 0) { 33173a0782d0SMark Fasheh bh = path->p_node[index].bh; 33183a0782d0SMark Fasheh el = path->p_node[index].el; 33193a0782d0SMark Fasheh 33203a0782d0SMark Fasheh mlog(0, "traveling tree (index = %d, block = %llu)\n", 33213a0782d0SMark Fasheh index, (unsigned long long)bh->b_blocknr); 33223a0782d0SMark Fasheh 33233a0782d0SMark Fasheh BUG_ON(le16_to_cpu(el->l_next_free_rec) == 0); 33243a0782d0SMark Fasheh 33253a0782d0SMark Fasheh if (index != 33263a0782d0SMark Fasheh (path->p_tree_depth - le16_to_cpu(el->l_tree_depth))) { 33273a0782d0SMark Fasheh ocfs2_error(inode->i_sb, 33283a0782d0SMark Fasheh "Inode %lu has invalid ext. block %llu", 33293a0782d0SMark Fasheh inode->i_ino, 33303a0782d0SMark Fasheh (unsigned long long)bh->b_blocknr); 33313a0782d0SMark Fasheh ret = -EROFS; 33323a0782d0SMark Fasheh goto out; 33333a0782d0SMark Fasheh } 33343a0782d0SMark Fasheh 33353a0782d0SMark Fasheh find_tail_record: 33363a0782d0SMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 33373a0782d0SMark Fasheh rec = &el->l_recs[i]; 33383a0782d0SMark Fasheh 33393a0782d0SMark Fasheh mlog(0, "Extent list before: record %d: (%u, %u, %llu), " 33403a0782d0SMark Fasheh "next = %u\n", i, le32_to_cpu(rec->e_cpos), 3341e48edee2SMark Fasheh ocfs2_rec_clusters(el, rec), 33423a0782d0SMark Fasheh (unsigned long long)le64_to_cpu(rec->e_blkno), 33433a0782d0SMark Fasheh le16_to_cpu(el->l_next_free_rec)); 33443a0782d0SMark Fasheh 3345e48edee2SMark Fasheh BUG_ON(ocfs2_rec_clusters(el, rec) < clusters_to_del); 33463a0782d0SMark Fasheh 33473a0782d0SMark Fasheh if (le16_to_cpu(el->l_tree_depth) == 0) { 33483a0782d0SMark Fasheh /* 33493a0782d0SMark Fasheh * If the leaf block contains a single empty 33503a0782d0SMark Fasheh * extent and no records, we can just remove 33513a0782d0SMark Fasheh * the block. 33523a0782d0SMark Fasheh */ 33533a0782d0SMark Fasheh if (i == 0 && ocfs2_is_empty_extent(rec)) { 33543a0782d0SMark Fasheh memset(rec, 0, 33553a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 33563a0782d0SMark Fasheh el->l_next_free_rec = cpu_to_le16(0); 33573a0782d0SMark Fasheh 33583a0782d0SMark Fasheh goto delete; 33593a0782d0SMark Fasheh } 33603a0782d0SMark Fasheh 33613a0782d0SMark Fasheh /* 33623a0782d0SMark Fasheh * Remove any empty extents by shifting things 33633a0782d0SMark Fasheh * left. That should make life much easier on 33643a0782d0SMark Fasheh * the code below. This condition is rare 33653a0782d0SMark Fasheh * enough that we shouldn't see a performance 33663a0782d0SMark Fasheh * hit. 33673a0782d0SMark Fasheh */ 33683a0782d0SMark Fasheh if (ocfs2_is_empty_extent(&el->l_recs[0])) { 33693a0782d0SMark Fasheh le16_add_cpu(&el->l_next_free_rec, -1); 33703a0782d0SMark Fasheh 33713a0782d0SMark Fasheh for(i = 0; 33723a0782d0SMark Fasheh i < le16_to_cpu(el->l_next_free_rec); i++) 33733a0782d0SMark Fasheh el->l_recs[i] = el->l_recs[i + 1]; 33743a0782d0SMark Fasheh 33753a0782d0SMark Fasheh memset(&el->l_recs[i], 0, 33763a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 33773a0782d0SMark Fasheh 33783a0782d0SMark Fasheh /* 33793a0782d0SMark Fasheh * We've modified our extent list. The 33803a0782d0SMark Fasheh * simplest way to handle this change 33813a0782d0SMark Fasheh * is to being the search from the 33823a0782d0SMark Fasheh * start again. 33833a0782d0SMark Fasheh */ 33843a0782d0SMark Fasheh goto find_tail_record; 33853a0782d0SMark Fasheh } 33863a0782d0SMark Fasheh 3387e48edee2SMark Fasheh le16_add_cpu(&rec->e_leaf_clusters, -clusters_to_del); 33883a0782d0SMark Fasheh 33893a0782d0SMark Fasheh /* 33903a0782d0SMark Fasheh * We'll use "new_edge" on our way back up the 33913a0782d0SMark Fasheh * tree to know what our rightmost cpos is. 33923a0782d0SMark Fasheh */ 3393e48edee2SMark Fasheh new_edge = le16_to_cpu(rec->e_leaf_clusters); 33943a0782d0SMark Fasheh new_edge += le32_to_cpu(rec->e_cpos); 33953a0782d0SMark Fasheh 33963a0782d0SMark Fasheh /* 33973a0782d0SMark Fasheh * The caller will use this to delete data blocks. 33983a0782d0SMark Fasheh */ 33993a0782d0SMark Fasheh *delete_start = le64_to_cpu(rec->e_blkno) 34003a0782d0SMark Fasheh + ocfs2_clusters_to_blocks(inode->i_sb, 3401e48edee2SMark Fasheh le16_to_cpu(rec->e_leaf_clusters)); 34023a0782d0SMark Fasheh 34033a0782d0SMark Fasheh /* 34043a0782d0SMark Fasheh * If it's now empty, remove this record. 34053a0782d0SMark Fasheh */ 3406e48edee2SMark Fasheh if (le16_to_cpu(rec->e_leaf_clusters) == 0) { 34073a0782d0SMark Fasheh memset(rec, 0, 34083a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 34093a0782d0SMark Fasheh le16_add_cpu(&el->l_next_free_rec, -1); 34103a0782d0SMark Fasheh } 34113a0782d0SMark Fasheh } else { 34123a0782d0SMark Fasheh if (le64_to_cpu(rec->e_blkno) == deleted_eb) { 34133a0782d0SMark Fasheh memset(rec, 0, 34143a0782d0SMark Fasheh sizeof(struct ocfs2_extent_rec)); 34153a0782d0SMark Fasheh le16_add_cpu(&el->l_next_free_rec, -1); 34163a0782d0SMark Fasheh 34173a0782d0SMark Fasheh goto delete; 34183a0782d0SMark Fasheh } 34193a0782d0SMark Fasheh 34203a0782d0SMark Fasheh /* Can this actually happen? */ 34213a0782d0SMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) 34223a0782d0SMark Fasheh goto delete; 34233a0782d0SMark Fasheh 34243a0782d0SMark Fasheh /* 34253a0782d0SMark Fasheh * We never actually deleted any clusters 34263a0782d0SMark Fasheh * because our leaf was empty. There's no 34273a0782d0SMark Fasheh * reason to adjust the rightmost edge then. 34283a0782d0SMark Fasheh */ 34293a0782d0SMark Fasheh if (new_edge == 0) 34303a0782d0SMark Fasheh goto delete; 34313a0782d0SMark Fasheh 3432e48edee2SMark Fasheh rec->e_int_clusters = cpu_to_le32(new_edge); 3433e48edee2SMark Fasheh le32_add_cpu(&rec->e_int_clusters, 34343a0782d0SMark Fasheh -le32_to_cpu(rec->e_cpos)); 34353a0782d0SMark Fasheh 34363a0782d0SMark Fasheh /* 34373a0782d0SMark Fasheh * A deleted child record should have been 34383a0782d0SMark Fasheh * caught above. 34393a0782d0SMark Fasheh */ 3440e48edee2SMark Fasheh BUG_ON(le32_to_cpu(rec->e_int_clusters) == 0); 34413a0782d0SMark Fasheh } 34423a0782d0SMark Fasheh 34433a0782d0SMark Fasheh delete: 34443a0782d0SMark Fasheh ret = ocfs2_journal_dirty(handle, bh); 34453a0782d0SMark Fasheh if (ret) { 34463a0782d0SMark Fasheh mlog_errno(ret); 34473a0782d0SMark Fasheh goto out; 34483a0782d0SMark Fasheh } 34493a0782d0SMark Fasheh 34503a0782d0SMark Fasheh mlog(0, "extent list container %llu, after: record %d: " 34513a0782d0SMark Fasheh "(%u, %u, %llu), next = %u.\n", 34523a0782d0SMark Fasheh (unsigned long long)bh->b_blocknr, i, 3453e48edee2SMark Fasheh le32_to_cpu(rec->e_cpos), ocfs2_rec_clusters(el, rec), 34543a0782d0SMark Fasheh (unsigned long long)le64_to_cpu(rec->e_blkno), 34553a0782d0SMark Fasheh le16_to_cpu(el->l_next_free_rec)); 34563a0782d0SMark Fasheh 34573a0782d0SMark Fasheh /* 34583a0782d0SMark Fasheh * We must be careful to only attempt delete of an 34593a0782d0SMark Fasheh * extent block (and not the root inode block). 34603a0782d0SMark Fasheh */ 34613a0782d0SMark Fasheh if (index > 0 && le16_to_cpu(el->l_next_free_rec) == 0) { 34623a0782d0SMark Fasheh struct ocfs2_extent_block *eb = 34633a0782d0SMark Fasheh (struct ocfs2_extent_block *)bh->b_data; 34643a0782d0SMark Fasheh 34653a0782d0SMark Fasheh /* 34663a0782d0SMark Fasheh * Save this for use when processing the 34673a0782d0SMark Fasheh * parent block. 34683a0782d0SMark Fasheh */ 34693a0782d0SMark Fasheh deleted_eb = le64_to_cpu(eb->h_blkno); 34703a0782d0SMark Fasheh 34713a0782d0SMark Fasheh mlog(0, "deleting this extent block.\n"); 34723a0782d0SMark Fasheh 34733a0782d0SMark Fasheh ocfs2_remove_from_cache(inode, bh); 34743a0782d0SMark Fasheh 3475e48edee2SMark Fasheh BUG_ON(ocfs2_rec_clusters(el, &el->l_recs[0])); 34763a0782d0SMark Fasheh BUG_ON(le32_to_cpu(el->l_recs[0].e_cpos)); 34773a0782d0SMark Fasheh BUG_ON(le64_to_cpu(el->l_recs[0].e_blkno)); 34783a0782d0SMark Fasheh 347959a5e416SMark Fasheh ret = ocfs2_cache_extent_block_free(&tc->tc_dealloc, eb); 34803a0782d0SMark Fasheh /* An error here is not fatal. */ 34813a0782d0SMark Fasheh if (ret < 0) 34823a0782d0SMark Fasheh mlog_errno(ret); 34833a0782d0SMark Fasheh } else { 34843a0782d0SMark Fasheh deleted_eb = 0; 34853a0782d0SMark Fasheh } 34863a0782d0SMark Fasheh 34873a0782d0SMark Fasheh index--; 34883a0782d0SMark Fasheh } 34893a0782d0SMark Fasheh 34903a0782d0SMark Fasheh ret = 0; 34913a0782d0SMark Fasheh out: 34923a0782d0SMark Fasheh return ret; 34933a0782d0SMark Fasheh } 34943a0782d0SMark Fasheh 3495ccd979bdSMark Fasheh static int ocfs2_do_truncate(struct ocfs2_super *osb, 3496ccd979bdSMark Fasheh unsigned int clusters_to_del, 3497ccd979bdSMark Fasheh struct inode *inode, 3498ccd979bdSMark Fasheh struct buffer_head *fe_bh, 34991fabe148SMark Fasheh handle_t *handle, 3500dcd0538fSMark Fasheh struct ocfs2_truncate_context *tc, 3501dcd0538fSMark Fasheh struct ocfs2_path *path) 3502ccd979bdSMark Fasheh { 35033a0782d0SMark Fasheh int status; 3504ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 3505ccd979bdSMark Fasheh struct ocfs2_extent_block *last_eb = NULL; 3506ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 3507ccd979bdSMark Fasheh struct buffer_head *last_eb_bh = NULL; 3508ccd979bdSMark Fasheh u64 delete_blk = 0; 3509ccd979bdSMark Fasheh 3510ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 3511ccd979bdSMark Fasheh 35123a0782d0SMark Fasheh status = ocfs2_find_new_last_ext_blk(inode, clusters_to_del, 3513dcd0538fSMark Fasheh path, &last_eb_bh); 3514ccd979bdSMark Fasheh if (status < 0) { 3515ccd979bdSMark Fasheh mlog_errno(status); 3516ccd979bdSMark Fasheh goto bail; 3517ccd979bdSMark Fasheh } 3518ccd979bdSMark Fasheh 3519dcd0538fSMark Fasheh /* 3520dcd0538fSMark Fasheh * Each component will be touched, so we might as well journal 3521dcd0538fSMark Fasheh * here to avoid having to handle errors later. 3522dcd0538fSMark Fasheh */ 35233a0782d0SMark Fasheh status = ocfs2_journal_access_path(inode, handle, path); 3524ccd979bdSMark Fasheh if (status < 0) { 3525ccd979bdSMark Fasheh mlog_errno(status); 3526ccd979bdSMark Fasheh goto bail; 3527ccd979bdSMark Fasheh } 3528dcd0538fSMark Fasheh 3529dcd0538fSMark Fasheh if (last_eb_bh) { 3530dcd0538fSMark Fasheh status = ocfs2_journal_access(handle, inode, last_eb_bh, 3531dcd0538fSMark Fasheh OCFS2_JOURNAL_ACCESS_WRITE); 3532dcd0538fSMark Fasheh if (status < 0) { 3533dcd0538fSMark Fasheh mlog_errno(status); 3534dcd0538fSMark Fasheh goto bail; 3535dcd0538fSMark Fasheh } 3536dcd0538fSMark Fasheh 3537dcd0538fSMark Fasheh last_eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 3538dcd0538fSMark Fasheh } 3539dcd0538fSMark Fasheh 3540ccd979bdSMark Fasheh el = &(fe->id2.i_list); 3541ccd979bdSMark Fasheh 3542dcd0538fSMark Fasheh /* 3543dcd0538fSMark Fasheh * Lower levels depend on this never happening, but it's best 3544dcd0538fSMark Fasheh * to check it up here before changing the tree. 3545dcd0538fSMark Fasheh */ 3546e48edee2SMark Fasheh if (el->l_tree_depth && el->l_recs[0].e_int_clusters == 0) { 3547dcd0538fSMark Fasheh ocfs2_error(inode->i_sb, 3548dcd0538fSMark Fasheh "Inode %lu has an empty extent record, depth %u\n", 3549dcd0538fSMark Fasheh inode->i_ino, le16_to_cpu(el->l_tree_depth)); 35503a0782d0SMark Fasheh status = -EROFS; 3551dcd0538fSMark Fasheh goto bail; 3552dcd0538fSMark Fasheh } 3553dcd0538fSMark Fasheh 3554ccd979bdSMark Fasheh spin_lock(&OCFS2_I(inode)->ip_lock); 3555ccd979bdSMark Fasheh OCFS2_I(inode)->ip_clusters = le32_to_cpu(fe->i_clusters) - 3556ccd979bdSMark Fasheh clusters_to_del; 3557ccd979bdSMark Fasheh spin_unlock(&OCFS2_I(inode)->ip_lock); 3558ccd979bdSMark Fasheh le32_add_cpu(&fe->i_clusters, -clusters_to_del); 3559ccd979bdSMark Fasheh 35603a0782d0SMark Fasheh status = ocfs2_trim_tree(inode, path, handle, tc, 35613a0782d0SMark Fasheh clusters_to_del, &delete_blk); 35623a0782d0SMark Fasheh if (status) { 35633a0782d0SMark Fasheh mlog_errno(status); 35643a0782d0SMark Fasheh goto bail; 3565ccd979bdSMark Fasheh } 3566ccd979bdSMark Fasheh 3567dcd0538fSMark Fasheh if (le32_to_cpu(fe->i_clusters) == 0) { 3568ccd979bdSMark Fasheh /* trunc to zero is a special case. */ 3569ccd979bdSMark Fasheh el->l_tree_depth = 0; 3570ccd979bdSMark Fasheh fe->i_last_eb_blk = 0; 3571ccd979bdSMark Fasheh } else if (last_eb) 3572ccd979bdSMark Fasheh fe->i_last_eb_blk = last_eb->h_blkno; 3573ccd979bdSMark Fasheh 3574ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, fe_bh); 3575ccd979bdSMark Fasheh if (status < 0) { 3576ccd979bdSMark Fasheh mlog_errno(status); 3577ccd979bdSMark Fasheh goto bail; 3578ccd979bdSMark Fasheh } 3579ccd979bdSMark Fasheh 3580ccd979bdSMark Fasheh if (last_eb) { 3581ccd979bdSMark Fasheh /* If there will be a new last extent block, then by 3582ccd979bdSMark Fasheh * definition, there cannot be any leaves to the right of 3583ccd979bdSMark Fasheh * him. */ 3584ccd979bdSMark Fasheh last_eb->h_next_leaf_blk = 0; 3585ccd979bdSMark Fasheh status = ocfs2_journal_dirty(handle, last_eb_bh); 3586ccd979bdSMark Fasheh if (status < 0) { 3587ccd979bdSMark Fasheh mlog_errno(status); 3588ccd979bdSMark Fasheh goto bail; 3589ccd979bdSMark Fasheh } 3590ccd979bdSMark Fasheh } 3591ccd979bdSMark Fasheh 35923a0782d0SMark Fasheh if (delete_blk) { 3593ccd979bdSMark Fasheh status = ocfs2_truncate_log_append(osb, handle, delete_blk, 3594ccd979bdSMark Fasheh clusters_to_del); 3595ccd979bdSMark Fasheh if (status < 0) { 3596ccd979bdSMark Fasheh mlog_errno(status); 3597ccd979bdSMark Fasheh goto bail; 3598ccd979bdSMark Fasheh } 35993a0782d0SMark Fasheh } 3600ccd979bdSMark Fasheh status = 0; 3601ccd979bdSMark Fasheh bail: 3602dcd0538fSMark Fasheh 3603ccd979bdSMark Fasheh mlog_exit(status); 3604ccd979bdSMark Fasheh return status; 3605ccd979bdSMark Fasheh } 3606ccd979bdSMark Fasheh 360760b11392SMark Fasheh static int ocfs2_writeback_zero_func(handle_t *handle, struct buffer_head *bh) 360860b11392SMark Fasheh { 360960b11392SMark Fasheh set_buffer_uptodate(bh); 361060b11392SMark Fasheh mark_buffer_dirty(bh); 361160b11392SMark Fasheh return 0; 361260b11392SMark Fasheh } 361360b11392SMark Fasheh 361460b11392SMark Fasheh static int ocfs2_ordered_zero_func(handle_t *handle, struct buffer_head *bh) 361560b11392SMark Fasheh { 361660b11392SMark Fasheh set_buffer_uptodate(bh); 361760b11392SMark Fasheh mark_buffer_dirty(bh); 361860b11392SMark Fasheh return ocfs2_journal_dirty_data(handle, bh); 361960b11392SMark Fasheh } 362060b11392SMark Fasheh 362160b11392SMark Fasheh static void ocfs2_zero_cluster_pages(struct inode *inode, loff_t isize, 362260b11392SMark Fasheh struct page **pages, int numpages, 362360b11392SMark Fasheh u64 phys, handle_t *handle) 362460b11392SMark Fasheh { 362560b11392SMark Fasheh int i, ret, partial = 0; 362660b11392SMark Fasheh void *kaddr; 362760b11392SMark Fasheh struct page *page; 362860b11392SMark Fasheh unsigned int from, to = PAGE_CACHE_SIZE; 362960b11392SMark Fasheh struct super_block *sb = inode->i_sb; 363060b11392SMark Fasheh 363160b11392SMark Fasheh BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 363260b11392SMark Fasheh 363360b11392SMark Fasheh if (numpages == 0) 363460b11392SMark Fasheh goto out; 363560b11392SMark Fasheh 363660b11392SMark Fasheh from = isize & (PAGE_CACHE_SIZE - 1); /* 1st page offset */ 363760b11392SMark Fasheh if (PAGE_CACHE_SHIFT > OCFS2_SB(sb)->s_clustersize_bits) { 363860b11392SMark Fasheh /* 363960b11392SMark Fasheh * Since 'from' has been capped to a value below page 364060b11392SMark Fasheh * size, this calculation won't be able to overflow 364160b11392SMark Fasheh * 'to' 364260b11392SMark Fasheh */ 364360b11392SMark Fasheh to = ocfs2_align_bytes_to_clusters(sb, from); 364460b11392SMark Fasheh 364560b11392SMark Fasheh /* 364660b11392SMark Fasheh * The truncate tail in this case should never contain 364760b11392SMark Fasheh * more than one page at maximum. The loop below also 364860b11392SMark Fasheh * assumes this. 364960b11392SMark Fasheh */ 365060b11392SMark Fasheh BUG_ON(numpages != 1); 365160b11392SMark Fasheh } 365260b11392SMark Fasheh 365360b11392SMark Fasheh for(i = 0; i < numpages; i++) { 365460b11392SMark Fasheh page = pages[i]; 365560b11392SMark Fasheh 365660b11392SMark Fasheh BUG_ON(from > PAGE_CACHE_SIZE); 365760b11392SMark Fasheh BUG_ON(to > PAGE_CACHE_SIZE); 365860b11392SMark Fasheh 365960b11392SMark Fasheh ret = ocfs2_map_page_blocks(page, &phys, inode, from, to, 0); 366060b11392SMark Fasheh if (ret) 366160b11392SMark Fasheh mlog_errno(ret); 366260b11392SMark Fasheh 366360b11392SMark Fasheh kaddr = kmap_atomic(page, KM_USER0); 366460b11392SMark Fasheh memset(kaddr + from, 0, to - from); 366560b11392SMark Fasheh kunmap_atomic(kaddr, KM_USER0); 366660b11392SMark Fasheh 366760b11392SMark Fasheh /* 366860b11392SMark Fasheh * Need to set the buffers we zero'd into uptodate 366960b11392SMark Fasheh * here if they aren't - ocfs2_map_page_blocks() 367060b11392SMark Fasheh * might've skipped some 367160b11392SMark Fasheh */ 367260b11392SMark Fasheh if (ocfs2_should_order_data(inode)) { 367360b11392SMark Fasheh ret = walk_page_buffers(handle, 367460b11392SMark Fasheh page_buffers(page), 367560b11392SMark Fasheh from, to, &partial, 367660b11392SMark Fasheh ocfs2_ordered_zero_func); 367760b11392SMark Fasheh if (ret < 0) 367860b11392SMark Fasheh mlog_errno(ret); 367960b11392SMark Fasheh } else { 368060b11392SMark Fasheh ret = walk_page_buffers(handle, page_buffers(page), 368160b11392SMark Fasheh from, to, &partial, 368260b11392SMark Fasheh ocfs2_writeback_zero_func); 368360b11392SMark Fasheh if (ret < 0) 368460b11392SMark Fasheh mlog_errno(ret); 368560b11392SMark Fasheh } 368660b11392SMark Fasheh 368760b11392SMark Fasheh if (!partial) 368860b11392SMark Fasheh SetPageUptodate(page); 368960b11392SMark Fasheh 369060b11392SMark Fasheh flush_dcache_page(page); 369160b11392SMark Fasheh 369260b11392SMark Fasheh /* 369360b11392SMark Fasheh * Every page after the 1st one should be completely zero'd. 369460b11392SMark Fasheh */ 369560b11392SMark Fasheh from = 0; 369660b11392SMark Fasheh } 369760b11392SMark Fasheh out: 369860b11392SMark Fasheh if (pages) { 369960b11392SMark Fasheh for (i = 0; i < numpages; i++) { 370060b11392SMark Fasheh page = pages[i]; 370160b11392SMark Fasheh unlock_page(page); 370260b11392SMark Fasheh mark_page_accessed(page); 370360b11392SMark Fasheh page_cache_release(page); 370460b11392SMark Fasheh } 370560b11392SMark Fasheh } 370660b11392SMark Fasheh } 370760b11392SMark Fasheh 370860b11392SMark Fasheh static int ocfs2_grab_eof_pages(struct inode *inode, loff_t isize, struct page **pages, 370960b11392SMark Fasheh int *num, u64 *phys) 371060b11392SMark Fasheh { 371160b11392SMark Fasheh int i, numpages = 0, ret = 0; 371260b11392SMark Fasheh unsigned int csize = OCFS2_SB(inode->i_sb)->s_clustersize; 371349cb8d2dSMark Fasheh unsigned int ext_flags; 371460b11392SMark Fasheh struct super_block *sb = inode->i_sb; 371560b11392SMark Fasheh struct address_space *mapping = inode->i_mapping; 371660b11392SMark Fasheh unsigned long index; 371760b11392SMark Fasheh u64 next_cluster_bytes; 371860b11392SMark Fasheh 371960b11392SMark Fasheh BUG_ON(!ocfs2_sparse_alloc(OCFS2_SB(sb))); 372060b11392SMark Fasheh 372160b11392SMark Fasheh /* Cluster boundary, so we don't need to grab any pages. */ 372260b11392SMark Fasheh if ((isize & (csize - 1)) == 0) 372360b11392SMark Fasheh goto out; 372460b11392SMark Fasheh 372560b11392SMark Fasheh ret = ocfs2_extent_map_get_blocks(inode, isize >> sb->s_blocksize_bits, 372649cb8d2dSMark Fasheh phys, NULL, &ext_flags); 372760b11392SMark Fasheh if (ret) { 372860b11392SMark Fasheh mlog_errno(ret); 372960b11392SMark Fasheh goto out; 373060b11392SMark Fasheh } 373160b11392SMark Fasheh 373260b11392SMark Fasheh /* Tail is a hole. */ 373360b11392SMark Fasheh if (*phys == 0) 373460b11392SMark Fasheh goto out; 373560b11392SMark Fasheh 373649cb8d2dSMark Fasheh /* Tail is marked as unwritten, we can count on write to zero 373749cb8d2dSMark Fasheh * in that case. */ 373849cb8d2dSMark Fasheh if (ext_flags & OCFS2_EXT_UNWRITTEN) 373949cb8d2dSMark Fasheh goto out; 374049cb8d2dSMark Fasheh 374160b11392SMark Fasheh next_cluster_bytes = ocfs2_align_bytes_to_clusters(inode->i_sb, isize); 374260b11392SMark Fasheh index = isize >> PAGE_CACHE_SHIFT; 374360b11392SMark Fasheh do { 374460b11392SMark Fasheh pages[numpages] = grab_cache_page(mapping, index); 374560b11392SMark Fasheh if (!pages[numpages]) { 374660b11392SMark Fasheh ret = -ENOMEM; 374760b11392SMark Fasheh mlog_errno(ret); 374860b11392SMark Fasheh goto out; 374960b11392SMark Fasheh } 375060b11392SMark Fasheh 375160b11392SMark Fasheh numpages++; 375260b11392SMark Fasheh index++; 375360b11392SMark Fasheh } while (index < (next_cluster_bytes >> PAGE_CACHE_SHIFT)); 375460b11392SMark Fasheh 375560b11392SMark Fasheh out: 375660b11392SMark Fasheh if (ret != 0) { 375760b11392SMark Fasheh if (pages) { 375860b11392SMark Fasheh for (i = 0; i < numpages; i++) { 375960b11392SMark Fasheh if (pages[i]) { 376060b11392SMark Fasheh unlock_page(pages[i]); 376160b11392SMark Fasheh page_cache_release(pages[i]); 376260b11392SMark Fasheh } 376360b11392SMark Fasheh } 376460b11392SMark Fasheh } 376560b11392SMark Fasheh numpages = 0; 376660b11392SMark Fasheh } 376760b11392SMark Fasheh 376860b11392SMark Fasheh *num = numpages; 376960b11392SMark Fasheh 377060b11392SMark Fasheh return ret; 377160b11392SMark Fasheh } 377260b11392SMark Fasheh 377360b11392SMark Fasheh /* 377460b11392SMark Fasheh * Zero the area past i_size but still within an allocated 377560b11392SMark Fasheh * cluster. This avoids exposing nonzero data on subsequent file 377660b11392SMark Fasheh * extends. 377760b11392SMark Fasheh * 377860b11392SMark Fasheh * We need to call this before i_size is updated on the inode because 377960b11392SMark Fasheh * otherwise block_write_full_page() will skip writeout of pages past 378060b11392SMark Fasheh * i_size. The new_i_size parameter is passed for this reason. 378160b11392SMark Fasheh */ 378260b11392SMark Fasheh int ocfs2_zero_tail_for_truncate(struct inode *inode, handle_t *handle, 378360b11392SMark Fasheh u64 new_i_size) 378460b11392SMark Fasheh { 378560b11392SMark Fasheh int ret, numpages; 3786fa41045fSMark Fasheh loff_t endbyte; 378760b11392SMark Fasheh struct page **pages = NULL; 378860b11392SMark Fasheh u64 phys; 378960b11392SMark Fasheh 379060b11392SMark Fasheh /* 379160b11392SMark Fasheh * File systems which don't support sparse files zero on every 379260b11392SMark Fasheh * extend. 379360b11392SMark Fasheh */ 379460b11392SMark Fasheh if (!ocfs2_sparse_alloc(OCFS2_SB(inode->i_sb))) 379560b11392SMark Fasheh return 0; 379660b11392SMark Fasheh 379760b11392SMark Fasheh pages = kcalloc(ocfs2_pages_per_cluster(inode->i_sb), 379860b11392SMark Fasheh sizeof(struct page *), GFP_NOFS); 379960b11392SMark Fasheh if (pages == NULL) { 380060b11392SMark Fasheh ret = -ENOMEM; 380160b11392SMark Fasheh mlog_errno(ret); 380260b11392SMark Fasheh goto out; 380360b11392SMark Fasheh } 380460b11392SMark Fasheh 380560b11392SMark Fasheh ret = ocfs2_grab_eof_pages(inode, new_i_size, pages, &numpages, &phys); 380660b11392SMark Fasheh if (ret) { 380760b11392SMark Fasheh mlog_errno(ret); 380860b11392SMark Fasheh goto out; 380960b11392SMark Fasheh } 381060b11392SMark Fasheh 381160b11392SMark Fasheh if (numpages == 0) 381260b11392SMark Fasheh goto out; 381360b11392SMark Fasheh 381460b11392SMark Fasheh ocfs2_zero_cluster_pages(inode, new_i_size, pages, numpages, phys, 381560b11392SMark Fasheh handle); 381660b11392SMark Fasheh 381760b11392SMark Fasheh /* 381860b11392SMark Fasheh * Initiate writeout of the pages we zero'd here. We don't 381960b11392SMark Fasheh * wait on them - the truncate_inode_pages() call later will 382060b11392SMark Fasheh * do that for us. 382160b11392SMark Fasheh */ 3822fa41045fSMark Fasheh endbyte = ocfs2_align_bytes_to_clusters(inode->i_sb, new_i_size); 3823fa41045fSMark Fasheh ret = do_sync_mapping_range(inode->i_mapping, new_i_size, 3824fa41045fSMark Fasheh endbyte - 1, SYNC_FILE_RANGE_WRITE); 382560b11392SMark Fasheh if (ret) 382660b11392SMark Fasheh mlog_errno(ret); 382760b11392SMark Fasheh 382860b11392SMark Fasheh out: 382960b11392SMark Fasheh if (pages) 383060b11392SMark Fasheh kfree(pages); 383160b11392SMark Fasheh 383260b11392SMark Fasheh return ret; 383360b11392SMark Fasheh } 383460b11392SMark Fasheh 3835ccd979bdSMark Fasheh /* 3836ccd979bdSMark Fasheh * It is expected, that by the time you call this function, 3837ccd979bdSMark Fasheh * inode->i_size and fe->i_size have been adjusted. 3838ccd979bdSMark Fasheh * 3839ccd979bdSMark Fasheh * WARNING: This will kfree the truncate context 3840ccd979bdSMark Fasheh */ 3841ccd979bdSMark Fasheh int ocfs2_commit_truncate(struct ocfs2_super *osb, 3842ccd979bdSMark Fasheh struct inode *inode, 3843ccd979bdSMark Fasheh struct buffer_head *fe_bh, 3844ccd979bdSMark Fasheh struct ocfs2_truncate_context *tc) 3845ccd979bdSMark Fasheh { 3846ccd979bdSMark Fasheh int status, i, credits, tl_sem = 0; 3847dcd0538fSMark Fasheh u32 clusters_to_del, new_highest_cpos, range; 3848ccd979bdSMark Fasheh struct ocfs2_extent_list *el; 38491fabe148SMark Fasheh handle_t *handle = NULL; 3850ccd979bdSMark Fasheh struct inode *tl_inode = osb->osb_tl_inode; 3851dcd0538fSMark Fasheh struct ocfs2_path *path = NULL; 3852ccd979bdSMark Fasheh 3853ccd979bdSMark Fasheh mlog_entry_void(); 3854ccd979bdSMark Fasheh 3855dcd0538fSMark Fasheh new_highest_cpos = ocfs2_clusters_for_bytes(osb->sb, 3856ccd979bdSMark Fasheh i_size_read(inode)); 3857ccd979bdSMark Fasheh 3858dcd0538fSMark Fasheh path = ocfs2_new_inode_path(fe_bh); 3859dcd0538fSMark Fasheh if (!path) { 3860dcd0538fSMark Fasheh status = -ENOMEM; 3861ccd979bdSMark Fasheh mlog_errno(status); 3862ccd979bdSMark Fasheh goto bail; 3863ccd979bdSMark Fasheh } 386483418978SMark Fasheh 386583418978SMark Fasheh ocfs2_extent_map_trunc(inode, new_highest_cpos); 386683418978SMark Fasheh 3867dcd0538fSMark Fasheh start: 3868dcd0538fSMark Fasheh /* 38693a0782d0SMark Fasheh * Check that we still have allocation to delete. 38703a0782d0SMark Fasheh */ 38713a0782d0SMark Fasheh if (OCFS2_I(inode)->ip_clusters == 0) { 38723a0782d0SMark Fasheh status = 0; 38733a0782d0SMark Fasheh goto bail; 38743a0782d0SMark Fasheh } 38753a0782d0SMark Fasheh 38763a0782d0SMark Fasheh /* 3877dcd0538fSMark Fasheh * Truncate always works against the rightmost tree branch. 3878dcd0538fSMark Fasheh */ 3879dcd0538fSMark Fasheh status = ocfs2_find_path(inode, path, UINT_MAX); 3880dcd0538fSMark Fasheh if (status) { 3881dcd0538fSMark Fasheh mlog_errno(status); 3882ccd979bdSMark Fasheh goto bail; 3883ccd979bdSMark Fasheh } 3884ccd979bdSMark Fasheh 3885dcd0538fSMark Fasheh mlog(0, "inode->ip_clusters = %u, tree_depth = %u\n", 3886dcd0538fSMark Fasheh OCFS2_I(inode)->ip_clusters, path->p_tree_depth); 3887dcd0538fSMark Fasheh 3888dcd0538fSMark Fasheh /* 3889dcd0538fSMark Fasheh * By now, el will point to the extent list on the bottom most 3890dcd0538fSMark Fasheh * portion of this tree. Only the tail record is considered in 3891dcd0538fSMark Fasheh * each pass. 3892dcd0538fSMark Fasheh * 3893dcd0538fSMark Fasheh * We handle the following cases, in order: 3894dcd0538fSMark Fasheh * - empty extent: delete the remaining branch 3895dcd0538fSMark Fasheh * - remove the entire record 3896dcd0538fSMark Fasheh * - remove a partial record 3897dcd0538fSMark Fasheh * - no record needs to be removed (truncate has completed) 3898dcd0538fSMark Fasheh */ 3899dcd0538fSMark Fasheh el = path_leaf_el(path); 39003a0782d0SMark Fasheh if (le16_to_cpu(el->l_next_free_rec) == 0) { 39013a0782d0SMark Fasheh ocfs2_error(inode->i_sb, 39023a0782d0SMark Fasheh "Inode %llu has empty extent block at %llu\n", 39033a0782d0SMark Fasheh (unsigned long long)OCFS2_I(inode)->ip_blkno, 39043a0782d0SMark Fasheh (unsigned long long)path_leaf_bh(path)->b_blocknr); 39053a0782d0SMark Fasheh status = -EROFS; 39063a0782d0SMark Fasheh goto bail; 39073a0782d0SMark Fasheh } 39083a0782d0SMark Fasheh 3909ccd979bdSMark Fasheh i = le16_to_cpu(el->l_next_free_rec) - 1; 3910dcd0538fSMark Fasheh range = le32_to_cpu(el->l_recs[i].e_cpos) + 3911e48edee2SMark Fasheh ocfs2_rec_clusters(el, &el->l_recs[i]); 3912dcd0538fSMark Fasheh if (i == 0 && ocfs2_is_empty_extent(&el->l_recs[i])) { 3913dcd0538fSMark Fasheh clusters_to_del = 0; 3914dcd0538fSMark Fasheh } else if (le32_to_cpu(el->l_recs[i].e_cpos) >= new_highest_cpos) { 3915e48edee2SMark Fasheh clusters_to_del = ocfs2_rec_clusters(el, &el->l_recs[i]); 3916dcd0538fSMark Fasheh } else if (range > new_highest_cpos) { 3917e48edee2SMark Fasheh clusters_to_del = (ocfs2_rec_clusters(el, &el->l_recs[i]) + 3918ccd979bdSMark Fasheh le32_to_cpu(el->l_recs[i].e_cpos)) - 3919dcd0538fSMark Fasheh new_highest_cpos; 3920dcd0538fSMark Fasheh } else { 3921dcd0538fSMark Fasheh status = 0; 3922dcd0538fSMark Fasheh goto bail; 3923dcd0538fSMark Fasheh } 3924ccd979bdSMark Fasheh 3925dcd0538fSMark Fasheh mlog(0, "clusters_to_del = %u in this pass, tail blk=%llu\n", 3926dcd0538fSMark Fasheh clusters_to_del, (unsigned long long)path_leaf_bh(path)->b_blocknr); 3927dcd0538fSMark Fasheh 3928dcd0538fSMark Fasheh BUG_ON(clusters_to_del == 0); 3929ccd979bdSMark Fasheh 39301b1dcc1bSJes Sorensen mutex_lock(&tl_inode->i_mutex); 3931ccd979bdSMark Fasheh tl_sem = 1; 3932ccd979bdSMark Fasheh /* ocfs2_truncate_log_needs_flush guarantees us at least one 3933ccd979bdSMark Fasheh * record is free for use. If there isn't any, we flush to get 3934ccd979bdSMark Fasheh * an empty truncate log. */ 3935ccd979bdSMark Fasheh if (ocfs2_truncate_log_needs_flush(osb)) { 3936ccd979bdSMark Fasheh status = __ocfs2_flush_truncate_log(osb); 3937ccd979bdSMark Fasheh if (status < 0) { 3938ccd979bdSMark Fasheh mlog_errno(status); 3939ccd979bdSMark Fasheh goto bail; 3940ccd979bdSMark Fasheh } 3941ccd979bdSMark Fasheh } 3942ccd979bdSMark Fasheh 3943ccd979bdSMark Fasheh credits = ocfs2_calc_tree_trunc_credits(osb->sb, clusters_to_del, 3944dcd0538fSMark Fasheh (struct ocfs2_dinode *)fe_bh->b_data, 3945dcd0538fSMark Fasheh el); 394665eff9ccSMark Fasheh handle = ocfs2_start_trans(osb, credits); 3947ccd979bdSMark Fasheh if (IS_ERR(handle)) { 3948ccd979bdSMark Fasheh status = PTR_ERR(handle); 3949ccd979bdSMark Fasheh handle = NULL; 3950ccd979bdSMark Fasheh mlog_errno(status); 3951ccd979bdSMark Fasheh goto bail; 3952ccd979bdSMark Fasheh } 3953ccd979bdSMark Fasheh 3954dcd0538fSMark Fasheh status = ocfs2_do_truncate(osb, clusters_to_del, inode, fe_bh, handle, 3955dcd0538fSMark Fasheh tc, path); 3956ccd979bdSMark Fasheh if (status < 0) { 3957ccd979bdSMark Fasheh mlog_errno(status); 3958ccd979bdSMark Fasheh goto bail; 3959ccd979bdSMark Fasheh } 3960ccd979bdSMark Fasheh 39611b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 3962ccd979bdSMark Fasheh tl_sem = 0; 3963ccd979bdSMark Fasheh 396402dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 3965ccd979bdSMark Fasheh handle = NULL; 3966ccd979bdSMark Fasheh 3967dcd0538fSMark Fasheh ocfs2_reinit_path(path, 1); 3968dcd0538fSMark Fasheh 3969dcd0538fSMark Fasheh /* 39703a0782d0SMark Fasheh * The check above will catch the case where we've truncated 39713a0782d0SMark Fasheh * away all allocation. 3972dcd0538fSMark Fasheh */ 3973ccd979bdSMark Fasheh goto start; 39743a0782d0SMark Fasheh 3975ccd979bdSMark Fasheh bail: 3976ccd979bdSMark Fasheh 3977ccd979bdSMark Fasheh ocfs2_schedule_truncate_log_flush(osb, 1); 3978ccd979bdSMark Fasheh 3979ccd979bdSMark Fasheh if (tl_sem) 39801b1dcc1bSJes Sorensen mutex_unlock(&tl_inode->i_mutex); 3981ccd979bdSMark Fasheh 3982ccd979bdSMark Fasheh if (handle) 398302dc1af4SMark Fasheh ocfs2_commit_trans(osb, handle); 3984ccd979bdSMark Fasheh 398559a5e416SMark Fasheh ocfs2_run_deallocs(osb, &tc->tc_dealloc); 398659a5e416SMark Fasheh 3987dcd0538fSMark Fasheh ocfs2_free_path(path); 3988ccd979bdSMark Fasheh 3989ccd979bdSMark Fasheh /* This will drop the ext_alloc cluster lock for us */ 3990ccd979bdSMark Fasheh ocfs2_free_truncate_context(tc); 3991ccd979bdSMark Fasheh 3992ccd979bdSMark Fasheh mlog_exit(status); 3993ccd979bdSMark Fasheh return status; 3994ccd979bdSMark Fasheh } 3995ccd979bdSMark Fasheh 3996ccd979bdSMark Fasheh /* 399759a5e416SMark Fasheh * Expects the inode to already be locked. 3998ccd979bdSMark Fasheh */ 3999ccd979bdSMark Fasheh int ocfs2_prepare_truncate(struct ocfs2_super *osb, 4000ccd979bdSMark Fasheh struct inode *inode, 4001ccd979bdSMark Fasheh struct buffer_head *fe_bh, 4002ccd979bdSMark Fasheh struct ocfs2_truncate_context **tc) 4003ccd979bdSMark Fasheh { 400459a5e416SMark Fasheh int status; 4005ccd979bdSMark Fasheh unsigned int new_i_clusters; 4006ccd979bdSMark Fasheh struct ocfs2_dinode *fe; 4007ccd979bdSMark Fasheh struct ocfs2_extent_block *eb; 4008ccd979bdSMark Fasheh struct buffer_head *last_eb_bh = NULL; 4009ccd979bdSMark Fasheh 4010ccd979bdSMark Fasheh mlog_entry_void(); 4011ccd979bdSMark Fasheh 4012ccd979bdSMark Fasheh *tc = NULL; 4013ccd979bdSMark Fasheh 4014ccd979bdSMark Fasheh new_i_clusters = ocfs2_clusters_for_bytes(osb->sb, 4015ccd979bdSMark Fasheh i_size_read(inode)); 4016ccd979bdSMark Fasheh fe = (struct ocfs2_dinode *) fe_bh->b_data; 4017ccd979bdSMark Fasheh 4018ccd979bdSMark Fasheh mlog(0, "fe->i_clusters = %u, new_i_clusters = %u, fe->i_size =" 40191ca1a111SMark Fasheh "%llu\n", le32_to_cpu(fe->i_clusters), new_i_clusters, 40201ca1a111SMark Fasheh (unsigned long long)le64_to_cpu(fe->i_size)); 4021ccd979bdSMark Fasheh 4022cd861280SRobert P. J. Day *tc = kzalloc(sizeof(struct ocfs2_truncate_context), GFP_KERNEL); 4023ccd979bdSMark Fasheh if (!(*tc)) { 4024ccd979bdSMark Fasheh status = -ENOMEM; 4025ccd979bdSMark Fasheh mlog_errno(status); 4026ccd979bdSMark Fasheh goto bail; 4027ccd979bdSMark Fasheh } 402859a5e416SMark Fasheh ocfs2_init_dealloc_ctxt(&(*tc)->tc_dealloc); 4029ccd979bdSMark Fasheh 4030ccd979bdSMark Fasheh if (fe->id2.i_list.l_tree_depth) { 4031ccd979bdSMark Fasheh status = ocfs2_read_block(osb, le64_to_cpu(fe->i_last_eb_blk), 4032ccd979bdSMark Fasheh &last_eb_bh, OCFS2_BH_CACHED, inode); 4033ccd979bdSMark Fasheh if (status < 0) { 4034ccd979bdSMark Fasheh mlog_errno(status); 4035ccd979bdSMark Fasheh goto bail; 4036ccd979bdSMark Fasheh } 4037ccd979bdSMark Fasheh eb = (struct ocfs2_extent_block *) last_eb_bh->b_data; 4038ccd979bdSMark Fasheh if (!OCFS2_IS_VALID_EXTENT_BLOCK(eb)) { 4039ccd979bdSMark Fasheh OCFS2_RO_ON_INVALID_EXTENT_BLOCK(inode->i_sb, eb); 4040ccd979bdSMark Fasheh 4041ccd979bdSMark Fasheh brelse(last_eb_bh); 4042ccd979bdSMark Fasheh status = -EIO; 4043ccd979bdSMark Fasheh goto bail; 4044ccd979bdSMark Fasheh } 4045ccd979bdSMark Fasheh } 4046ccd979bdSMark Fasheh 4047ccd979bdSMark Fasheh (*tc)->tc_last_eb_bh = last_eb_bh; 4048ccd979bdSMark Fasheh 4049ccd979bdSMark Fasheh status = 0; 4050ccd979bdSMark Fasheh bail: 4051ccd979bdSMark Fasheh if (status < 0) { 4052ccd979bdSMark Fasheh if (*tc) 4053ccd979bdSMark Fasheh ocfs2_free_truncate_context(*tc); 4054ccd979bdSMark Fasheh *tc = NULL; 4055ccd979bdSMark Fasheh } 4056ccd979bdSMark Fasheh mlog_exit_void(); 4057ccd979bdSMark Fasheh return status; 4058ccd979bdSMark Fasheh } 4059ccd979bdSMark Fasheh 4060ccd979bdSMark Fasheh static void ocfs2_free_truncate_context(struct ocfs2_truncate_context *tc) 4061ccd979bdSMark Fasheh { 406259a5e416SMark Fasheh /* 406359a5e416SMark Fasheh * The caller is responsible for completing deallocation 406459a5e416SMark Fasheh * before freeing the context. 406559a5e416SMark Fasheh */ 406659a5e416SMark Fasheh if (tc->tc_dealloc.c_first_suballocator != NULL) 406759a5e416SMark Fasheh mlog(ML_NOTICE, 406859a5e416SMark Fasheh "Truncate completion has non-empty dealloc context\n"); 4069ccd979bdSMark Fasheh 4070ccd979bdSMark Fasheh if (tc->tc_last_eb_bh) 4071ccd979bdSMark Fasheh brelse(tc->tc_last_eb_bh); 4072ccd979bdSMark Fasheh 4073ccd979bdSMark Fasheh kfree(tc); 4074ccd979bdSMark Fasheh } 4075